From 4e17a91237f3ef8811eab274401adc8bbad0c85d Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Tue, 5 May 2026 23:19:04 -0700 Subject: [PATCH] feat(portal): chart lease telemetry history --- CHANGELOG.md | 1 + docs/commands/status.md | 2 + docs/orchestrator.md | 2 +- internal/cli/coordinator.go | 1 + internal/cli/status.go | 2 + worker/src/fleet.ts | 14 ++++ worker/src/portal.ts | 129 ++++++++++++++++++++++++++++++++++++ worker/src/types.ts | 1 + worker/test/fleet.test.ts | 96 +++++++++++++++++++++++++++ 9 files changed, 247 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 353d364..f4a7186 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ - Added portal run detail pages with command metadata, result summaries, dense viewport-fitted portal tables, provider/OS badges, active/ended/provider/target filters, sticky portal chrome, and copyable retained log previews. - Added admin portal visibility for non-owned runner leases, including `mine`/`system` filters and matching detail/code/VNC drilldowns for operator sessions. - Added latest lease telemetry snapshots for coordinator-backed Linux leases, including load, memory, disk, and uptime in `status --json` and the portal detail view. +- Added bounded lease telemetry history with portal sparklines and stale/high-resource badges on lease detail pages. - Added run-level telemetry summaries with start/end Linux resource snapshots in run history JSON, human history output, and portal run tables/details. - Added `.crabboxignore` for repo-local sync-only exclude patterns shared by `run` and `sync-plan`. - Documented the prebaked runner image boundary: provider-owned AMIs/snapshots hold machine capabilities while repo/runtime caches stay in QA workflows or warm leases. diff --git a/docs/commands/status.md b/docs/commands/status.md index c2e5898..661346b 100644 --- a/docs/commands/status.md +++ b/docs/commands/status.md @@ -33,3 +33,5 @@ Human and JSON output include the selected network. With Tailscale metadata, status also prints the tailnet host/state. For coordinator-backed Linux leases that have received a recent heartbeat, status also includes the latest best-effort telemetry snapshot: load, memory, disk, uptime, and capture age. +JSON status includes `telemetryHistory` when the coordinator has retained recent +samples for portal trend charts. diff --git a/docs/orchestrator.md b/docs/orchestrator.md index 99acf2a..018e519 100644 --- a/docs/orchestrator.md +++ b/docs/orchestrator.md @@ -43,7 +43,7 @@ The Worker stores coordinator leases as `active`, `released`, `expired`, or `fai `crabbox warmup --idle-timeout 30m` and `crabbox run --idle-timeout 30m` set inactivity expiry. `--ttl` is a separate maximum wall-clock lifetime. The CLI sends coordinator heartbeats while a lease is in use; each heartbeat updates `lastTouchedAt` and recomputes `expiresAt = min(createdAt + ttl, lastTouchedAt + idleTimeout)`. -For Linux leases, heartbeats also attach a best-effort latest telemetry snapshot when SSH is reachable. The Durable Object keeps only the latest sanitized load, memory, disk, uptime, source, and capture timestamp on the lease record; it is not a time-series store. +For Linux leases, heartbeats also attach best-effort telemetry when SSH is reachable. The Durable Object keeps the latest sanitized load, memory, disk, uptime, source, and capture timestamp on the lease record, plus a bounded `telemetryHistory` ring of the latest 60 samples for compact portal trends. Direct-provider mode does not have a central heartbeat or alarm. It labels machines with `created_at`, `last_touched_at`, `idle_timeout_secs`, `expires_at`, `state`, `lease`, and `slug`; `crabbox cleanup` uses those labels conservatively. diff --git a/internal/cli/coordinator.go b/internal/cli/coordinator.go index 254155f..537e9a9 100644 --- a/internal/cli/coordinator.go +++ b/internal/cli/coordinator.go @@ -58,6 +58,7 @@ type CoordinatorLease struct { LastTouchedAt string `json:"lastTouchedAt,omitempty"` ExpiresAt string `json:"expiresAt"` Telemetry *LeaseTelemetry `json:"telemetry,omitempty"` + TelemetryHistory []*LeaseTelemetry `json:"telemetryHistory,omitempty"` } type ProvisioningAttempt struct { diff --git a/internal/cli/status.go b/internal/cli/status.go index cd22303..8177060 100644 --- a/internal/cli/status.go +++ b/internal/cli/status.go @@ -90,6 +90,7 @@ type statusView struct { HasHost bool `json:"hasHost"` Ready bool `json:"ready"` Telemetry *LeaseTelemetry `json:"telemetry,omitempty"` + TelemetryHistory []*LeaseTelemetry `json:"telemetryHistory,omitempty"` } func (a App) leaseStatus(ctx context.Context, cfg Config, id string) (statusView, error) { @@ -133,6 +134,7 @@ func (a App) leaseStatus(ctx context.Context, cfg Config, id string) (statusView HasHost: hasHost, Ready: ready, Telemetry: lease.Telemetry, + TelemetryHistory: lease.TelemetryHistory, }, nil } server, target, leaseID, err := a.findLease(ctx, cfg, id) diff --git a/worker/src/fleet.ts b/worker/src/fleet.ts index 48ad016..b4f0640 100644 --- a/worker/src/fleet.ts +++ b/worker/src/fleet.ts @@ -46,6 +46,7 @@ import { costLimits, enforceCostLimits, leaseCost, requestOrg, usageSummary } fr const fleetID = "default"; const maxStoredRunLogBytes = 8 * 1024 * 1024; const runLogChunkBytes = 64 * 1024; +const maxLeaseTelemetryHistory = 60; const webVNCTicketTTLSeconds = 120; const codeTicketTTLSeconds = 120; const maxPendingWebVNCBytes = 1024 * 1024; @@ -551,6 +552,7 @@ export class FleetDurableObject implements DurableObject { const telemetry = sanitizeLeaseTelemetry(body.telemetry, now); if (telemetry) { lease.telemetry = telemetry; + lease.telemetryHistory = appendLeaseTelemetryHistory(lease.telemetryHistory, telemetry); } lease.updatedAt = now.toISOString(); lease.lastTouchedAt = now.toISOString(); @@ -2635,6 +2637,18 @@ function sanitizeRunTelemetry( return telemetry; } +function appendLeaseTelemetryHistory( + history: LeaseTelemetry[] | undefined, + telemetry: LeaseTelemetry, +): LeaseTelemetry[] { + const existing = Array.isArray(history) ? history : []; + const next = [ + ...existing.filter((sample) => sample && sample.capturedAt !== telemetry.capturedAt), + telemetry, + ].toSorted((left, right) => left.capturedAt.localeCompare(right.capturedAt)); + return next.slice(-maxLeaseTelemetryHistory); +} + function sanitizeTelemetryTimestamp(value: string | undefined, now: Date): string { const parsed = Date.parse(value ?? ""); if (!Number.isFinite(parsed)) { diff --git a/worker/src/portal.ts b/worker/src/portal.ts index 6a8678e..2de5a59 100644 --- a/worker/src/portal.ts +++ b/worker/src/portal.ts @@ -132,6 +132,7 @@ export function portalLeaseDetail( ${leaseTelemetryRows(lease.telemetry)} ${metaRow("expires", shortTime(lease.expiresAt))} + ${leaseTelemetryTimeline(lease.telemetry, lease.telemetryHistory)} ${ active ? `
@@ -750,6 +751,126 @@ function leaseTelemetryRows(telemetry: LeaseRecord["telemetry"]): string { ].join(""); } +function leaseTelemetryTimeline( + telemetry: LeaseRecord["telemetry"], + history: LeaseRecord["telemetryHistory"], +): string { + const samples = telemetrySamples(telemetry, history); + if (!telemetry && samples.length === 0) { + return ""; + } + const health = telemetryHealthPills(telemetry); + return `
+
+ box telemetry +
${health}
+
+ ${telemetrySparkline( + "load", + samples.map((sample) => sample.load1), + "load", + )} + ${telemetrySparkline( + "memory", + samples.map((sample) => sample.memoryPercent), + "%", + )} + ${telemetrySparkline( + "disk", + samples.map((sample) => sample.diskPercent), + "%", + )} +
`; +} + +function telemetrySamples( + telemetry: LeaseRecord["telemetry"], + history: LeaseRecord["telemetryHistory"], +): LeaseTelemetrySample[] { + const byTime = new Map(); + for (const sample of Array.isArray(history) ? history : []) { + if (sample?.capturedAt) { + byTime.set(sample.capturedAt, sample); + } + } + if (telemetry?.capturedAt) { + byTime.set(telemetry.capturedAt, telemetry); + } + return [...byTime.values()].toSorted((left, right) => + left.capturedAt.localeCompare(right.capturedAt), + ); +} + +type LeaseTelemetrySample = NonNullable; + +function telemetryHealthPills(telemetry: LeaseRecord["telemetry"]): string { + if (!telemetry?.capturedAt) { + return `no signal`; + } + const pills = []; + const ageMs = Date.now() - Date.parse(telemetry.capturedAt); + if (!Number.isFinite(ageMs) || ageMs > 10 * 60 * 1000) { + pills.push( + `stale ${escapeHTML(relativeTime(telemetry.capturedAt))}`, + ); + } else { + pills.push(`live`); + } + if ((telemetry.memoryPercent ?? 0) >= 85) { + pills.push( + `memory ${Math.round(telemetry.memoryPercent ?? 0)}%`, + ); + } + if ((telemetry.diskPercent ?? 0) >= 85) { + pills.push( + `disk ${Math.round(telemetry.diskPercent ?? 0)}%`, + ); + } + if ((telemetry.load1 ?? 0) >= 16) { + pills.push(`load ${telemetry.load1?.toFixed(1)}`); + } + return pills.join(""); +} + +function telemetrySparkline( + label: string, + rawValues: Array, + unit: string, +): string { + const values = rawValues.filter((value): value is number => Number.isFinite(value)); + const latest = values.at(-1); + if (values.length < 2 || latest === undefined) { + return `
${escapeHTML(label)}waiting for samples
`; + } + const max = unit === "%" ? 100 : Math.max(1, ...values); + const points = telemetryPolylinePoints(values, max); + return `
+ ${escapeHTML(label)} + + + + ${escapeHTML(formatTelemetryValue(latest, unit))} +
`; +} + +function telemetryPolylinePoints(values: number[], max: number): string { + const lastIndex = Math.max(1, values.length - 1); + return values + .map((value, index) => { + const x = (index / lastIndex) * 100; + const y = 26 - (Math.max(0, Math.min(value, max)) / max) * 24; + return `${x.toFixed(1)},${y.toFixed(1)}`; + }) + .join(" "); +} + +function formatTelemetryValue(value: number, unit: string): string { + if (unit === "%") { + return `${Math.round(value)}%`; + } + return value.toFixed(2); +} + function runTelemetryRows(telemetry: RunRecord["telemetry"]): string { if (!telemetry) { return ""; @@ -952,6 +1073,14 @@ function html(title: string, body: string, status = 200, nonce = ""): Response { .meta-grid div { padding:8px 10px; border-bottom:1px solid var(--line-soft); } .meta-grid dt { color:var(--muted); font-size:11px; text-transform:uppercase; margin-bottom:3px; } .meta-grid dd { margin:0; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; } + .telemetry-strip { display:grid; gap:7px; padding:9px 10px; border-top:1px solid var(--line-soft); background:var(--panel-2); } + .telemetry-strip-head { display:flex; justify-content:space-between; align-items:center; gap:8px; color:var(--muted); font-size:11px; text-transform:uppercase; } + .telemetry-strip-head div { display:flex; gap:4px; align-items:center; flex-wrap:wrap; justify-content:flex-end; } + .telemetry-line { display:grid; grid-template-columns:58px minmax(0,1fr) 52px; gap:8px; align-items:center; min-height:24px; font-size:12px; } + .telemetry-line > span:first-child { color:var(--muted); text-transform:uppercase; font-size:10px; } + .telemetry-line > span:last-child { text-align:right; font-family:var(--mono); color:#d1fae5; } + .telemetry-chart { width:100%; height:24px; display:block; overflow:visible; } + .telemetry-chart polyline { fill:none; stroke:var(--accent); stroke-width:1.8; vector-effect:non-scaling-stroke; } .stop-form { padding:10px; } .bridge-grid { display:grid; gap:0; } .bridge-row { display:grid; grid-template-columns:minmax(0,1fr) auto auto; gap:8px; align-items:center; padding:9px 10px; border-bottom:1px solid var(--line-soft); } diff --git a/worker/src/types.ts b/worker/src/types.ts index ba458cd..efddb2f 100644 --- a/worker/src/types.ts +++ b/worker/src/types.ts @@ -148,6 +148,7 @@ export interface LeaseRecord { lastTouchedAt?: string; expiresAt: string; telemetry?: LeaseTelemetry; + telemetryHistory?: LeaseTelemetry[]; releasedAt?: string; endedAt?: string; } diff --git a/worker/test/fleet.test.ts b/worker/test/fleet.test.ts index db9b8bd..7f332fa 100644 --- a/worker/test/fleet.test.ts +++ b/worker/test/fleet.test.ts @@ -435,7 +435,83 @@ describe("fleet lease identity and idle", () => { memoryTotalBytes: 2048, memoryPercent: 50, }); + expect(lease.telemetryHistory).toHaveLength(1); + expect(lease.telemetryHistory?.[0]).toMatchObject({ load1: 0.42, memoryPercent: 50 }); expect(Date.parse(lease.expiresAt)).toBeGreaterThan(expiresAt.getTime()); + + const secondHeartbeat = await fleet.fetch( + request("POST", "/v1/leases/cbx_000000000001/heartbeat", { + headers: { + "x-crabbox-owner": "peter@example.com", + "x-crabbox-org": "openclaw", + }, + body: { + telemetry: { + capturedAt: "2026-05-05T01:03:03Z", + source: "ssh-linux", + load1: 0.84, + memoryPercent: 55, + }, + }, + }), + ); + expect(secondHeartbeat.status).toBe(200); + const second = (await secondHeartbeat.json()) as { lease: LeaseRecord }; + expect(second.lease.telemetry).toMatchObject({ + capturedAt: "2026-05-05T01:03:03.000Z", + load1: 0.84, + memoryPercent: 55, + }); + expect(second.lease.telemetryHistory?.map((sample) => sample.load1)).toEqual([0.42, 0.84]); + expect(second.lease.telemetryHistory?.map((sample) => sample.capturedAt)).toEqual([ + "2026-05-05T01:02:03.000Z", + "2026-05-05T01:03:03.000Z", + ]); + }); + + it("keeps lease telemetry history bounded to the latest samples", async () => { + const storage = new MemoryStorage(); + const fleet = testFleet(storage); + storage.seed( + "lease:cbx_000000000001", + testLease({ + id: "cbx_000000000001", + slug: "blue-lobster", + owner: "peter@example.com", + org: "openclaw", + expiresAt: new Date(Date.now() + 60 * 60 * 1000).toISOString(), + telemetryHistory: Array.from({ length: 60 }, (_, index) => ({ + capturedAt: new Date(Date.UTC(2026, 4, 5, 1, index, 0)).toISOString(), + source: "ssh-linux", + load1: index, + })), + }), + ); + + const heartbeat = await fleet.fetch( + request("POST", "/v1/leases/blue-lobster/heartbeat", { + headers: { + "x-crabbox-owner": "peter@example.com", + "x-crabbox-org": "openclaw", + }, + body: { + telemetry: { + capturedAt: "2026-05-05T02:00:00Z", + source: "ssh-linux", + load1: 61, + }, + }, + }), + ); + + expect(heartbeat.status).toBe(200); + const { lease } = (await heartbeat.json()) as { lease: LeaseRecord }; + expect(lease.telemetryHistory).toHaveLength(60); + expect(lease.telemetryHistory?.[0]?.capturedAt).toBe("2026-05-05T01:01:00.000Z"); + expect(lease.telemetryHistory?.at(-1)).toMatchObject({ + capturedAt: "2026-05-05T02:00:00.000Z", + load1: 61, + }); }); it("hides exact lease IDs and lists from other non-admin users", async () => { @@ -728,6 +804,22 @@ describe("fleet lease identity and idle", () => { diskPercent: 25, uptimeSeconds: 3600, }, + telemetryHistory: [ + { + capturedAt: new Date(Date.now() - 45_000).toISOString(), + source: "ssh-linux", + load1: 0.22, + memoryPercent: 42, + diskPercent: 24, + }, + { + capturedAt: new Date(Date.now() - 30_000).toISOString(), + source: "ssh-linux", + load1: 0.32, + memoryPercent: 47, + diskPercent: 25, + }, + ], expiresAt: new Date(Date.now() + 60 * 60 * 1000).toISOString(), }), ); @@ -830,6 +922,10 @@ describe("fleet lease identity and idle", () => { expect(body).toContain("
memory
1.0 KiB / 2.0 KiB (50%)
"); expect(body).toContain("
disk
1.0 GiB / 4.0 GiB (25%)
"); expect(body).toContain("
uptime
1h
"); + expect(body).toContain("box telemetry"); + expect(body).toContain('class="telemetry-chart"'); + expect(body).toContain("0.42"); + expect(body).toContain("50%"); expect(body).toContain("load 0.42 · mem 75% · +512 B"); expect(body).toContain("table-search"); expect(body).toContain("/portal/runs/run_000000000001");