feat(portal): chart lease telemetry history

This commit is contained in:
Vincent Koc 2026-05-05 23:19:04 -07:00
parent 81e7603d32
commit 4e17a91237
No known key found for this signature in database
9 changed files with 247 additions and 1 deletions

View File

@ -12,6 +12,7 @@
- Added portal run detail pages with command metadata, result summaries, dense viewport-fitted portal tables, provider/OS badges, active/ended/provider/target filters, sticky portal chrome, and copyable retained log previews.
- Added admin portal visibility for non-owned runner leases, including `mine`/`system` filters and matching detail/code/VNC drilldowns for operator sessions.
- Added latest lease telemetry snapshots for coordinator-backed Linux leases, including load, memory, disk, and uptime in `status --json` and the portal detail view.
- Added bounded lease telemetry history (the latest 60 samples per lease) with portal sparklines and stale/high-resource badges on lease detail pages.
- Added run-level telemetry summaries with start/end Linux resource snapshots in run history JSON, human history output, and portal run tables/details.
- Added `.crabboxignore` for repo-local sync-only exclude patterns shared by `run` and `sync-plan`.
- Documented the prebaked runner image boundary: provider-owned AMIs/snapshots hold machine capabilities while repo/runtime caches stay in QA workflows or warm leases.

View File

@ -33,3 +33,5 @@ Human and JSON output include the selected network. With Tailscale metadata,
status also prints the tailnet host/state. For coordinator-backed Linux leases
that have received a recent heartbeat, status also includes the latest
best-effort telemetry snapshot: load, memory, disk, uptime, and capture age.
JSON status also includes `telemetryHistory`, ordered oldest to newest and capped
at 60 samples, when the coordinator has retained recent samples for portal trend charts.

View File

@ -43,7 +43,7 @@ The Worker stores coordinator leases as `active`, `released`, `expired`, or `fai
`crabbox warmup --idle-timeout 30m` and `crabbox run --idle-timeout 30m` set inactivity expiry. `--ttl` is a separate maximum wall-clock lifetime. The CLI sends coordinator heartbeats while a lease is in use; each heartbeat updates `lastTouchedAt` and recomputes `expiresAt = min(createdAt + ttl, lastTouchedAt + idleTimeout)`.
For Linux leases, heartbeats also attach a best-effort latest telemetry snapshot when SSH is reachable. The Durable Object keeps only the latest sanitized load, memory, disk, uptime, source, and capture timestamp on the lease record; it is not a time-series store.
For Linux leases, heartbeats also attach best-effort telemetry when SSH is reachable. The Durable Object keeps the latest sanitized load, memory, disk, uptime, source, and capture timestamp on the lease record, plus a bounded `telemetryHistory` ring of the latest 60 samples for compact portal trends.
Direct-provider mode does not have a central heartbeat or alarm. It labels machines with `created_at`, `last_touched_at`, `idle_timeout_secs`, `expires_at`, `state`, `lease`, and `slug`; `crabbox cleanup` uses those labels conservatively.

View File

@ -58,6 +58,7 @@ type CoordinatorLease struct {
LastTouchedAt string `json:"lastTouchedAt,omitempty"`
ExpiresAt string `json:"expiresAt"`
Telemetry *LeaseTelemetry `json:"telemetry,omitempty"`
TelemetryHistory []*LeaseTelemetry `json:"telemetryHistory,omitempty"`
}
type ProvisioningAttempt struct {

View File

@ -90,6 +90,7 @@ type statusView struct {
HasHost bool `json:"hasHost"`
Ready bool `json:"ready"`
Telemetry *LeaseTelemetry `json:"telemetry,omitempty"`
TelemetryHistory []*LeaseTelemetry `json:"telemetryHistory,omitempty"`
}
func (a App) leaseStatus(ctx context.Context, cfg Config, id string) (statusView, error) {
@ -133,6 +134,7 @@ func (a App) leaseStatus(ctx context.Context, cfg Config, id string) (statusView
HasHost: hasHost,
Ready: ready,
Telemetry: lease.Telemetry,
TelemetryHistory: lease.TelemetryHistory,
}, nil
}
server, target, leaseID, err := a.findLease(ctx, cfg, id)

View File

@ -46,6 +46,7 @@ import { costLimits, enforceCostLimits, leaseCost, requestOrg, usageSummary } fr
const fleetID = "default";
const maxStoredRunLogBytes = 8 * 1024 * 1024;
const runLogChunkBytes = 64 * 1024;
// Upper bound on the number of telemetry samples kept per lease in
// `telemetryHistory`; older samples are dropped once the bound is exceeded.
const maxLeaseTelemetryHistory = 60;
const webVNCTicketTTLSeconds = 120;
const codeTicketTTLSeconds = 120;
const maxPendingWebVNCBytes = 1024 * 1024;
@ -551,6 +552,7 @@ export class FleetDurableObject implements DurableObject {
const telemetry = sanitizeLeaseTelemetry(body.telemetry, now);
if (telemetry) {
lease.telemetry = telemetry;
lease.telemetryHistory = appendLeaseTelemetryHistory(lease.telemetryHistory, telemetry);
}
lease.updatedAt = now.toISOString();
lease.lastTouchedAt = now.toISOString();
@ -2635,6 +2637,18 @@ function sanitizeRunTelemetry(
return telemetry;
}
/**
 * Returns a new, bounded telemetry history that includes `telemetry`.
 *
 * The input array is never mutated. Any retained sample with the same
 * `capturedAt` as `telemetry` is replaced by it, the result is ordered by
 * `capturedAt` ascending, and only the newest `limit` samples are kept.
 *
 * @param history Previously stored samples; anything that is not an array is
 *   treated as empty.
 * @param telemetry The sample to merge in.
 * @param limit Maximum number of samples to keep. Defaults to
 *   `maxLeaseTelemetryHistory`; values below 1 (or NaN) are treated as 1.
 * @returns The merged history, oldest sample first.
 */
function appendLeaseTelemetryHistory(
  history: LeaseTelemetry[] | undefined,
  telemetry: LeaseTelemetry,
  limit: number = maxLeaseTelemetryHistory,
): LeaseTelemetry[] {
  // Stored history may predate validation: drop null entries and entries
  // without a string timestamp so the comparator below cannot throw.
  const retained = (Array.isArray(history) ? history : []).filter(
    (sample) =>
      typeof sample?.capturedAt === "string" && sample.capturedAt !== telemetry.capturedAt,
  );
  // `retained` is already a fresh array, so sorting the spread copy in place is safe.
  const ordered = [...retained, telemetry].sort((left, right) =>
    left.capturedAt.localeCompare(right.capturedAt),
  );
  // `slice(-0)` would return every element, so clamp the bound to at least 1.
  const keep = Math.max(1, Math.floor(limit) || 1);
  return ordered.slice(-keep);
}
function sanitizeTelemetryTimestamp(value: string | undefined, now: Date): string {
const parsed = Date.parse(value ?? "");
if (!Number.isFinite(parsed)) {

View File

@ -132,6 +132,7 @@ export function portalLeaseDetail(
${leaseTelemetryRows(lease.telemetry)}
${metaRow("expires", shortTime(lease.expiresAt))}
</dl>
${leaseTelemetryTimeline(lease.telemetry, lease.telemetryHistory)}
${
active
? `<form method="post" action="/portal/leases/${encodeURIComponent(lease.id)}/release" class="stop-form">
@ -750,6 +751,126 @@ function leaseTelemetryRows(telemetry: LeaseRecord["telemetry"]): string {
].join("");
}
/**
 * Renders the "box telemetry" strip for a lease detail page: a header with
 * health pills followed by load, memory, and disk sparkline rows.
 *
 * Returns an empty string when there is neither a latest snapshot nor any
 * usable history sample.
 */
function leaseTelemetryTimeline(
  telemetry: LeaseRecord["telemetry"],
  history: LeaseRecord["telemetryHistory"],
): string {
  const samples = telemetrySamples(telemetry, history);
  const nothingToShow = !telemetry && samples.length === 0;
  if (nothingToShow) {
    return "";
  }

  const health = telemetryHealthPills(telemetry);
  // Each row plots one metric across the merged samples, oldest first.
  const loadLine = telemetrySparkline(
    "load",
    samples.map((entry) => entry.load1),
    "load",
  );
  const memoryLine = telemetrySparkline(
    "memory",
    samples.map((entry) => entry.memoryPercent),
    "%",
  );
  const diskLine = telemetrySparkline(
    "disk",
    samples.map((entry) => entry.diskPercent),
    "%",
  );

  return [
    `<div class="telemetry-strip">`,
    `<div class="telemetry-strip-head">`,
    `<span>box telemetry</span>`,
    `<div>${health}</div>`,
    `</div>`,
    loadLine,
    memoryLine,
    diskLine,
    `</div>`,
  ].join("\n");
}
/**
 * Merges the stored history with the latest snapshot into one list of samples,
 * de-duplicated by `capturedAt` and ordered by `capturedAt` ascending.
 *
 * Entries that are missing or have no `capturedAt` are ignored. When the latest
 * snapshot shares a timestamp with a history entry, the latest snapshot wins.
 */
function telemetrySamples(
  telemetry: LeaseRecord["telemetry"],
  history: LeaseRecord["telemetryHistory"],
): LeaseTelemetrySample[] {
  // The latest snapshot goes last so it overrides a history entry with the same key.
  const candidates = [...(Array.isArray(history) ? history : []), telemetry];
  const deduped = new Map<string, LeaseTelemetrySample>();
  for (const candidate of candidates) {
    if (!candidate?.capturedAt) {
      continue;
    }
    deduped.set(candidate.capturedAt, candidate);
  }
  return Array.from(deduped.values()).sort((earlier, later) =>
    earlier.capturedAt.localeCompare(later.capturedAt),
  );
}
/** One telemetry snapshot: the type of `LeaseRecord["telemetry"]` with `null`/`undefined` excluded. */
type LeaseTelemetrySample = NonNullable<LeaseRecord["telemetry"]>;
/**
 * Builds the health pills shown in the telemetry strip header.
 *
 * - No snapshot (or no `capturedAt`): a single "no signal" pill.
 * - Otherwise a freshness pill: "live" when the snapshot is at most ten minutes
 *   old, "stale …" when it is older or its timestamp cannot be parsed.
 * - Extra pills when memory or disk usage is at least 85%, or the 1-minute load
 *   is at least 16.
 */
function telemetryHealthPills(telemetry: LeaseRecord["telemetry"]): string {
  const capturedAt = telemetry?.capturedAt;
  if (!telemetry || !capturedAt) {
    return `<span class="pill" data-tone="warn">no signal</span>`;
  }

  const staleAfterMs = 10 * 60 * 1000;
  const ageMs = Date.now() - Date.parse(capturedAt);
  // An unparseable timestamp yields NaN, which is reported as stale.
  const fresh = Number.isFinite(ageMs) && ageMs <= staleAfterMs;
  const freshness = fresh
    ? `<span class="pill" data-tone="ok">live</span>`
    : `<span class="pill" data-tone="warn">stale ${escapeHTML(relativeTime(capturedAt))}</span>`;

  // Missing metrics count as zero and therefore never trip a threshold.
  const memory = telemetry.memoryPercent ?? 0;
  const disk = telemetry.diskPercent ?? 0;
  const load = telemetry.load1 ?? 0;

  const pills: string[] = [freshness];
  if (memory >= 85) {
    pills.push(`<span class="pill" data-tone="bad">memory ${Math.round(memory)}%</span>`);
  }
  if (disk >= 85) {
    pills.push(`<span class="pill" data-tone="bad">disk ${Math.round(disk)}%</span>`);
  }
  if (load >= 16) {
    pills.push(`<span class="pill" data-tone="warn">load ${load.toFixed(1)}</span>`);
  }
  return pills.join("");
}
/**
 * Renders one telemetry row: label, inline SVG trend line, and latest value.
 *
 * Non-finite and missing values are dropped first. With fewer than two usable
 * values the row shows "waiting for samples" instead of a chart. Percent series
 * (`unit === "%"`) are scaled against 100; any other series is scaled against
 * its own maximum, with a floor of 1.
 */
function telemetrySparkline(
  label: string,
  rawValues: Array<number | undefined>,
  unit: string,
): string {
  const safeLabel = escapeHTML(label);

  const series: number[] = [];
  for (const raw of rawValues) {
    if (typeof raw === "number" && Number.isFinite(raw)) {
      series.push(raw);
    }
  }

  const latest = series[series.length - 1];
  if (series.length < 2 || latest === undefined) {
    return `<div class="telemetry-line"><span>${safeLabel}</span><span class="muted">waiting for samples</span></div>`;
  }

  let ceiling = 100;
  if (unit !== "%") {
    ceiling = Math.max(1, ...series);
  }
  const points = telemetryPolylinePoints(series, ceiling);

  return [
    `<div class="telemetry-line">`,
    `<span>${safeLabel}</span>`,
    `<svg class="telemetry-chart" viewBox="0 0 100 28" preserveAspectRatio="none" aria-label="${safeLabel} telemetry trend">`,
    `<polyline points="${points}" />`,
    `</svg>`,
    `<span>${escapeHTML(formatTelemetryValue(latest, unit))}</span>`,
    `</div>`,
  ].join("\n");
}
/**
 * Converts a numeric series into the `points` attribute of an SVG polyline.
 *
 * X runs from 0 to 100 across the series (a single value sits at x = 0).
 * Y maps 0 to 26 and `max` to 2, with each value clamped into `[0, max]`.
 *
 * @param values Series to plot, in display order.
 * @param max Value mapped to the top of the chart. A zero, negative, or
 *   non-finite `max` falls back to 1 so the output never contains `NaN`.
 * @returns Space-separated `x,y` pairs with one decimal place; empty for an
 *   empty series.
 */
function telemetryPolylinePoints(values: number[], max: number): string {
  // Dividing by a non-positive or non-finite ceiling would emit NaN coordinates.
  const ceiling = Number.isFinite(max) && max > 0 ? max : 1;
  const lastIndex = Math.max(1, values.length - 1);
  return values
    .map((value, index) => {
      // NaN survives Math.min/Math.max, so pin it to the baseline explicitly.
      const clamped = Number.isNaN(value) ? 0 : Math.max(0, Math.min(value, ceiling));
      const x = (index / lastIndex) * 100;
      const y = 26 - (clamped / ceiling) * 24;
      return `${x.toFixed(1)},${y.toFixed(1)}`;
    })
    .join(" ");
}
/**
 * Formats the latest value shown at the end of a telemetry row: a rounded
 * whole-number percentage when `unit` is "%", otherwise two decimal places.
 */
function formatTelemetryValue(value: number, unit: string): string {
  return unit === "%" ? `${Math.round(value)}%` : value.toFixed(2);
}
function runTelemetryRows(telemetry: RunRecord["telemetry"]): string {
if (!telemetry) {
return "";
@ -952,6 +1073,14 @@ function html(title: string, body: string, status = 200, nonce = ""): Response {
.meta-grid div { padding:8px 10px; border-bottom:1px solid var(--line-soft); }
.meta-grid dt { color:var(--muted); font-size:11px; text-transform:uppercase; margin-bottom:3px; }
.meta-grid dd { margin:0; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; }
.telemetry-strip { display:grid; gap:7px; padding:9px 10px; border-top:1px solid var(--line-soft); background:var(--panel-2); }
.telemetry-strip-head { display:flex; justify-content:space-between; align-items:center; gap:8px; color:var(--muted); font-size:11px; text-transform:uppercase; }
.telemetry-strip-head div { display:flex; gap:4px; align-items:center; flex-wrap:wrap; justify-content:flex-end; }
.telemetry-line { display:grid; grid-template-columns:58px minmax(0,1fr) 52px; gap:8px; align-items:center; min-height:24px; font-size:12px; }
.telemetry-line > span:first-child { color:var(--muted); text-transform:uppercase; font-size:10px; }
.telemetry-line > span:last-child { text-align:right; font-family:var(--mono); color:#d1fae5; }
.telemetry-chart { width:100%; height:24px; display:block; overflow:visible; }
.telemetry-chart polyline { fill:none; stroke:var(--accent); stroke-width:1.8; vector-effect:non-scaling-stroke; }
.stop-form { padding:10px; }
.bridge-grid { display:grid; gap:0; }
.bridge-row { display:grid; grid-template-columns:minmax(0,1fr) auto auto; gap:8px; align-items:center; padding:9px 10px; border-bottom:1px solid var(--line-soft); }

View File

@ -148,6 +148,7 @@ export interface LeaseRecord {
lastTouchedAt?: string;
expiresAt: string;
telemetry?: LeaseTelemetry;
telemetryHistory?: LeaseTelemetry[];
releasedAt?: string;
endedAt?: string;
}

View File

@ -435,7 +435,83 @@ describe("fleet lease identity and idle", () => {
memoryTotalBytes: 2048,
memoryPercent: 50,
});
expect(lease.telemetryHistory).toHaveLength(1);
expect(lease.telemetryHistory?.[0]).toMatchObject({ load1: 0.42, memoryPercent: 50 });
expect(Date.parse(lease.expiresAt)).toBeGreaterThan(expiresAt.getTime());
const secondHeartbeat = await fleet.fetch(
request("POST", "/v1/leases/cbx_000000000001/heartbeat", {
headers: {
"x-crabbox-owner": "peter@example.com",
"x-crabbox-org": "openclaw",
},
body: {
telemetry: {
capturedAt: "2026-05-05T01:03:03Z",
source: "ssh-linux",
load1: 0.84,
memoryPercent: 55,
},
},
}),
);
expect(secondHeartbeat.status).toBe(200);
const second = (await secondHeartbeat.json()) as { lease: LeaseRecord };
expect(second.lease.telemetry).toMatchObject({
capturedAt: "2026-05-05T01:03:03.000Z",
load1: 0.84,
memoryPercent: 55,
});
expect(second.lease.telemetryHistory?.map((sample) => sample.load1)).toEqual([0.42, 0.84]);
expect(second.lease.telemetryHistory?.map((sample) => sample.capturedAt)).toEqual([
"2026-05-05T01:02:03.000Z",
"2026-05-05T01:03:03.000Z",
]);
});
// Verifies that a heartbeat on a lease whose history is already full evicts the
// oldest sample rather than growing the history.
it("keeps lease telemetry history bounded to the latest samples", async () => {
const storage = new MemoryStorage();
const fleet = testFleet(storage);
// Seed a lease with 60 samples, one per minute from 01:00 to 01:59 UTC on
// 2026-05-05 (month index 4 is May); load1 equals the minute index.
storage.seed(
"lease:cbx_000000000001",
testLease({
id: "cbx_000000000001",
slug: "blue-lobster",
owner: "peter@example.com",
org: "openclaw",
expiresAt: new Date(Date.now() + 60 * 60 * 1000).toISOString(),
telemetryHistory: Array.from({ length: 60 }, (_, index) => ({
capturedAt: new Date(Date.UTC(2026, 4, 5, 1, index, 0)).toISOString(),
source: "ssh-linux",
load1: index,
})),
}),
);
// Heartbeat addressed by slug, carrying a sample newer than every seeded one.
const heartbeat = await fleet.fetch(
request("POST", "/v1/leases/blue-lobster/heartbeat", {
headers: {
"x-crabbox-owner": "peter@example.com",
"x-crabbox-org": "openclaw",
},
body: {
telemetry: {
capturedAt: "2026-05-05T02:00:00Z",
source: "ssh-linux",
load1: 61,
},
},
}),
);
expect(heartbeat.status).toBe(200);
const { lease } = (await heartbeat.json()) as { lease: LeaseRecord };
// Still 60 entries: the 01:00 sample was dropped, so 01:01 is now the oldest,
// and the new sample is last with its timestamp in millisecond ISO form.
expect(lease.telemetryHistory).toHaveLength(60);
expect(lease.telemetryHistory?.[0]?.capturedAt).toBe("2026-05-05T01:01:00.000Z");
expect(lease.telemetryHistory?.at(-1)).toMatchObject({
capturedAt: "2026-05-05T02:00:00.000Z",
load1: 61,
});
});
it("hides exact lease IDs and lists from other non-admin users", async () => {
@ -728,6 +804,22 @@ describe("fleet lease identity and idle", () => {
diskPercent: 25,
uptimeSeconds: 3600,
},
telemetryHistory: [
{
capturedAt: new Date(Date.now() - 45_000).toISOString(),
source: "ssh-linux",
load1: 0.22,
memoryPercent: 42,
diskPercent: 24,
},
{
capturedAt: new Date(Date.now() - 30_000).toISOString(),
source: "ssh-linux",
load1: 0.32,
memoryPercent: 47,
diskPercent: 25,
},
],
expiresAt: new Date(Date.now() + 60 * 60 * 1000).toISOString(),
}),
);
@ -830,6 +922,10 @@ describe("fleet lease identity and idle", () => {
expect(body).toContain("<dt>memory</dt><dd>1.0 KiB / 2.0 KiB (50%)</dd>");
expect(body).toContain("<dt>disk</dt><dd>1.0 GiB / 4.0 GiB (25%)</dd>");
expect(body).toContain("<dt>uptime</dt><dd>1h</dd>");
expect(body).toContain("box telemetry");
expect(body).toContain('class="telemetry-chart"');
expect(body).toContain("<span>0.42</span>");
expect(body).toContain("<span>50%</span>");
expect(body).toContain("load 0.42 · mem 75% · +512 B");
expect(body).toContain("table-search");
expect(body).toContain("/portal/runs/run_000000000001");