feat(portal): chart lease telemetry history
This commit is contained in:
parent
81e7603d32
commit
4e17a91237
@ -12,6 +12,7 @@
|
||||
- Added portal run detail pages with command metadata, result summaries, dense viewport-fitted portal tables, provider/OS badges, active/ended/provider/target filters, sticky portal chrome, and copyable retained log previews.
|
||||
- Added admin portal visibility for non-owned runner leases, including `mine`/`system` filters and matching detail/code/VNC drilldowns for operator sessions.
|
||||
- Added latest lease telemetry snapshots for coordinator-backed Linux leases, including load, memory, disk, and uptime in `status --json` and the portal detail view.
|
||||
- Added bounded lease telemetry history with portal sparklines and stale/high-resource badges on lease detail pages.
|
||||
- Added run-level telemetry summaries with start/end Linux resource snapshots in run history JSON, human history output, and portal run tables/details.
|
||||
- Added `.crabboxignore` for repo-local sync-only exclude patterns shared by `run` and `sync-plan`.
|
||||
- Documented the prebaked runner image boundary: provider-owned AMIs/snapshots hold machine capabilities while repo/runtime caches stay in QA workflows or warm leases.
|
||||
|
||||
@ -33,3 +33,5 @@ Human and JSON output include the selected network. With Tailscale metadata,
|
||||
status also prints the tailnet host/state. For coordinator-backed Linux leases
|
||||
that have received a recent heartbeat, status also includes the latest
|
||||
best-effort telemetry snapshot: load, memory, disk, uptime, and capture age.
|
||||
JSON status includes `telemetryHistory` when the coordinator has retained recent
|
||||
samples for portal trend charts.
|
||||
|
||||
@ -43,7 +43,7 @@ The Worker stores coordinator leases as `active`, `released`, `expired`, or `fai
|
||||
|
||||
`crabbox warmup --idle-timeout 30m` and `crabbox run --idle-timeout 30m` set inactivity expiry. `--ttl` is a separate maximum wall-clock lifetime. The CLI sends coordinator heartbeats while a lease is in use; each heartbeat updates `lastTouchedAt` and recomputes `expiresAt = min(createdAt + ttl, lastTouchedAt + idleTimeout)`.
|
||||
|
||||
For Linux leases, heartbeats also attach a best-effort latest telemetry snapshot when SSH is reachable. The Durable Object keeps only the latest sanitized load, memory, disk, uptime, source, and capture timestamp on the lease record; it is not a time-series store.
|
||||
For Linux leases, heartbeats also attach best-effort telemetry when SSH is reachable. The Durable Object keeps the latest sanitized load, memory, disk, uptime, source, and capture timestamp on the lease record, plus a bounded `telemetryHistory` ring of the latest 60 samples for compact portal trends.
|
||||
|
||||
Direct-provider mode does not have a central heartbeat or alarm. It labels machines with `created_at`, `last_touched_at`, `idle_timeout_secs`, `expires_at`, `state`, `lease`, and `slug`; `crabbox cleanup` uses those labels conservatively.
|
||||
|
||||
|
||||
@ -58,6 +58,7 @@ type CoordinatorLease struct {
|
||||
LastTouchedAt string `json:"lastTouchedAt,omitempty"`
|
||||
ExpiresAt string `json:"expiresAt"`
|
||||
Telemetry *LeaseTelemetry `json:"telemetry,omitempty"`
|
||||
TelemetryHistory []*LeaseTelemetry `json:"telemetryHistory,omitempty"`
|
||||
}
|
||||
|
||||
type ProvisioningAttempt struct {
|
||||
|
||||
@ -90,6 +90,7 @@ type statusView struct {
|
||||
HasHost bool `json:"hasHost"`
|
||||
Ready bool `json:"ready"`
|
||||
Telemetry *LeaseTelemetry `json:"telemetry,omitempty"`
|
||||
TelemetryHistory []*LeaseTelemetry `json:"telemetryHistory,omitempty"`
|
||||
}
|
||||
|
||||
func (a App) leaseStatus(ctx context.Context, cfg Config, id string) (statusView, error) {
|
||||
@ -133,6 +134,7 @@ func (a App) leaseStatus(ctx context.Context, cfg Config, id string) (statusView
|
||||
HasHost: hasHost,
|
||||
Ready: ready,
|
||||
Telemetry: lease.Telemetry,
|
||||
TelemetryHistory: lease.TelemetryHistory,
|
||||
}, nil
|
||||
}
|
||||
server, target, leaseID, err := a.findLease(ctx, cfg, id)
|
||||
|
||||
@ -46,6 +46,7 @@ import { costLimits, enforceCostLimits, leaseCost, requestOrg, usageSummary } fr
|
||||
const fleetID = "default";
|
||||
const maxStoredRunLogBytes = 8 * 1024 * 1024;
|
||||
const runLogChunkBytes = 64 * 1024;
|
||||
const maxLeaseTelemetryHistory = 60;
|
||||
const webVNCTicketTTLSeconds = 120;
|
||||
const codeTicketTTLSeconds = 120;
|
||||
const maxPendingWebVNCBytes = 1024 * 1024;
|
||||
@ -551,6 +552,7 @@ export class FleetDurableObject implements DurableObject {
|
||||
const telemetry = sanitizeLeaseTelemetry(body.telemetry, now);
|
||||
if (telemetry) {
|
||||
lease.telemetry = telemetry;
|
||||
lease.telemetryHistory = appendLeaseTelemetryHistory(lease.telemetryHistory, telemetry);
|
||||
}
|
||||
lease.updatedAt = now.toISOString();
|
||||
lease.lastTouchedAt = now.toISOString();
|
||||
@ -2635,6 +2637,18 @@ function sanitizeRunTelemetry(
|
||||
return telemetry;
|
||||
}
|
||||
|
||||
/**
 * Builds the next bounded telemetry history for a lease.
 *
 * The returned array is always a new array; `history` is never mutated.
 *
 * - Entries that are missing or have no string `capturedAt` are discarded so a
 *   malformed stored record cannot make the comparator throw.
 * - Any prior entry with the same `capturedAt` as `telemetry` is replaced by
 *   `telemetry`.
 * - Entries are ordered ascending by `capturedAt` (string comparison).
 * - Only the trailing `maxLeaseTelemetryHistory` entries are kept.
 *
 * @param history Previously stored samples; may be undefined or not an array.
 * @param telemetry The newly captured sample to record.
 * @returns The updated history, oldest first.
 */
function appendLeaseTelemetryHistory(
  history: LeaseTelemetry[] | undefined,
  telemetry: LeaseTelemetry,
): LeaseTelemetry[] {
  const stored = Array.isArray(history) ? history : [];
  // filter() yields a fresh array, so pushing and sorting in place is safe.
  const next = stored.filter(
    (sample) =>
      typeof sample?.capturedAt === "string" && sample.capturedAt !== telemetry.capturedAt,
  );
  next.push(telemetry);
  // NOTE(review): ordering relies on capturedAt strings sharing one sortable
  // format (presumably the ISO form produced by sanitization) — confirm.
  next.sort((left, right) => left.capturedAt.localeCompare(right.capturedAt));
  return next.slice(-maxLeaseTelemetryHistory);
}
|
||||
|
||||
function sanitizeTelemetryTimestamp(value: string | undefined, now: Date): string {
|
||||
const parsed = Date.parse(value ?? "");
|
||||
if (!Number.isFinite(parsed)) {
|
||||
|
||||
@ -132,6 +132,7 @@ export function portalLeaseDetail(
|
||||
${leaseTelemetryRows(lease.telemetry)}
|
||||
${metaRow("expires", shortTime(lease.expiresAt))}
|
||||
</dl>
|
||||
${leaseTelemetryTimeline(lease.telemetry, lease.telemetryHistory)}
|
||||
${
|
||||
active
|
||||
? `<form method="post" action="/portal/leases/${encodeURIComponent(lease.id)}/release" class="stop-form">
|
||||
@ -750,6 +751,126 @@ function leaseTelemetryRows(telemetry: LeaseRecord["telemetry"]): string {
|
||||
].join("");
|
||||
}
|
||||
|
||||
/**
 * Renders the "box telemetry" strip for a lease detail page: a header with
 * health pills followed by load, memory and disk sparklines.
 *
 * @param telemetry The latest telemetry snapshot on the lease, if any.
 * @param history Retained telemetry samples on the lease, if any.
 * @returns The strip markup, or an empty string when there is neither a latest
 *   snapshot nor any usable sample.
 */
function leaseTelemetryTimeline(
  telemetry: LeaseRecord["telemetry"],
  history: LeaseRecord["telemetryHistory"],
): string {
  const samples = telemetrySamples(telemetry, history);
  const hasNothingToShow = !telemetry && samples.length === 0;
  if (hasNothingToShow) {
    return "";
  }

  // Pills describe the latest snapshot only; the sparklines use every sample.
  const health = telemetryHealthPills(telemetry);
  const loadLine = telemetrySparkline(
    "load",
    samples.map((sample) => sample.load1),
    "load",
  );
  const memoryLine = telemetrySparkline(
    "memory",
    samples.map((sample) => sample.memoryPercent),
    "%",
  );
  const diskLine = telemetrySparkline(
    "disk",
    samples.map((sample) => sample.diskPercent),
    "%",
  );

  return `<div class="telemetry-strip">
      <div class="telemetry-strip-head">
        <span>box telemetry</span>
        <div>${health}</div>
      </div>
      ${loadLine}
      ${memoryLine}
      ${diskLine}
    </div>`;
}
|
||||
|
||||
/**
 * Merges the retained history and the latest snapshot into one list of
 * samples, unique by `capturedAt` and ordered ascending by `capturedAt`.
 *
 * Entries without a truthy `capturedAt` are ignored. When the latest snapshot
 * shares a `capturedAt` with a history entry, the latest snapshot wins.
 *
 * @param telemetry The latest telemetry snapshot on the lease, if any.
 * @param history Retained telemetry samples; may be undefined or not an array.
 * @returns A new array of samples, oldest first.
 */
function telemetrySamples(
  telemetry: LeaseRecord["telemetry"],
  history: LeaseRecord["telemetryHistory"],
): LeaseTelemetrySample[] {
  // The latest snapshot goes last so it overrides a history entry with the same key.
  const candidates = [...(Array.isArray(history) ? history : []), telemetry];
  const latestByTimestamp = new Map<string, LeaseTelemetrySample>();
  for (const candidate of candidates) {
    if (candidate?.capturedAt) {
      latestByTimestamp.set(candidate.capturedAt, candidate);
    }
  }

  // Array.from creates a fresh array, so sorting it in place touches no caller data.
  const ordered = Array.from(latestByTimestamp.values());
  ordered.sort((left, right) => left.capturedAt.localeCompare(right.capturedAt));
  return ordered;
}

/** One telemetry reading, i.e. the non-nullable form of `LeaseRecord["telemetry"]`. */
type LeaseTelemetrySample = NonNullable<LeaseRecord["telemetry"]>;
|
||||
|
||||
/**
 * Builds the status pills shown in the telemetry strip header.
 *
 * Pills are emitted in a fixed order:
 * 1. `no signal` (and nothing else) when there is no snapshot with a `capturedAt`.
 * 2. `live`, or `stale <relative time>` when the snapshot is older than ten
 *    minutes or its timestamp cannot be parsed.
 * 3. `memory N%` when memory use is at least 85%.
 * 4. `disk N%` when disk use is at least 85%.
 * 5. `load N.N` when the 1-minute load is at least 16.
 *
 * @param telemetry The latest telemetry snapshot on the lease, if any.
 * @returns The concatenated pill markup.
 */
function telemetryHealthPills(telemetry: LeaseRecord["telemetry"]): string {
  const pill = (tone: string, text: string): string =>
    `<span class="pill" data-tone="${tone}">${text}</span>`;

  if (!telemetry?.capturedAt) {
    return pill("warn", "no signal");
  }

  const staleAfterMs = 10 * 60 * 1000;
  const ageMs = Date.now() - Date.parse(telemetry.capturedAt);
  // An unparseable timestamp gives NaN, which is treated as stale.
  const isFresh = Number.isFinite(ageMs) && ageMs <= staleAfterMs;

  const pills: string[] = [
    isFresh
      ? pill("ok", "live")
      : pill("warn", `stale ${escapeHTML(relativeTime(telemetry.capturedAt))}`),
  ];

  const memoryPercent = telemetry.memoryPercent ?? 0;
  if (memoryPercent >= 85) {
    pills.push(pill("bad", `memory ${Math.round(memoryPercent)}%`));
  }

  const diskPercent = telemetry.diskPercent ?? 0;
  if (diskPercent >= 85) {
    pills.push(pill("bad", `disk ${Math.round(diskPercent)}%`));
  }

  const load1 = telemetry.load1 ?? 0;
  if (load1 >= 16) {
    pills.push(pill("warn", `load ${load1.toFixed(1)}`));
  }

  return pills.join("");
}
|
||||
|
||||
/**
 * Renders one labelled sparkline row: label, SVG trend line, latest value.
 *
 * Non-finite and undefined readings are dropped first. With fewer than two
 * usable readings there is no line to draw, so a "waiting for samples"
 * placeholder row is returned instead.
 *
 * The vertical scale is fixed at 100 for percentage series (`unit === "%"`);
 * otherwise it is the largest reading, with a floor of 1.
 *
 * @param label Row label; HTML-escaped before output.
 * @param rawValues Readings in chronological order; gaps may be undefined.
 * @param unit `"%"` for percentage series; any other value selects the
 *   non-percentage scale and formatting.
 * @returns The row markup.
 */
function telemetrySparkline(
  label: string,
  rawValues: Array<number | undefined>,
  unit: string,
): string {
  const values = rawValues.filter((value): value is number => Number.isFinite(value));
  const latest = values.at(-1);
  if (values.length < 2 || latest === undefined) {
    return `<div class="telemetry-line"><span>${escapeHTML(label)}</span><span class="muted">waiting for samples</span></div>`;
  }

  const max = unit === "%" ? 100 : Math.max(1, ...values);
  const points = telemetryPolylinePoints(values, max);
  // role="img" makes assistive technology treat the inline SVG as one labelled
  // image; aria-label alone on <svg> is not reliably announced.
  return `<div class="telemetry-line">
      <span>${escapeHTML(label)}</span>
      <svg class="telemetry-chart" role="img" viewBox="0 0 100 28" preserveAspectRatio="none" aria-label="${escapeHTML(label)} telemetry trend">
        <polyline points="${points}" />
      </svg>
      <span>${escapeHTML(formatTelemetryValue(latest, unit))}</span>
    </div>`;
}
|
||||
|
||||
/**
 * Converts a series of readings into an SVG `points` attribute value.
 *
 * X runs from 0 to 100 across the series; a single value sits at x = 0.
 * Y maps 0 to 26 and `max` to 2; readings are clamped to `[0, max]` first.
 * Each coordinate is formatted with one decimal place.
 *
 * A `max` that is not a positive finite number falls back to a scale of 1, and
 * a NaN reading is drawn on the baseline, so the output never contains `NaN`.
 *
 * @param values Readings in chronological order.
 * @param max The reading that maps to the top of the chart.
 * @returns Space-separated `x,y` pairs, or an empty string for no values.
 */
function telemetryPolylinePoints(values: number[], max: number): string {
  // Guard the divisor: 0, negative or NaN would otherwise yield NaN coordinates.
  const scale = Number.isFinite(max) && max > 0 ? max : 1;
  // Floor of 1 avoids dividing by zero when there is only one value.
  const lastIndex = Math.max(1, values.length - 1);
  return values
    .map((value, index) => {
      const clamped = Number.isNaN(value) ? 0 : Math.max(0, Math.min(value, scale));
      const x = (index / lastIndex) * 100;
      const y = 26 - (clamped / scale) * 24;
      return `${x.toFixed(1)},${y.toFixed(1)}`;
    })
    .join(" ");
}
|
||||
|
||||
/**
 * Formats the latest reading shown at the end of a sparkline row.
 *
 * @param value The reading to format.
 * @param unit `"%"` renders a rounded whole-number percentage; any other unit
 *   renders the value with two decimal places and no suffix.
 * @returns The display string.
 */
function formatTelemetryValue(value: number, unit: string): string {
  return unit === "%" ? `${Math.round(value)}%` : value.toFixed(2);
}
|
||||
|
||||
function runTelemetryRows(telemetry: RunRecord["telemetry"]): string {
|
||||
if (!telemetry) {
|
||||
return "";
|
||||
@ -952,6 +1073,14 @@ function html(title: string, body: string, status = 200, nonce = ""): Response {
|
||||
.meta-grid div { padding:8px 10px; border-bottom:1px solid var(--line-soft); }
|
||||
.meta-grid dt { color:var(--muted); font-size:11px; text-transform:uppercase; margin-bottom:3px; }
|
||||
.meta-grid dd { margin:0; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; }
|
||||
.telemetry-strip { display:grid; gap:7px; padding:9px 10px; border-top:1px solid var(--line-soft); background:var(--panel-2); }
|
||||
.telemetry-strip-head { display:flex; justify-content:space-between; align-items:center; gap:8px; color:var(--muted); font-size:11px; text-transform:uppercase; }
|
||||
.telemetry-strip-head div { display:flex; gap:4px; align-items:center; flex-wrap:wrap; justify-content:flex-end; }
|
||||
.telemetry-line { display:grid; grid-template-columns:58px minmax(0,1fr) 52px; gap:8px; align-items:center; min-height:24px; font-size:12px; }
|
||||
.telemetry-line > span:first-child { color:var(--muted); text-transform:uppercase; font-size:10px; }
|
||||
.telemetry-line > span:last-child { text-align:right; font-family:var(--mono); color:#d1fae5; }
|
||||
.telemetry-chart { width:100%; height:24px; display:block; overflow:visible; }
|
||||
.telemetry-chart polyline { fill:none; stroke:var(--accent); stroke-width:1.8; vector-effect:non-scaling-stroke; }
|
||||
.stop-form { padding:10px; }
|
||||
.bridge-grid { display:grid; gap:0; }
|
||||
.bridge-row { display:grid; grid-template-columns:minmax(0,1fr) auto auto; gap:8px; align-items:center; padding:9px 10px; border-bottom:1px solid var(--line-soft); }
|
||||
|
||||
@ -148,6 +148,7 @@ export interface LeaseRecord {
|
||||
lastTouchedAt?: string;
|
||||
expiresAt: string;
|
||||
telemetry?: LeaseTelemetry;
|
||||
telemetryHistory?: LeaseTelemetry[];
|
||||
releasedAt?: string;
|
||||
endedAt?: string;
|
||||
}
|
||||
|
||||
@ -435,7 +435,83 @@ describe("fleet lease identity and idle", () => {
|
||||
memoryTotalBytes: 2048,
|
||||
memoryPercent: 50,
|
||||
});
|
||||
expect(lease.telemetryHistory).toHaveLength(1);
|
||||
expect(lease.telemetryHistory?.[0]).toMatchObject({ load1: 0.42, memoryPercent: 50 });
|
||||
expect(Date.parse(lease.expiresAt)).toBeGreaterThan(expiresAt.getTime());
|
||||
|
||||
const secondHeartbeat = await fleet.fetch(
|
||||
request("POST", "/v1/leases/cbx_000000000001/heartbeat", {
|
||||
headers: {
|
||||
"x-crabbox-owner": "peter@example.com",
|
||||
"x-crabbox-org": "openclaw",
|
||||
},
|
||||
body: {
|
||||
telemetry: {
|
||||
capturedAt: "2026-05-05T01:03:03Z",
|
||||
source: "ssh-linux",
|
||||
load1: 0.84,
|
||||
memoryPercent: 55,
|
||||
},
|
||||
},
|
||||
}),
|
||||
);
|
||||
expect(secondHeartbeat.status).toBe(200);
|
||||
const second = (await secondHeartbeat.json()) as { lease: LeaseRecord };
|
||||
expect(second.lease.telemetry).toMatchObject({
|
||||
capturedAt: "2026-05-05T01:03:03.000Z",
|
||||
load1: 0.84,
|
||||
memoryPercent: 55,
|
||||
});
|
||||
expect(second.lease.telemetryHistory?.map((sample) => sample.load1)).toEqual([0.42, 0.84]);
|
||||
expect(second.lease.telemetryHistory?.map((sample) => sample.capturedAt)).toEqual([
|
||||
"2026-05-05T01:02:03.000Z",
|
||||
"2026-05-05T01:03:03.000Z",
|
||||
]);
|
||||
});
|
||||
|
||||
it("keeps lease telemetry history bounded to the latest samples", async () => {
  // Seed a lease whose history already holds 60 samples, one per minute
  // starting at 2026-05-05T01:00:00Z.
  const store = new MemoryStorage();
  const fleet = testFleet(store);
  store.seed(
    "lease:cbx_000000000001",
    testLease({
      id: "cbx_000000000001",
      slug: "blue-lobster",
      owner: "peter@example.com",
      org: "openclaw",
      expiresAt: new Date(Date.now() + 60 * 60 * 1000).toISOString(),
      telemetryHistory: Array.from({ length: 60 }, (_, minute) => ({
        capturedAt: new Date(Date.UTC(2026, 4, 5, 1, minute, 0)).toISOString(),
        source: "ssh-linux",
        load1: minute,
      })),
    }),
  );

  // One more heartbeat, newer than every seeded sample, addressed by slug.
  const ownerHeaders = {
    "x-crabbox-owner": "peter@example.com",
    "x-crabbox-org": "openclaw",
  };
  const response = await fleet.fetch(
    request("POST", "/v1/leases/blue-lobster/heartbeat", {
      headers: ownerHeaders,
      body: {
        telemetry: {
          capturedAt: "2026-05-05T02:00:00Z",
          source: "ssh-linux",
          load1: 61,
        },
      },
    }),
  );

  expect(response.status).toBe(200);
  const payload = (await response.json()) as { lease: LeaseRecord };
  const history = payload.lease.telemetryHistory;

  // Still 60 entries: the 01:00 sample is dropped and the new one is last.
  expect(history).toHaveLength(60);
  expect(history?.[0]?.capturedAt).toBe("2026-05-05T01:01:00.000Z");
  expect(history?.at(-1)).toMatchObject({
    capturedAt: "2026-05-05T02:00:00.000Z",
    load1: 61,
  });
});
|
||||
|
||||
it("hides exact lease IDs and lists from other non-admin users", async () => {
|
||||
@ -728,6 +804,22 @@ describe("fleet lease identity and idle", () => {
|
||||
diskPercent: 25,
|
||||
uptimeSeconds: 3600,
|
||||
},
|
||||
telemetryHistory: [
|
||||
{
|
||||
capturedAt: new Date(Date.now() - 45_000).toISOString(),
|
||||
source: "ssh-linux",
|
||||
load1: 0.22,
|
||||
memoryPercent: 42,
|
||||
diskPercent: 24,
|
||||
},
|
||||
{
|
||||
capturedAt: new Date(Date.now() - 30_000).toISOString(),
|
||||
source: "ssh-linux",
|
||||
load1: 0.32,
|
||||
memoryPercent: 47,
|
||||
diskPercent: 25,
|
||||
},
|
||||
],
|
||||
expiresAt: new Date(Date.now() + 60 * 60 * 1000).toISOString(),
|
||||
}),
|
||||
);
|
||||
@ -830,6 +922,10 @@ describe("fleet lease identity and idle", () => {
|
||||
expect(body).toContain("<dt>memory</dt><dd>1.0 KiB / 2.0 KiB (50%)</dd>");
|
||||
expect(body).toContain("<dt>disk</dt><dd>1.0 GiB / 4.0 GiB (25%)</dd>");
|
||||
expect(body).toContain("<dt>uptime</dt><dd>1h</dd>");
|
||||
expect(body).toContain("box telemetry");
|
||||
expect(body).toContain('class="telemetry-chart"');
|
||||
expect(body).toContain("<span>0.42</span>");
|
||||
expect(body).toContain("<span>50%</span>");
|
||||
expect(body).toContain("load 0.42 · mem 75% · +512 B");
|
||||
expect(body).toContain("table-search");
|
||||
expect(body).toContain("/portal/runs/run_000000000001");
|
||||
|
||||
Loading…
Reference in New Issue
Block a user