diff --git a/docs/REPORT_SCHEMA.md b/docs/REPORT_SCHEMA.md index 761c7b2..f3e4036 100644 --- a/docs/REPORT_SCHEMA.md +++ b/docs/REPORT_SCHEMA.md @@ -224,6 +224,87 @@ Current metrics include: - runtime dependency staging grouped by bundled plugin when OpenClaw emits `runtimeDeps.stage` spans with `pluginId` attributes +## Health And Readiness + +Records keep existing compatibility fields such as `timeToListeningMs`, +`timeToHealthReadyMs`, `readinessClassification`, `healthFailures`, and +`healthP95Ms`. New readers should use `records[*].measurements.health`: + +```json +{ + "schemaVersion": "kova.health.v1", + "readiness": { + "phaseId": "cold-start", + "listeningReadyAtMs": 2536, + "healthReadyAtMs": 3005, + "classification": "ready", + "severity": "pass", + "reason": "gateway became healthy within the readiness threshold", + "thresholdMs": 30000, + "deadlineMs": 120000, + "attempts": 4 + }, + "startupSamples": { + "scope": "startup-sample", + "count": 4, + "okCount": 1, + "failureCount": 3, + "p95Ms": 120, + "maxMs": 120, + "slowestPhaseId": "cold-start" + }, + "postReadySamples": { + "scope": "post-ready", + "count": 9, + "okCount": 9, + "failureCount": 0, + "p95Ms": 469, + "maxMs": 652, + "slowestPhaseId": "api-latency" + }, + "unknownSamples": { + "scope": "unknown", + "count": 0, + "okCount": 0, + "failureCount": 0, + "p95Ms": null, + "maxMs": null, + "slowestPhaseId": null + }, + "final": { + "scope": "final", + "gatewayState": "running", + "ok": true, + "healthOk": true, + "failureCount": 0, + "p95Ms": 90, + "maxMs": 90, + "slowestPhaseId": "final" + }, + "slowestSample": { + "scope": "post-ready", + "phaseId": "api-latency", + "durationMs": 652 + } +} +``` + +Scenario phases declare `healthScope` so the evaluator does not infer meaning +from phase ids. Allowed values are `readiness`, `startup-sample`, `post-ready`, +`final`, and `none`. Old or externally produced reports without phase scope are +treated as `unknown` when summarized for compatibility. + +Compatibility derivation: + +- `timeToListeningMs`: `measurements.health.readiness.listeningReadyAtMs` +- `timeToHealthReadyMs`: `measurements.health.readiness.healthReadyAtMs` +- `readinessClassification`: `measurements.health.readiness.classification` +- `healthFailures`: startup + post-ready + unknown + final health failures +- `healthP95Ms`: max startup/post-ready p95, falling back to old aggregate p95 + for old reports +- `startupHealthP95Ms`: `measurements.health.startupSamples.p95Ms` +- `postReadyHealthP95Ms`: `measurements.health.postReadySamples.p95Ms` + Role-specific thresholds can fail a scenario separately from total process-tree thresholds. For example, a report can show that `gateway` exceeded memory while `package-manager` stayed normal, or that `package-manager` spiked during local @@ -276,8 +357,8 @@ Aggregate metric fields include: - `samples` Current aggregate metrics include startup readiness, TCP listening, RSS, CPU, -event-loop delay, agent turn latency, health p95, and runtime dependency -staging. +event-loop delay, agent turn latency, compatibility health p95, startup health +p95, post-ready health p95, and runtime dependency staging. Baseline stores use schema `kova.baselines.v1`. Baseline read/write requires `--execute` so stored evidence comes from real OpenClaw runs, not dry-run plans. diff --git a/metrics/known.json b/metrics/known.json index 92fa8b9..58ef29e 100644 --- a/metrics/known.json +++ b/metrics/known.json @@ -30,6 +30,7 @@ "diagnosticPresent", "doctorFixMs", "eventLoopMaxMs", + "finalHealthFailures", "gatewayReadyHardTimeoutMs", "gatewayReadyMs", "gatewayResponsive", @@ -64,6 +65,8 @@ "pluginIndexPresent", "pluginInstallMs", "pluginLoadFailures", + "postReadyHealthFailures", + "postReadyHealthP95Ms", "pluginUpdateDryRunMs", "pluginsListMs", "preProviderDominanceRatio", @@ -89,6 +92,8 @@ "statusAfterFailureMs", "statusAfterModelsMs", "statusMs", + "startupHealthFailures", + "startupHealthP95Ms", "syncFsStallDetected", "tuiSmokeMs", "upgradeMs", diff --git a/scenarios/agent-auth-missing.json b/scenarios/agent-auth-missing.json index ef6e6f2..e2d7a88 100644 --- a/scenarios/agent-auth-missing.json +++ b/scenarios/agent-auth-missing.json @@ -42,7 +42,8 @@ "runtime binding", "startup readiness", "no Kova auth setup phase" - ] + ], + "healthScope": "readiness" }, { "id": "missing-auth-agent-turn", @@ -57,7 +58,8 @@ "no provider request", "process leak snapshot", "role resource samples" - ] + ], + "healthScope": "post-ready" }, { "id": "post-auth-failure-health", @@ -72,7 +74,8 @@ "auth failure logs", "plugin errors", "memory after auth failure" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/agent-cold-warm-message.json b/scenarios/agent-cold-warm-message.json index e5be32d..ca3c666 100644 --- a/scenarios/agent-cold-warm-message.json +++ b/scenarios/agent-cold-warm-message.json @@ -43,7 +43,8 @@ "gateway port", "runtime binding", "env created without service" - ] + ], + "healthScope": "none" }, { "id": "cold-agent-turn", @@ -58,7 +59,8 @@ "mock provider request timing", "gateway health after cold turn", "role resource samples" - ] + ], + "healthScope": "post-ready" }, { "id": "warm-agent-turn", @@ -73,7 +75,8 @@ "mock provider request timing", "cold/warm delta", "role resource samples" - ] + ], + "healthScope": "post-ready" }, { "id": "post-agent-health", @@ -86,7 +89,8 @@ "env status", "plugin errors", "memory after agent turns" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/agent-gateway-rpc-turn.json b/scenarios/agent-gateway-rpc-turn.json index 3c2121a..662b41a 100644 --- a/scenarios/agent-gateway-rpc-turn.json +++ b/scenarios/agent-gateway-rpc-turn.json @@ -37,7 +37,8 @@ "gateway port", "runtime binding", "env created without service" - ] + ], + "healthScope": "none" }, { "id": "gateway-start", @@ -51,7 +52,8 @@ "gateway service installed", "gateway service started", "startup readiness" - ] + ], + "healthScope": "readiness" }, { "id": "gateway-agent-turn", @@ -66,7 +68,8 @@ "mock provider request timing", "gateway health after turn", "role resource samples" - ] + ], + "healthScope": "post-ready" }, { "id": "post-agent-health", @@ -81,7 +84,8 @@ "provider logs", "plugin errors", "memory after agent turn" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/agent-long-session.json b/scenarios/agent-long-session.json index 2bea2bd..c1d8196 100644 --- a/scenarios/agent-long-session.json +++ b/scenarios/agent-long-session.json @@ -46,7 +46,8 @@ "gateway port", "runtime binding", "startup readiness" - ] + ], + "healthScope": "readiness" }, { "id": "cold-session-turn", @@ -60,7 +61,8 @@ "assistant text", "provider request timing", "role resource samples" - ] + ], + "healthScope": "post-ready" }, { "id": "warm-session-turn", @@ -74,7 +76,8 @@ "assistant text", "provider request timing", "cold/warm delta" - ] + ], + "healthScope": "post-ready" }, { "id": "session-turn-3", @@ -88,7 +91,8 @@ "assistant text", "provider request timing", "role resource samples" - ] + ], + "healthScope": "post-ready" }, { "id": "session-turn-4", @@ -102,7 +106,8 @@ "assistant text", "provider request timing", "role resource samples" - ] + ], + "healthScope": "post-ready" }, { "id": "session-turn-5", @@ -116,7 +121,8 @@ "assistant text", "provider request timing", "role resource samples" - ] + ], + "healthScope": "post-ready" }, { "id": "session-turn-6", @@ -130,7 +136,8 @@ "assistant text", "provider request timing", "role resource samples" - ] + ], + "healthScope": "post-ready" }, { "id": "post-session-health", @@ -146,7 +153,8 @@ "plugin errors", "memory after repeated turns", "process leak summary" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/agent-network-offline.json b/scenarios/agent-network-offline.json index afdbbfd..528ec36 100644 --- a/scenarios/agent-network-offline.json +++ b/scenarios/agent-network-offline.json @@ -36,7 +36,8 @@ "gateway port", "runtime binding", "startup readiness" - ] + ], + "healthScope": "readiness" }, { "id": "network-offline-turn", @@ -49,7 +50,8 @@ "bounded network failure", "gateway status after failure", "role resource samples" - ] + ], + "healthScope": "post-ready" }, { "id": "post-network-health", @@ -64,7 +66,8 @@ "network/provider failure logs", "plugin errors", "memory after network failure" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/agent-provider-concurrent.json b/scenarios/agent-provider-concurrent.json index 79ddef9..8fe2c45 100644 --- a/scenarios/agent-provider-concurrent.json +++ b/scenarios/agent-provider-concurrent.json @@ -47,7 +47,8 @@ "gateway port", "runtime binding", "startup readiness" - ] + ], + "healthScope": "readiness" }, { "id": "concurrent-provider-turns", @@ -63,7 +64,8 @@ "pre-provider timing", "role resource samples", "process leak snapshot" - ] + ], + "healthScope": "post-ready" }, { "id": "post-concurrency-health", @@ -78,7 +80,8 @@ "provider logs", "plugin errors", "memory after concurrent turns" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/agent-provider-malformed.json b/scenarios/agent-provider-malformed.json index 3cbc72e..6d89a41 100644 --- a/scenarios/agent-provider-malformed.json +++ b/scenarios/agent-provider-malformed.json @@ -38,7 +38,8 @@ "gateway port", "runtime binding", "startup readiness" - ] + ], + "healthScope": "readiness" }, { "id": "malformed-provider-turn", @@ -53,7 +54,8 @@ "malformed provider evidence", "gateway remains supervised", "role resource samples" - ] + ], + "healthScope": "post-ready" }, { "id": "post-failure-health", @@ -68,7 +70,8 @@ "provider logs", "plugin errors", "memory after malformed response" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/agent-provider-recovery.json b/scenarios/agent-provider-recovery.json index 367f4da..87661fd 100644 --- a/scenarios/agent-provider-recovery.json +++ b/scenarios/agent-provider-recovery.json @@ -42,7 +42,8 @@ "gateway port", "runtime binding", "startup readiness" - ] + ], + "healthScope": "readiness" }, { "id": "transient-provider-failure-turn", @@ -57,7 +58,8 @@ "provider 200 recovery evidence", "gateway remains supervised", "role resource samples" - ] + ], + "healthScope": "post-ready" }, { "id": "recovery-provider-turn", @@ -71,7 +73,8 @@ "provider recovery timing", "gateway remains healthy", "role resource samples" - ] + ], + "healthScope": "post-ready" }, { "id": "post-failure-health", @@ -86,7 +89,8 @@ "provider logs", "plugin errors", "memory after recovery" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/agent-provider-slow.json b/scenarios/agent-provider-slow.json index f4c04b3..2ee9eed 100644 --- a/scenarios/agent-provider-slow.json +++ b/scenarios/agent-provider-slow.json @@ -44,7 +44,8 @@ "gateway port", "runtime binding", "startup readiness" - ] + ], + "healthScope": "readiness" }, { "id": "slow-provider-turn", @@ -58,7 +59,8 @@ "provider delay timing", "pre-provider timing", "role resource samples" - ] + ], + "healthScope": "post-ready" }, { "id": "post-failure-health", @@ -73,7 +75,8 @@ "provider logs", "plugin errors", "memory after provider delay" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/agent-provider-streaming-stall.json b/scenarios/agent-provider-streaming-stall.json index c70a1f2..c6cd8a7 100644 --- a/scenarios/agent-provider-streaming-stall.json +++ b/scenarios/agent-provider-streaming-stall.json @@ -41,7 +41,8 @@ "gateway port", "runtime binding", "startup readiness" - ] + ], + "healthScope": "readiness" }, { "id": "streaming-stall-provider-turn", @@ -57,7 +58,8 @@ "process leak snapshot", "gateway remains supervised", "role resource samples" - ] + ], + "healthScope": "post-ready" }, { "id": "post-failure-health", @@ -72,7 +74,8 @@ "provider logs", "plugin errors", "memory after streaming stall" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/agent-provider-timeout.json b/scenarios/agent-provider-timeout.json index 26c30c9..14a49f1 100644 --- a/scenarios/agent-provider-timeout.json +++ b/scenarios/agent-provider-timeout.json @@ -39,7 +39,8 @@ "gateway port", "runtime binding", "startup readiness" - ] + ], + "healthScope": "readiness" }, { "id": "timeout-provider-turn", @@ -54,7 +55,8 @@ "provider timeout/abort timing", "gateway remains supervised", "role resource samples" - ] + ], + "healthScope": "post-ready" }, { "id": "post-failure-health", @@ -69,7 +71,8 @@ "provider logs", "plugin errors", "memory after timeout" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/browser-automation-smoke.json b/scenarios/browser-automation-smoke.json index 582adf0..1e48620 100644 --- a/scenarios/browser-automation-smoke.json +++ b/scenarios/browser-automation-smoke.json @@ -39,7 +39,8 @@ "gateway status", "gateway port", "readiness classification" - ] + ], + "healthScope": "readiness" }, { "id": "browser-smoke", @@ -54,7 +55,8 @@ "opened tab count", "snapshot timing", "browser stop timing" - ] + ], + "healthScope": "post-ready" }, { "id": "post-browser-health", @@ -68,7 +70,8 @@ "status after browser automation", "browser plugin errors", "gateway errors" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/bundled-plugin-startup.json b/scenarios/bundled-plugin-startup.json index 715069a..2ebe8a2 100644 --- a/scenarios/bundled-plugin-startup.json +++ b/scenarios/bundled-plugin-startup.json @@ -30,7 +30,8 @@ "bundled plugin count", "readiness classification", "dependency staging" - ] + ], + "healthScope": "readiness" }, { "id": "inspect", @@ -46,7 +47,8 @@ "registry refresh", "missing package/module errors", "plugin service failures" - ] + ], + "healthScope": "post-ready" }, { "id": "restart", @@ -61,7 +63,8 @@ "warm readiness", "bundled plugin reload", "runtime dependency reuse" - ] + ], + "healthScope": "readiness" } ], "proves": [ diff --git a/scenarios/bundled-runtime-deps.json b/scenarios/bundled-runtime-deps.json index 7b7863d..374d431 100644 --- a/scenarios/bundled-runtime-deps.json +++ b/scenarios/bundled-runtime-deps.json @@ -29,7 +29,8 @@ "dependency staging duration", "installed dependency list", "missing dependency errors" - ] + ], + "healthScope": "readiness" }, { "id": "warm-restart", @@ -44,7 +45,8 @@ "warm ready time", "dependency staging reuse", "missing dependency errors" - ] + ], + "healthScope": "readiness" } ], "proves": [ diff --git a/scenarios/cross-platform-smoke.json b/scenarios/cross-platform-smoke.json index f60584e..868cdf6 100644 --- a/scenarios/cross-platform-smoke.json +++ b/scenarios/cross-platform-smoke.json @@ -37,7 +37,8 @@ "Node version", "runtime version", "gateway port" - ] + ], + "healthScope": "readiness" }, { "id": "core-smoke", @@ -53,7 +54,8 @@ "plugin list", "filesystem stall logs", "health latency" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/dashboard-readiness.json b/scenarios/dashboard-readiness.json index eb1cfdd..c2e2d98 100644 --- a/scenarios/dashboard-readiness.json +++ b/scenarios/dashboard-readiness.json @@ -31,7 +31,8 @@ "gateway status", "gateway port", "readiness classification" - ] + ], + "healthScope": "readiness" }, { "id": "dashboard", @@ -44,7 +45,8 @@ "dashboard URL", "token handling", "command latency" - ] + ], + "healthScope": "post-ready" }, { "id": "post-dashboard-health", @@ -58,7 +60,8 @@ "status after dashboard command", "websocket disconnect logs", "gateway errors" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/dashboard-session-send-turn-existing-user.json b/scenarios/dashboard-session-send-turn-existing-user.json index afcfb3a..c737f68 100644 --- a/scenarios/dashboard-session-send-turn-existing-user.json +++ b/scenarios/dashboard-session-send-turn-existing-user.json @@ -41,7 +41,8 @@ "source env", "clone root", "cloned OpenClaw config" - ] + ], + "healthScope": "none" }, { "id": "upgrade", @@ -55,7 +56,8 @@ "upgrade JSON", "runtime binding", "post-upgrade service state" - ] + ], + "healthScope": "readiness" }, { "id": "gateway-start", @@ -69,7 +71,8 @@ "gateway service installed", "gateway service started", "startup readiness" - ] + ], + "healthScope": "readiness" }, { "id": "dashboard-session-turn", @@ -85,7 +88,8 @@ "provider timing", "gateway health after turn", "role resource samples" - ] + ], + "healthScope": "post-ready" }, { "id": "post-dashboard-health", @@ -101,7 +105,8 @@ "liveness warnings", "plugin errors", "memory after dashboard turn" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/dashboard-session-send-turn.json b/scenarios/dashboard-session-send-turn.json index 362ae6e..1057e39 100644 --- a/scenarios/dashboard-session-send-turn.json +++ b/scenarios/dashboard-session-send-turn.json @@ -38,7 +38,8 @@ "gateway port", "runtime binding", "env created without service" - ] + ], + "healthScope": "none" }, { "id": "gateway-start", @@ -52,7 +53,8 @@ "gateway service installed", "gateway service started", "startup readiness" - ] + ], + "healthScope": "readiness" }, { "id": "dashboard-session-turn", @@ -67,7 +69,8 @@ "mock provider request timing", "gateway health after turn", "role resource samples" - ] + ], + "healthScope": "post-ready" }, { "id": "post-dashboard-health", @@ -82,7 +85,8 @@ "provider logs", "plugin errors", "memory after dashboard turn" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/failure-injection.json b/scenarios/failure-injection.json index 933b2cb..aeecdda 100644 --- a/scenarios/failure-injection.json +++ b/scenarios/failure-injection.json @@ -27,7 +27,8 @@ "evidence": [ "baseline status", "gateway PID" - ] + ], + "healthScope": "readiness" }, { "id": "diagnostics", @@ -41,7 +42,8 @@ "error classification", "gateway survival", "recovery guidance" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/fresh-install.json b/scenarios/fresh-install.json index 759d5a6..6c03e68 100644 --- a/scenarios/fresh-install.json +++ b/scenarios/fresh-install.json @@ -30,7 +30,8 @@ "env name", "runtime binding", "gateway port" - ] + ], + "healthScope": "readiness" }, { "id": "readiness", @@ -45,7 +46,8 @@ "gateway state", "gateway PID", "health/status result" - ] + ], + "healthScope": "post-ready" }, { "id": "plugins", @@ -59,7 +61,8 @@ "plugins list output", "plugin update dry-run output", "missing dependency log scan" - ] + ], + "healthScope": "post-ready" }, { "id": "models", @@ -72,7 +75,8 @@ "models list duration", "timeout behavior", "gateway health after model list" - ] + ], + "healthScope": "post-ready" }, { "id": "logs", @@ -85,7 +89,8 @@ "startup logs", "missing dependency errors", "plugin metadata scan warnings" - ] + ], + "healthScope": "post-ready" }, { "id": "cleanup", @@ -96,7 +101,8 @@ ], "evidence": [ "destroy result" - ] + ], + "healthScope": "none" } ], "proves": [ diff --git a/scenarios/gateway-performance.json b/scenarios/gateway-performance.json index b076f5d..40f31bb 100644 --- a/scenarios/gateway-performance.json +++ b/scenarios/gateway-performance.json @@ -13,9 +13,9 @@ "thresholds": { "coldReadyMs": 30000, "warmReadyMs": 15000, - "healthP95Ms": 1000, "peakRssMb": 900, - "eventLoopMaxMs": 500 + "eventLoopMaxMs": 500, + "postReadyHealthP95Ms": 1000 }, "phases": [ { @@ -32,7 +32,8 @@ "RSS", "CPU", "startup logs" - ] + ], + "healthScope": "readiness" }, { "id": "api-latency", @@ -47,7 +48,8 @@ "command durations", "health after each command", "logs" - ] + ], + "healthScope": "post-ready" }, { "id": "warm-restart", @@ -61,7 +63,8 @@ "warm ready time", "RSS delta", "startup log delta" - ] + ], + "healthScope": "readiness" } ], "proves": [ diff --git a/scenarios/mcp-runtime-start-stop.json b/scenarios/mcp-runtime-start-stop.json index d74712b..e181f8e 100644 --- a/scenarios/mcp-runtime-start-stop.json +++ b/scenarios/mcp-runtime-start-stop.json @@ -37,7 +37,8 @@ "gateway status", "gateway port", "readiness classification" - ] + ], + "healthScope": "readiness" }, { "id": "mcp-bridge", @@ -51,7 +52,8 @@ "tools/list timing", "tool count", "bridge process exit" - ] + ], + "healthScope": "post-ready" }, { "id": "post-mcp-health", @@ -65,7 +67,8 @@ "status after MCP bridge", "MCP bridge errors", "gateway errors" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/media-understanding-timeout.json b/scenarios/media-understanding-timeout.json index d42dd60..ce47d91 100644 --- a/scenarios/media-understanding-timeout.json +++ b/scenarios/media-understanding-timeout.json @@ -39,7 +39,8 @@ "gateway port", "runtime binding", "startup readiness" - ] + ], + "healthScope": "readiness" }, { "id": "media-timeout", @@ -53,7 +54,8 @@ "provider timeout observed", "gateway status after timeout", "mock provider request log" - ] + ], + "healthScope": "post-ready" }, { "id": "post-media-health", @@ -68,7 +70,8 @@ "provider timeout logs", "plugin errors", "memory after media timeout" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/official-plugin-install.json b/scenarios/official-plugin-install.json index dba8c8a..b0a02e7 100644 --- a/scenarios/official-plugin-install.json +++ b/scenarios/official-plugin-install.json @@ -36,7 +36,8 @@ "evidence": [ "fresh env started", "baseline plugin list captured" - ] + ], + "healthScope": "readiness" }, { "id": "install", @@ -50,7 +51,8 @@ "security scanner results", "plugins appear in list", "registry refresh succeeds" - ] + ], + "healthScope": "post-ready" }, { "id": "restart", @@ -67,7 +69,8 @@ "official plugin remains installed", "plugin load logs", "missing dependency scan" - ] + ], + "healthScope": "readiness" } ], "proves": [ diff --git a/scenarios/openai-compatible-turn.json b/scenarios/openai-compatible-turn.json index 19c8f33..d5c2b4b 100644 --- a/scenarios/openai-compatible-turn.json +++ b/scenarios/openai-compatible-turn.json @@ -37,7 +37,8 @@ "gateway port", "runtime binding", "env created without service" - ] + ], + "healthScope": "none" }, { "id": "gateway-start", @@ -51,7 +52,8 @@ "gateway service installed", "gateway service started", "startup readiness" - ] + ], + "healthScope": "readiness" }, { "id": "openai-compatible-turn", @@ -66,7 +68,8 @@ "mock provider request timing", "gateway health after turn", "role resource samples" - ] + ], + "healthScope": "post-ready" }, { "id": "post-http-health", @@ -81,7 +84,8 @@ "provider logs", "plugin errors", "memory after HTTP turn" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/plugin-bad-manifest.json b/scenarios/plugin-bad-manifest.json index 060f324..99f9030 100644 --- a/scenarios/plugin-bad-manifest.json +++ b/scenarios/plugin-bad-manifest.json @@ -28,7 +28,8 @@ "evidence": [ "baseline gateway status", "readiness classification" - ] + ], + "healthScope": "readiness" }, { "id": "reject-invalid-plugin", @@ -41,7 +42,8 @@ "install command rejected", "validation error", "no install record committed" - ] + ], + "healthScope": "post-ready" }, { "id": "post-failure-health", @@ -56,7 +58,8 @@ "gateway status", "plugin list", "logs after invalid install" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/plugin-external-install.json b/scenarios/plugin-external-install.json index b3dedd2..9e6e280 100644 --- a/scenarios/plugin-external-install.json +++ b/scenarios/plugin-external-install.json @@ -29,7 +29,8 @@ "evidence": [ "baseline plugin list", "gateway readiness" - ] + ], + "healthScope": "readiness" }, { "id": "install", @@ -45,7 +46,8 @@ "plugin index update", "registry refresh", "plugin appears in list" - ] + ], + "healthScope": "post-ready" }, { "id": "restart", @@ -60,7 +62,8 @@ "restart readiness", "plugin load logs", "missing dependency scan" - ] + ], + "healthScope": "readiness" } ], "proves": [ diff --git a/scenarios/plugin-lifecycle.json b/scenarios/plugin-lifecycle.json index 2d9990b..2853cf8 100644 --- a/scenarios/plugin-lifecycle.json +++ b/scenarios/plugin-lifecycle.json @@ -28,7 +28,8 @@ "plugin list", "update dry-run", "runtime dependency errors" - ] + ], + "healthScope": "readiness" }, { "id": "restart", @@ -43,7 +44,8 @@ "restart status", "logs", "missing dependency scan" - ] + ], + "healthScope": "readiness" } ], "proves": [ diff --git a/scenarios/plugin-missing-runtime-deps.json b/scenarios/plugin-missing-runtime-deps.json index 75f8eb5..c75ed92 100644 --- a/scenarios/plugin-missing-runtime-deps.json +++ b/scenarios/plugin-missing-runtime-deps.json @@ -31,7 +31,8 @@ "install result", "plugin entry registered", "gateway readiness before load" - ] + ], + "healthScope": "readiness" }, { "id": "restart", @@ -46,7 +47,8 @@ "missing dependency diagnostic", "plugin load failure", "gateway remains supervised" - ] + ], + "healthScope": "readiness" }, { "id": "survival", @@ -59,7 +61,8 @@ "evidence": [ "status after plugin failure", "plugin list after failure" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/plugin-remove.json b/scenarios/plugin-remove.json index 9bbcd5f..a29b740 100644 --- a/scenarios/plugin-remove.json +++ b/scenarios/plugin-remove.json @@ -31,7 +31,8 @@ "evidence": [ "install record", "plugin appears before uninstall" - ] + ], + "healthScope": "readiness" }, { "id": "remove", @@ -46,7 +47,8 @@ "uninstall output", "install index cleanup", "registry after removal" - ] + ], + "healthScope": "post-ready" }, { "id": "restart", @@ -61,7 +63,8 @@ "restart readiness", "removed plugin not loaded", "missing dependency scan" - ] + ], + "healthScope": "readiness" } ], "proves": [ diff --git a/scenarios/plugin-update.json b/scenarios/plugin-update.json index d3784d2..d8cbfd1 100644 --- a/scenarios/plugin-update.json +++ b/scenarios/plugin-update.json @@ -31,7 +31,8 @@ "evidence": [ "plugin install record", "plugin appears in list" - ] + ], + "healthScope": "readiness" }, { "id": "update", @@ -46,7 +47,8 @@ "plugin update dry-run output", "tracked plugin metadata", "registry refresh" - ] + ], + "healthScope": "post-ready" }, { "id": "post-update-health", @@ -60,7 +62,8 @@ "status after update", "plugin lifecycle logs", "dependency errors" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/provider-models.json b/scenarios/provider-models.json index 21bc30f..343c8ec 100644 --- a/scenarios/provider-models.json +++ b/scenarios/provider-models.json @@ -28,7 +28,8 @@ "models list duration", "provider timeout warnings", "gateway status after model discovery" - ] + ], + "healthScope": "readiness" }, { "id": "logs", @@ -41,7 +42,8 @@ "timeout logs", "auth skip logs", "gateway stall logs" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/release-runtime-startup.json b/scenarios/release-runtime-startup.json index 2c61eca..931acaa 100644 --- a/scenarios/release-runtime-startup.json +++ b/scenarios/release-runtime-startup.json @@ -36,7 +36,8 @@ "time to listening", "time to health ready", "readiness classification" - ] + ], + "healthScope": "readiness" }, { "id": "post-start", @@ -52,7 +53,8 @@ "status command latency", "plugin list", "plugin startup health" - ] + ], + "healthScope": "post-ready" }, { "id": "startup-logs", @@ -66,7 +68,8 @@ "missing dependency errors", "plugin service failures", "startup phase logs" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/soak.json b/scenarios/soak.json index 9084fe6..4cdc924 100644 --- a/scenarios/soak.json +++ b/scenarios/soak.json @@ -16,7 +16,7 @@ "soakCommandFailures": 0, "soakHealthFailures": 0, "rssGrowthMb": 300, - "healthP95Ms": 1000 + "postReadyHealthP95Ms": 1000 }, "phases": [ { @@ -31,7 +31,8 @@ "baseline PID", "baseline RSS", "baseline health" - ] + ], + "healthScope": "readiness" }, { "id": "loop", @@ -44,7 +45,8 @@ "latency trend", "RSS trend", "logs during loop" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/tui-message-turn.json b/scenarios/tui-message-turn.json index 7221182..a2dac94 100644 --- a/scenarios/tui-message-turn.json +++ b/scenarios/tui-message-turn.json @@ -37,7 +37,8 @@ "gateway port", "runtime binding", "env created without service" - ] + ], + "healthScope": "none" }, { "id": "gateway-start", @@ -51,7 +52,8 @@ "gateway service installed", "gateway service started", "startup readiness" - ] + ], + "healthScope": "readiness" }, { "id": "tui-message-turn", @@ -66,7 +68,8 @@ "mock provider request timing", "gateway health after turn", "role resource samples" - ] + ], + "healthScope": "post-ready" }, { "id": "post-tui-health", @@ -81,7 +84,8 @@ "provider logs", "plugin errors", "memory after TUI turn" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/tui-responsiveness.json b/scenarios/tui-responsiveness.json index 968f635..bdd93a4 100644 --- a/scenarios/tui-responsiveness.json +++ b/scenarios/tui-responsiveness.json @@ -30,7 +30,8 @@ "evidence": [ "gateway status", "readiness classification" - ] + ], + "healthScope": "readiness" }, { "id": "tui-smoke", @@ -43,7 +44,8 @@ "TUI render time", "connected screen", "clean interrupt" - ] + ], + "healthScope": "post-ready" }, { "id": "post-tui-health", @@ -57,7 +59,8 @@ "status after TUI", "TUI disconnect logs", "gateway errors" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/upgrade-durable-clone-to-local-build.json b/scenarios/upgrade-durable-clone-to-local-build.json index 04f796b..22e827e 100644 --- a/scenarios/upgrade-durable-clone-to-local-build.json +++ b/scenarios/upgrade-durable-clone-to-local-build.json @@ -41,7 +41,8 @@ "source env", "clone root", "pre-upgrade service status" - ] + ], + "healthScope": "none" }, { "id": "upgrade", @@ -55,7 +56,8 @@ "snapshot id", "doctor/update output", "rollback status" - ] + ], + "healthScope": "readiness" }, { "id": "post-upgrade", @@ -74,7 +76,8 @@ "plugins install index", "doctor output", "gateway logs without missing dependency/plugin load failures" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/upgrade-existing-user.json b/scenarios/upgrade-existing-user.json index ac2c990..11ca671 100644 --- a/scenarios/upgrade-existing-user.json +++ b/scenarios/upgrade-existing-user.json @@ -28,7 +28,8 @@ "clone result", "source env", "clone root" - ] + ], + "healthScope": "none" }, { "id": "source-runtime", @@ -41,7 +42,8 @@ "evidence": [ "pre-upgrade runtime", "pre-upgrade gateway status" - ] + ], + "healthScope": "readiness" }, { "id": "upgrade", @@ -55,7 +57,8 @@ "snapshot id", "doctor/update output", "rollback status" - ] + ], + "healthScope": "readiness" }, { "id": "post-upgrade", @@ -72,7 +75,8 @@ "plugins folder/index presence", "doctor output", "gateway logs" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/upgrade-from-2026-4-20.json b/scenarios/upgrade-from-2026-4-20.json index 7aa122e..f8add4a 100644 --- a/scenarios/upgrade-from-2026-4-20.json +++ b/scenarios/upgrade-from-2026-4-20.json @@ -32,7 +32,8 @@ "clone result", "source env", "clone root" - ] + ], + "healthScope": "none" }, { "id": "source-runtime", @@ -47,7 +48,8 @@ "2026.4.20 upgrade output", "pre-upgrade service status", "pre-upgrade OpenClaw status" - ] + ], + "healthScope": "readiness" }, { "id": "upgrade", @@ -61,7 +63,8 @@ "snapshot id", "doctor/update output", "rollback status" - ] + ], + "healthScope": "readiness" }, { "id": "post-upgrade", @@ -78,7 +81,8 @@ "plugins folder/index presence", "doctor output", "gateway logs" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/upgrade-from-2026-4-24.json b/scenarios/upgrade-from-2026-4-24.json index f7a5563..50d6d09 100644 --- a/scenarios/upgrade-from-2026-4-24.json +++ b/scenarios/upgrade-from-2026-4-24.json @@ -35,7 +35,8 @@ "clone result", "source env", "clone root" - ] + ], + "healthScope": "none" }, { "id": "source-runtime", @@ -52,7 +53,8 @@ "pre-upgrade service status", "pre-upgrade OpenClaw status", "known 2026.4.24 plugin/runtime-deps logs" - ] + ], + "healthScope": "readiness" }, { "id": "upgrade", @@ -66,7 +68,8 @@ "snapshot id", "doctor/update output", "rollback status" - ] + ], + "healthScope": "readiness" }, { "id": "post-upgrade", @@ -83,7 +86,8 @@ "plugins install index", "doctor output", "gateway logs without missing dependency/plugin load failures" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/upgrade-stable-channel-to-beta.json b/scenarios/upgrade-stable-channel-to-beta.json index ad62a6b..949c68d 100644 --- a/scenarios/upgrade-stable-channel-to-beta.json +++ b/scenarios/upgrade-stable-channel-to-beta.json @@ -42,7 +42,8 @@ "stable channel start output", "pre-upgrade gateway status", "pre-upgrade OpenClaw status" - ] + ], + "healthScope": "readiness" }, { "id": "upgrade", @@ -56,7 +57,8 @@ "snapshot id", "doctor/update output", "rollback status" - ] + ], + "healthScope": "readiness" }, { "id": "post-upgrade", @@ -75,7 +77,8 @@ "plugins install index", "doctor output", "gateway logs without missing dependency/plugin load failures" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/upgrade-stable-channel-to-local-build.json b/scenarios/upgrade-stable-channel-to-local-build.json index f3669bf..940f47e 100644 --- a/scenarios/upgrade-stable-channel-to-local-build.json +++ b/scenarios/upgrade-stable-channel-to-local-build.json @@ -39,7 +39,8 @@ "stable channel start output", "pre-upgrade gateway status", "pre-upgrade OpenClaw status" - ] + ], + "healthScope": "readiness" }, { "id": "upgrade", @@ -53,7 +54,8 @@ "snapshot id", "doctor/update output", "rollback status" - ] + ], + "healthScope": "readiness" }, { "id": "post-upgrade", @@ -72,7 +74,8 @@ "plugins install index", "doctor output", "gateway logs without missing dependency/plugin load failures" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/scenarios/workspace-scan-pressure.json b/scenarios/workspace-scan-pressure.json index 99b3ea0..6dddba2 100644 --- a/scenarios/workspace-scan-pressure.json +++ b/scenarios/workspace-scan-pressure.json @@ -25,8 +25,8 @@ "soakCommandFailures": 0, "soakHealthFailures": 0, "soakHealthP95Ms": 1000, - "healthP95Ms": 1000, - "peakRssMb": 1000 + "peakRssMb": 1000, + "postReadyHealthP95Ms": 1000 }, "phases": [ { @@ -41,7 +41,8 @@ "startup readiness", "gateway PID", "baseline RSS and CPU" - ] + ], + "healthScope": "readiness" }, { "id": "restart-after-workspace", @@ -55,7 +56,8 @@ "restart readiness", "post-fixture gateway status", "resource samples during restart" - ] + ], + "healthScope": "readiness" }, { "id": "user-facing-commands", @@ -74,7 +76,8 @@ "short repeated command p95", "health p95", "RSS and CPU peaks" - ] + ], + "healthScope": "post-ready" } ], "proves": [ diff --git a/src/evaluator.mjs b/src/evaluator.mjs index db3a1ac..d1975e0 100644 --- a/src/evaluator.mjs +++ b/src/evaluator.mjs @@ -1,6 +1,7 @@ import { buildAgentTurnBreakdown } from "./collectors/agent-turns.mjs"; import { computeProviderTurnAttribution } from "./collectors/provider.mjs"; import { summarizeRuntimeDepsLogs } from "./collectors/logs.mjs"; +import { buildHealthMeasurement, deriveHealthCompatibility } from "./health.mjs"; import { resolveThresholdPolicy } from "./evaluation/thresholds.mjs"; import { checkAggregateThreshold, @@ -91,8 +92,15 @@ export function evaluateRecord(record, scenario, options = {}) { providerSimulation: agentProviderSimulation }); const finalGatewayState = record.finalMetrics?.service?.gatewayState ?? null; - const healthFailures = countHealthFailures(record); - const healthP95Ms = collectHealthP95(record); + const health = buildHealthMeasurement(record, scenario); + const healthCompatibility = deriveHealthCompatibility(health, record); + const healthFailures = healthCompatibility.healthFailures; + const healthP95Ms = healthCompatibility.healthP95Ms; + const startupHealthP95Ms = healthCompatibility.startupHealthP95Ms; + const postReadyHealthP95Ms = healthCompatibility.postReadyHealthP95Ms; + const startupHealthFailures = healthCompatibility.startupHealthFailures; + const postReadyHealthFailures = healthCompatibility.postReadyHealthFailures; + const finalHealthFailures = healthCompatibility.finalHealthFailures; const soakEvidence = collectSoakEvidence(allResults); const mcpBridgeEvidence = collectMcpBridgeEvidence(allResults); const browserAutomationEvidence = collectBrowserAutomationEvidence(allResults); @@ -101,10 +109,10 @@ export function evaluateRecord(record, scenario, options = {}) { const officialPluginEvidence = collectOfficialPluginEvidence(allResults); const listeningFailures = countListeningFailures(record); const tcpConnectMaxMs = collectTcpConnectMax(record); - const timeToListeningMs = collectTimeToListening(record); - const timeToHealthReadyMs = collectTimeToHealthReady(record); + const timeToListeningMs = healthCompatibility.timeToListeningMs ?? collectTimeToListening(record); + const timeToHealthReadyMs = healthCompatibility.timeToHealthReadyMs ?? collectTimeToHealthReady(record); const readinessFailures = countReadinessFailures(record); - const readinessClassification = collectWorstReadinessClassification(record); + const readinessClassification = healthCompatibility.readinessClassification ?? collectWorstReadinessClassification(record); const coldReadyMs = maxDurationWhere(allResults, (command) => command.startsWith("ocm start ")); const warmReadyMs = maxDurationWhere(allResults, (command) => command.startsWith("ocm service restart ")); const upgradeMs = maxDurationWhere(allResults, (command) => command.startsWith("ocm upgrade ")); @@ -203,6 +211,56 @@ export function evaluateRecord(record, scenario, options = {}) { }); } + if (typeof thresholds.startupHealthFailures === "number" && startupHealthFailures > thresholds.startupHealthFailures) { + violations.push({ + kind: "health", + metric: "startupHealthFailures", + expected: `<= ${thresholds.startupHealthFailures}`, + actual: startupHealthFailures, + message: `${startupHealthFailures} startup health check(s) failed, over threshold ${thresholds.startupHealthFailures}` + }); + } + + if (typeof thresholds.postReadyHealthFailures === "number" && postReadyHealthFailures > thresholds.postReadyHealthFailures) { + violations.push({ + kind: "health", + metric: "postReadyHealthFailures", + expected: `<= ${thresholds.postReadyHealthFailures}`, + actual: postReadyHealthFailures, + message: `${postReadyHealthFailures} post-ready liveness check(s) failed, over threshold ${thresholds.postReadyHealthFailures}` + }); + } + + if (typeof thresholds.finalHealthFailures === "number" && finalHealthFailures > thresholds.finalHealthFailures) { + violations.push({ + kind: "health", + metric: "finalHealthFailures", + expected: `<= ${thresholds.finalHealthFailures}`, + actual: finalHealthFailures, + message: `${finalHealthFailures} final health check(s) failed, over threshold ${thresholds.finalHealthFailures}` + }); + } + + if (typeof thresholds.startupHealthP95Ms === "number" && startupHealthP95Ms !== null && startupHealthP95Ms > thresholds.startupHealthP95Ms) { + violations.push({ + kind: "health", + metric: "startupHealthP95Ms", + expected: `<= ${thresholds.startupHealthP95Ms}`, + actual: startupHealthP95Ms, + message: `startup health sample p95 ${startupHealthP95Ms}ms exceeded threshold ${thresholds.startupHealthP95Ms}ms` + }); + } + + if (typeof thresholds.postReadyHealthP95Ms === "number" && postReadyHealthP95Ms !== null && postReadyHealthP95Ms > thresholds.postReadyHealthP95Ms) { + violations.push({ + kind: "health", + metric: "postReadyHealthP95Ms", + expected: `<= ${thresholds.postReadyHealthP95Ms}`, + actual: postReadyHealthP95Ms, + message: `post-ready liveness p95 ${postReadyHealthP95Ms}ms exceeded threshold ${thresholds.postReadyHealthP95Ms}ms` + }); + } + if (typeof thresholds.soakMinDurationMs === "number" && soakEvidence.durationMs !== null && soakEvidence.durationMs < thresholds.soakMinDurationMs) { violations.push({ kind: "soak", @@ -747,6 +805,7 @@ export function evaluateRecord(record, scenario, options = {}) { agentProviderRequestCount: providerTurn?.requestCount ?? null, agentProviderRequestMissing: providerTurn?.missingProviderRequest ?? null, agentProviderAttribution: providerTurn, + health, tcpConnectMaxMs, timeToListeningMs, timeToHealthReadyMs, @@ -758,6 +817,11 @@ export function evaluateRecord(record, scenario, options = {}) { finalGatewayState, healthFailures, healthP95Ms, + startupHealthP95Ms, + postReadyHealthP95Ms, + startupHealthFailures, + postReadyHealthFailures, + finalHealthFailures, soakEvidence, mcpBridgeEvidence, mcpInitializeMs: mcpBridgeEvidence.initializeMs, @@ -1882,26 +1946,6 @@ function countGatewayRestarts(record) { return commandRestarts + countLogMetric(record, "gatewayRestartMentions"); } -function collectHealthP95(record) { - const p95Values = []; - for (const phase of record.phases ?? []) { - const p95 = phase.metrics?.healthSummary?.p95Ms; - if (typeof p95 === "number") { - p95Values.push(p95); - } - } - - const finalP95 = record.finalMetrics?.healthSummary?.p95Ms; - if (typeof finalP95 === "number") { - p95Values.push(finalP95); - } - - if (p95Values.length === 0) { - return null; - } - return Math.max(...p95Values); -} - function collectSoakEvidence(results) { const loops = results .filter((result) => result.command?.includes("run-soak-loop.mjs")) diff --git a/src/health.mjs b/src/health.mjs new file mode 100644 index 0000000..fe06b8e --- /dev/null +++ b/src/health.mjs @@ -0,0 +1,319 @@ +export const HEALTH_SCHEMA = "kova.health.v1"; +export const HEALTH_SCOPES = ["readiness", "startup-sample", "post-ready", "final", "none", "unknown"]; + +const startupScopes = new Set(["readiness", "startup-sample"]); + +export function buildHealthMeasurement(record, scenario = null) { + const phaseContracts = new Map((scenario?.phases ?? []).map((phase) => [phase.id, phase])); + const entries = []; + for (const phase of record.phases ?? []) { + entries.push({ + source: "phase", + phaseId: phase.id ?? null, + scope: normalizeHealthScope(phase.healthScope ?? phaseContracts.get(phase.id)?.healthScope), + metrics: phase.metrics ?? null + }); + } + + const finalEntry = { + source: "final", + phaseId: "final", + scope: "final", + metrics: record.finalMetrics ?? null + }; + entries.push(finalEntry); + + const readiness = selectReadiness(entries); + const startupSamples = summarizeScopedSamples( + entries.filter((entry) => startupScopes.has(entry.scope)), + "startup-sample", + startupSamplesForEntry + ); + const postReadySamples = summarizeScopedSamples( + entries.filter((entry) => entry.scope === "post-ready"), + "post-ready", + postReadySamplesForEntry + ); + const unknownSamples = summarizeScopedSamples( + entries.filter((entry) => entry.scope === "unknown"), + "unknown", + postReadySamplesForEntry + ); + const final = summarizeFinalHealth(finalEntry.metrics); + const slowestSample = selectSlowestSample([startupSamples, postReadySamples, final]); + + return { + schemaVersion: HEALTH_SCHEMA, + readiness, + startupSamples, + postReadySamples, + unknownSamples, + final, + slowestSample + }; +} + +export function deriveHealthCompatibility(health, record = null) { + const startupHealthP95Ms = health?.startupSamples?.p95Ms ?? null; + const postReadyHealthP95Ms = health?.postReadySamples?.p95Ms ?? null; + const scopedP95Ms = maxNullable(startupHealthP95Ms, postReadyHealthP95Ms); + const oldP95Ms = record ? collectOldHealthP95(record) : null; + const startupFailures = health?.startupSamples?.failureCount ?? 0; + const postReadyFailures = health?.postReadySamples?.failureCount ?? 0; + const unknownFailures = health?.unknownSamples?.failureCount ?? 0; + const finalFailures = health?.final?.failureCount ?? 0; + + return { + timeToListeningMs: health?.readiness?.listeningReadyAtMs ?? null, + timeToHealthReadyMs: health?.readiness?.healthReadyAtMs ?? null, + readinessClassification: health?.readiness + ? { + phaseId: health.readiness.phaseId, + state: health.readiness.classification, + severity: health.readiness.severity, + reason: health.readiness.reason, + thresholdMs: health.readiness.thresholdMs, + deadlineMs: health.readiness.deadlineMs, + listeningReadyAtMs: health.readiness.listeningReadyAtMs, + healthReadyAtMs: health.readiness.healthReadyAtMs + } + : null, + healthFailures: startupFailures + postReadyFailures + unknownFailures + finalFailures, + healthP95Ms: scopedP95Ms ?? oldP95Ms, + startupHealthP95Ms, + postReadyHealthP95Ms, + startupHealthFailures: startupFailures, + postReadyHealthFailures: postReadyFailures, + finalHealthFailures: finalFailures + }; +} + +function normalizeHealthScope(scope) { + return typeof scope === "string" && HEALTH_SCOPES.includes(scope) ? scope : "unknown"; +} + +function selectReadiness(entries) { + const scoped = entries + .filter((entry) => startupScopes.has(entry.scope)) + .map((entry) => readinessValue(entry.metrics?.readiness, entry.phaseId)) + .filter(Boolean); + const candidates = scoped.length > 0 + ? scoped + : entries.map((entry) => readinessValue(entry.metrics?.readiness, entry.phaseId)).filter(Boolean); + if (candidates.length === 0) { + return null; + } + candidates.sort((left, right) => { + const rankDelta = readinessRank(right.classification) - readinessRank(left.classification); + if (rankDelta !== 0) { + return rankDelta; + } + return (right.healthReadyAtMs ?? 0) - (left.healthReadyAtMs ?? 0); + }); + return candidates[0]; +} + +function readinessValue(readiness, phaseId) { + if (!readiness?.classification || !(readiness.deadlineMs > 0)) { + return null; + } + return { + phaseId, + listeningReadyAtMs: readiness.listeningReadyAtMs, + healthReadyAtMs: readiness.healthReadyAtMs, + classification: readiness.classification.state, + severity: readiness.classification.severity, + reason: readiness.classification.reason, + thresholdMs: readiness.thresholdMs, + deadlineMs: readiness.deadlineMs, + attempts: readiness.attempts ?? null + }; +} + +function readinessRank(state) { + if (state === "hard-failure") { + return 4; + } + if (state === "unhealthy") { + return 3; + } + if (state === "slow-startup") { + return 2; + } + if (state === "ready") { + return 1; + } + return 0; +} + +function startupSamplesForEntry(entry) { + const attempts = entry.metrics?.readiness?.healthAttempts; + if (Array.isArray(attempts) && attempts.length > 0) { + return attempts; + } + return entry.metrics?.healthSamples ?? []; +} + +function postReadySamplesForEntry(entry) { + return entry.metrics?.healthSamples ?? []; +} + +function summarizeScopedSamples(entries, scope, sampleSelector) { + const samples = []; + for (const entry of entries) { + for (const sample of sampleSelector(entry)) { + samples.push({ ...sample, phaseId: entry.phaseId }); + } + } + if (samples.length > 0) { + return summarizeSamples(samples, scope); + } + + const summaries = entries + .map((entry) => ({ phaseId: entry.phaseId, summary: entry.metrics?.healthSummary })) + .filter((entry) => entry.summary); + if (summaries.length === 0) { + return emptyHealthSummary(scope); + } + + let slowestPhaseId = null; + let maxMs = null; + for (const { phaseId, summary } of summaries) { + if (typeof summary.maxMs === "number" && (maxMs === null || summary.maxMs > maxMs)) { + maxMs = summary.maxMs; + slowestPhaseId = phaseId; + } + } + + return { + scope, + count: sum(summaries, "count"), + okCount: sum(summaries, "okCount"), + failureCount: sum(summaries, "failureCount"), + minMs: minNullable(...summaries.map(({ summary }) => summary.minMs)), + p50Ms: maxNullable(...summaries.map(({ summary }) => summary.p50Ms)), + p95Ms: maxNullable(...summaries.map(({ summary }) => summary.p95Ms)), + maxMs, + slowestPhaseId + }; +} + +function summarizeSamples(samples, scope) { + const durations = samples + .map((sample) => sample.durationMs) + .filter((duration) => typeof duration === "number") + .sort((left, right) => left - right); + let slowestPhaseId = null; + let slowestMs = null; + for (const sample of samples) { + if (typeof sample.durationMs === "number" && (slowestMs === null || sample.durationMs > slowestMs)) { + slowestMs = sample.durationMs; + slowestPhaseId = sample.phaseId ?? null; + } + } + + return { + scope, + count: samples.length, + okCount: samples.filter((sample) => sample.ok === true).length, + failureCount: samples.filter((sample) => sample.ok !== true).length, + minMs: durations.at(0) ?? null, + p50Ms: percentile(durations, 0.5), + p95Ms: percentile(durations, 0.95), + maxMs: durations.at(-1) ?? null, + slowestPhaseId + }; +} + +function emptyHealthSummary(scope) { + return { + scope, + count: 0, + okCount: 0, + failureCount: 0, + minMs: null, + p50Ms: null, + p95Ms: null, + maxMs: null, + slowestPhaseId: null + }; +} + +function summarizeFinalHealth(metrics) { + const samples = Array.isArray(metrics?.healthSamples) ? metrics.healthSamples : []; + const summary = samples.length > 0 ? summarizeSamples(samples.map((sample) => ({ ...sample, phaseId: "final" })), "final") : null; + const fallbackFailureCount = healthFailureCount([metrics?.health]); + const failureCount = summary?.failureCount ?? metrics?.healthSummary?.failureCount ?? fallbackFailureCount; + const maxMs = summary?.maxMs ?? metrics?.healthSummary?.maxMs ?? metrics?.health?.durationMs ?? null; + const p95Ms = summary?.p95Ms ?? metrics?.healthSummary?.p95Ms ?? null; + const gatewayState = metrics?.service?.gatewayState ?? null; + const ok = metrics + ? (gatewayState === null ? failureCount === 0 : gatewayState === "running" && failureCount === 0) + : null; + return { + scope: "final", + gatewayState, + ok, + healthOk: metrics?.health?.ok ?? null, + failureCount, + p95Ms, + maxMs, + slowestPhaseId: maxMs === null ? null : "final" + }; +} + +function selectSlowestSample(summaries) { + let slowest = null; + for (const summary of summaries) { + if (!summary || typeof summary.maxMs !== "number") { + continue; + } + if (!slowest || summary.maxMs > slowest.durationMs) { + slowest = { + scope: summary.scope, + phaseId: summary.slowestPhaseId ?? null, + durationMs: summary.maxMs + }; + } + } + return slowest; +} + +function collectOldHealthP95(record) { + const values = []; + for (const phase of record?.phases ?? []) { + if (typeof phase.metrics?.healthSummary?.p95Ms === "number") { + values.push(phase.metrics.healthSummary.p95Ms); + } + } + if (typeof record?.finalMetrics?.healthSummary?.p95Ms === "number") { + values.push(record.finalMetrics.healthSummary.p95Ms); + } + return values.length === 0 ? null : Math.max(...values); +} + +function healthFailureCount(samples) { + return samples.filter((sample) => sample && sample.ok === false).length; +} + +function sum(entries, key) { + return entries.reduce((total, entry) => total + (entry.summary?.[key] ?? 0), 0); +} + +function maxNullable(...values) { + const numeric = values.filter((value) => typeof value === "number"); + return numeric.length === 0 ? null : Math.max(...numeric); +} + +function minNullable(...values) { + const numeric = values.filter((value) => typeof value === "number"); + return numeric.length === 0 ? null : Math.min(...numeric); +} + +function percentile(values, percentileValue) { + if (values.length === 0) { + return null; + } + const index = Math.ceil(values.length * percentileValue) - 1; + return values[Math.min(Math.max(index, 0), values.length - 1)]; +} diff --git a/src/performance/stats.mjs b/src/performance/stats.mjs index 9a04c92..feac96b 100644 --- a/src/performance/stats.mjs +++ b/src/performance/stats.mjs @@ -20,6 +20,8 @@ export const PERFORMANCE_METRICS = [ { id: "coldPreProviderMs", title: "Cold Pre-Provider", unit: "ms", regressionKey: "agentLatencyRegressionPercent" }, { id: "warmPreProviderMs", title: "Warm Pre-Provider", unit: "ms", regressionKey: "agentLatencyRegressionPercent" }, { id: "healthP95Ms", title: "Health p95", unit: "ms", regressionKey: "startupRegressionPercent" }, + { id: "startupHealthP95Ms", title: "Startup Health p95", unit: "ms", regressionKey: "startupRegressionPercent" }, + { id: "postReadyHealthP95Ms", title: "Post-Ready Health p95", unit: "ms", regressionKey: "startupRegressionPercent" }, { id: "runtimeDepsStagingMs", title: "Runtime Deps Staging", unit: "ms", regressionKey: "startupRegressionPercent" } ]; diff --git a/src/registries/scenarios.mjs b/src/registries/scenarios.mjs index 48e90ca..c934d29 100644 --- a/src/registries/scenarios.mjs +++ b/src/registries/scenarios.mjs @@ -1,6 +1,8 @@ import { scenariosDir } from "../paths.mjs"; import { assertNoShapeErrors, loadJsonRegistry, requireArray, requireKebabId, requireObject, requireString } from "./validate.mjs"; +export const HEALTH_SCOPES = ["readiness", "startup-sample", "post-ready", "final", "none"]; + export async function loadScenarios(selectedId) { return loadJsonRegistry({ dir: scenariosDir, @@ -108,6 +110,7 @@ function validatePhases(phases, errors) { requireKebabId(phase, "id", errors, prefix); requireString(phase, "title", errors, prefix); requireString(phase, "intent", errors, prefix); + requireString(phase, "healthScope", errors, prefix); requireArray(phase, "commands", errors, prefix); requireArray(phase, "evidence", errors, prefix); @@ -120,6 +123,9 @@ function validatePhases(phases, errors) { validateStringArray(phase.commands, `${prefix}.commands`, errors); validateStringArray(phase.evidence, `${prefix}.evidence`, errors); + if (typeof phase.healthScope === "string" && !HEALTH_SCOPES.includes(phase.healthScope)) { + errors.push(`${prefix}.healthScope must be one of ${HEALTH_SCOPES.join(", ")}`); + } if (phase.expectedAgentFailure !== undefined && typeof phase.expectedAgentFailure !== "boolean") { errors.push(`${prefix}.expectedAgentFailure must be a boolean when set`); } diff --git a/src/reporting/compare.mjs b/src/reporting/compare.mjs index 0c07699..6614521 100644 --- a/src/reporting/compare.mjs +++ b/src/reporting/compare.mjs @@ -21,6 +21,11 @@ const defaultThresholds = { timeToHealthReadyMs: 5000, readinessFailures: 0, healthP95Ms: 1000, + startupHealthFailures: 0, + postReadyHealthFailures: 0, + finalHealthFailures: 0, + startupHealthP95Ms: 1000, + postReadyHealthP95Ms: 1000, gatewayRestartCount: 0, providerTimeoutMentions: 0, eventLoopDelayMentions: 0, @@ -317,6 +322,8 @@ function diagnosticRecordSummary(record) { providerFinalMs: measurements.agentProviderFinalMs ?? measurements.coldProviderFinalMs ?? null, runtimeDepsStagingMs: measurements.runtimeDepsStagingMs ?? null, timeToHealthReadyMs: measurements.timeToHealthReadyMs ?? null, + startupHealthP95Ms: measurements.startupHealthP95Ms ?? null, + postReadyHealthP95Ms: measurements.postReadyHealthP95Ms ?? null, peakRssMb: measurements.peakRssMb ?? null }; } @@ -406,6 +413,11 @@ function metricDeltas(baseline, current) { "timeToHealthReadyMs", "healthP95Ms", "healthFailures", + "startupHealthP95Ms", + "postReadyHealthP95Ms", + "startupHealthFailures", + "postReadyHealthFailures", + "finalHealthFailures", "readinessFailures", "missingDependencyErrors", "pluginLoadFailures", diff --git a/src/reporting/report.mjs b/src/reporting/report.mjs index 617b733..77ab858 100644 --- a/src/reporting/report.mjs +++ b/src/reporting/report.mjs @@ -114,8 +114,7 @@ export function renderMarkdownReport(report) { lines.push(`- TCP connect max: ${record.measurements.tcpConnectMaxMs ?? "unknown"} ms`); lines.push(`- Missing dependency errors: ${record.measurements.missingDependencyErrors ?? "unknown"}`); lines.push(`- Final gateway state: ${record.measurements.finalGatewayState ?? "unknown"}`); - lines.push(`- Health failures: ${record.measurements.healthFailures ?? "unknown"}`); - lines.push(`- Health p95: ${record.measurements.healthP95Ms ?? "unknown"} ms`); + lines.push(...formatHealthMeasurementLines(record.measurements)); if (record.measurements.soakEvidence?.available) { lines.push(`- Soak trend: duration ${record.measurements.soakDurationMs ?? "unknown"} ms; iterations ${record.measurements.soakIterations ?? "unknown"}; command p95 ${record.measurements.soakCommandP95Ms ?? "unknown"} ms; health p95 ${record.measurements.soakHealthP95Ms ?? "unknown"} ms; RSS growth ${record.measurements.rssGrowthMb ?? "unknown"} MB; gateway RSS growth ${record.measurements.gatewayRssGrowthMb ?? "unknown"} MB`); } @@ -499,6 +498,26 @@ function formatMetrics(metrics) { return lines.length > 0 ? lines : ["- unavailable"]; } +function formatHealthMeasurementLines(measurements) { + const health = measurements.health; + const lines = [ + `- Health failures: ${measurements.healthFailures ?? "unknown"}`, + `- Startup health p95: ${measurements.startupHealthP95Ms ?? health?.startupSamples?.p95Ms ?? "unknown"} ms`, + `- Post-ready liveness p95: ${measurements.postReadyHealthP95Ms ?? health?.postReadySamples?.p95Ms ?? "unknown"} ms`, + `- Final health failures: ${measurements.finalHealthFailures ?? health?.final?.failureCount ?? "unknown"}` + ]; + if (health?.final) { + const healthState = health.final.healthOk === null ? "unknown" : health.final.healthOk ? "ok" : "not-ok"; + lines.push(`- Final health state: gateway ${health.final.gatewayState ?? "unknown"}; health ${healthState}`); + } + if (health?.slowestSample) { + lines.push(`- Slowest health sample: ${health.slowestSample.scope} ${health.slowestSample.phaseId ?? "unknown"} ${health.slowestSample.durationMs} ms`); + } else if (measurements.healthP95Ms !== null && measurements.healthP95Ms !== undefined) { + lines.push(`- Compatibility health p95: ${measurements.healthP95Ms} ms`); + } + return lines; +} + function formatRecordFailureCards(records = []) { const cards = records .filter((record) => !["PASS", "DRY-RUN"].includes(record.status)) @@ -670,7 +689,14 @@ function summarizeMeasurements(measurements) { timeToHealthReadyMs: measurements.timeToHealthReadyMs ?? null, readinessClassification: measurements.readinessClassification ?? null, readinessClassificationReason: measurements.readinessClassificationReason ?? null, + health: measurements.health ?? null, healthFailures: measurements.healthFailures ?? null, + healthP95Ms: measurements.healthP95Ms ?? null, + startupHealthP95Ms: measurements.startupHealthP95Ms ?? null, + postReadyHealthP95Ms: measurements.postReadyHealthP95Ms ?? null, + startupHealthFailures: measurements.startupHealthFailures ?? null, + postReadyHealthFailures: measurements.postReadyHealthFailures ?? null, + finalHealthFailures: measurements.finalHealthFailures ?? null, missingDependencyErrors: measurements.missingDependencyErrors ?? null, pluginLoadFailures: measurements.pluginLoadFailures ?? null, officialPluginEvidence: measurements.officialPluginEvidence ?? null, @@ -1150,6 +1176,7 @@ function compactRolePeaks(measurements) { function pushMeasurementBrief(lines, measurements, { compact }) { lines.push("Measurements:"); lines.push(`- startup: listening ${valueMs(measurements.timeToListeningMs)}; health ${valueMs(measurements.timeToHealthReadyMs)}; readiness ${measurements.readinessClassification ?? "unknown"}; gateway ${measurements.finalGatewayState ?? "unknown"}; restarts ${measurements.gatewayRestartCount ?? "unknown"}`); + lines.push(`- health: startup p95 ${valueMs(measurements.startupHealthP95Ms)}; post-ready p95 ${valueMs(measurements.postReadyHealthP95Ms)}; failures ${measurements.healthFailures ?? "unknown"}; final failures ${measurements.finalHealthFailures ?? "unknown"}${healthSlowestText(measurements)}`); lines.push(`- resources: peak RSS ${valueMb(measurements.peakRssMb)}; max CPU ${valuePercent(measurements.cpuPercentMax)}; samples ${measurements.resourceSampleCount ?? "unknown"}; roles ${rolePeakText(measurements)}`); lines.push(`- agent: turn ${valueMs(measurements.agentTurnMs, "not-run")}; cold/warm ${valueMs(measurements.coldAgentTurnMs)}/${valueMs(measurements.warmAgentTurnMs)}; cold-warm delta ${valueMs(measurements.agentColdWarmDeltaMs)}; pre-provider ${valueMs(measurements.agentPreProviderMs)}; provider ${valueMs(measurements.agentProviderFinalMs)}; cleanup ${valueMs(measurements.agentCleanupMaxMs)}; diagnosis ${measurements.agentLatencyDiagnosis?.kind ?? "unknown"}; leaks ${measurements.agentProcessLeakCount ?? "unknown"}`); lines.push(`- plugins/runtime: missing deps ${measurements.missingDependencyErrors ?? "unknown"}; plugin failures ${measurements.pluginLoadFailures ?? "unknown"}; runtime deps ${valueMs(measurements.runtimeDepsStagingMs)}${runtimeDepsPluginText(measurements)}; warm restages ${measurements.warmRuntimeDepsRestageCount ?? "unknown"}; warm reuse ${measurements.runtimeDepsWarmReuseOk ?? "unknown"}`); @@ -1223,6 +1250,14 @@ function valuePercent(value) { return value === null || value === undefined ? "unknown" : `${value}%`; } +function healthSlowestText(measurements) { + const slowest = measurements.health?.slowestSample; + if (!slowest) { + return ""; + } + return `; slowest ${slowest.scope}/${slowest.phaseId ?? "unknown"} ${valueMs(slowest.durationMs)}`; +} + function buildFixerPrompt({ report, primaryBlocker, why, measurements, evidence, likelyOwner }) { const parts = [ `Investigate OpenClaw release gate failure ${primaryBlocker}.`, diff --git a/src/runner.mjs b/src/runner.mjs index cd5f017..d6bc1e8 100644 --- a/src/runner.mjs +++ b/src/runner.mjs @@ -142,6 +142,7 @@ export async function executeScenario(scenario, context) { id: phase.id, title: phase.title, intent: phase.intent, + healthScope: phase.healthScope, expectedAgentFailure: phase.expectedAgentFailure === true, commands, evidence: phase.evidence ?? [], @@ -347,6 +348,7 @@ function buildPlannedPhases(scenario, context, envName, artifactDir, authPolicy) id: phase.id, title: phase.title, intent: phase.intent, + healthScope: phase.healthScope, expectedAgentFailure: phase.expectedAgentFailure === true, commands: materializeScenarioPhaseCommands(phase, context, envName, artifactDir), evidence: phase.evidence ?? [] diff --git a/src/selfcheck.mjs b/src/selfcheck.mjs index 8c72abb..0b49264 100644 --- a/src/selfcheck.mjs +++ b/src/selfcheck.mjs @@ -356,6 +356,8 @@ export async function runSelfCheck(flags = {}) { checks.push(markdownFailureCardsCheck()); checks.push(reportRecommendedNextScenarioCheck()); checks.push(readinessClassificationCheck()); + checks.push(healthReadinessModelCheck()); + checks.push(oldHealthReportCompatibilityCheck()); checks.push(await resourceRoleAttributionCheck(tmp)); checks.push(await resourceRootCommandRoleBoundaryCheck()); checks.push(await resourceRolePollutionCheck()); @@ -365,6 +367,7 @@ export async function runSelfCheck(flags = {}) { checks.push(await cleanupRetryCheck(tmp)); checks.push(stateRegistryValidationCheck()); checks.push(scenarioCloneFirstValidationCheck()); + checks.push(scenarioHealthScopeValidationCheck()); checks.push(scenarioStateCompatibilityCheck()); checks.push(await cpuProfileParserCheck()); checks.push(await heapProfileParserCheck()); @@ -3880,6 +3883,7 @@ function readinessClassificationCheck() { phases: [ { id: "provision", + healthScope: "readiness", results: [], metrics: { readiness: { @@ -3951,6 +3955,186 @@ function readinessClassificationCheck() { } } +function healthReadinessModelCheck() { + try { + const record = { + status: "PASS", + phases: [ + { + id: "cold-start", + healthScope: "readiness", + results: [], + metrics: { + readiness: { + deadlineMs: 90000, + thresholdMs: 30000, + ready: true, + listeningReady: true, + listeningReadyAtMs: 120, + healthReadyAtMs: 200, + attempts: 2, + classification: { + state: "ready", + severity: "pass", + reason: "gateway became healthy within the readiness threshold" + }, + healthAttempts: [ + { ok: false, durationMs: 25 }, + { ok: true, durationMs: 30 } + ] + }, + healthSamples: [ + { ok: true, durationMs: 40 } + ], + healthSummary: { + count: 1, + okCount: 1, + failureCount: 0, + minMs: 40, + p50Ms: 40, + p95Ms: 40, + maxMs: 40 + } + } + }, + { + id: "api-latency", + healthScope: "post-ready", + results: [], + metrics: { + healthSamples: [ + { ok: true, durationMs: 10 }, + { ok: true, durationMs: 1500 } + ], + healthSummary: { + count: 2, + okCount: 2, + failureCount: 0, + minMs: 10, + p50Ms: 10, + p95Ms: 1500, + maxMs: 1500 + } + } + } + ], + finalMetrics: { + service: { gatewayState: "running" }, + healthSamples: [{ ok: true, durationMs: 50 }], + healthSummary: { + count: 1, + okCount: 1, + failureCount: 0, + minMs: 50, + p50Ms: 50, + p95Ms: 50, + maxMs: 50 + }, + health: { ok: true, durationMs: 50 } + } + }; + const scenario = { + phases: [ + { id: "cold-start", healthScope: "readiness" }, + { id: "api-latency", healthScope: "post-ready" } + ], + thresholds: { + gatewayReadyMs: 30000, + postReadyHealthP95Ms: 1000 + } + }; + evaluateRecord(record, scenario); + assertEqual(record.status, "FAIL", "post-ready health threshold fails"); + assertEqual(record.measurements.health.schemaVersion, "kova.health.v1", "health schema"); + assertEqual(record.measurements.timeToHealthReadyMs, 200, "readiness health ready derived"); + assertEqual(record.measurements.startupHealthP95Ms, 30, "startup health p95 derived from readiness attempts"); + assertEqual(record.measurements.postReadyHealthP95Ms, 1500, "post-ready health p95 derived from post-ready samples"); + assertEqual(record.measurements.healthP95Ms, 1500, "compatibility health p95 derived"); + assertEqual(record.measurements.health.slowestSample.scope, "post-ready", "slowest health scope"); + assertEqual( + record.violations.some((violation) => violation.metric === "postReadyHealthP95Ms"), + true, + "post-ready health violation" + ); + assertEqual( + record.violations.some((violation) => violation.metric === "timeToHealthReadyMs"), + false, + "post-ready liveness does not masquerade as readiness" + ); + return { + id: "health-readiness-model", + status: "PASS", + command: "evaluate synthetic scoped health record", + durationMs: 0 + }; + } catch (error) { + return { + id: "health-readiness-model", + status: "FAIL", + command: "evaluate synthetic scoped health record", + durationMs: 0, + message: error.message + }; + } +} + +function oldHealthReportCompatibilityCheck() { + try { + const report = { + schemaVersion: "kova.report.v1", + generatedAt: "2026-05-05T00:00:00.000Z", + runId: "old-health-report", + mode: "execution", + target: "runtime:stable", + platform: { os: "darwin", release: "25.0.0", arch: "arm64", node: process.version }, + summary: { total: 1, statuses: { PASS: 1 } }, + records: [{ + scenario: "fresh-install", + title: "Fresh Install", + status: "PASS", + target: "runtime:stable", + state: { id: "fresh", title: "Fresh" }, + envName: "kova-old-health", + likelyOwner: "OpenClaw", + objective: "Old report compatibility.", + measurements: { + peakRssMb: 100, + cpuPercentMax: 10, + timeToListeningMs: 100, + timeToHealthReadyMs: 200, + readinessClassification: "ready", + healthFailures: 0, + healthP95Ms: 900, + finalGatewayState: "running" + }, + phases: [], + violations: [] + }] + }; + const summary = renderReportSummary(report, { structured: true }); + assertEqual(summary.scenarios[0].measurements.health, null, "old report health object absent"); + assertEqual(summary.scenarios[0].measurements.healthP95Ms, 900, "old report health p95 summarized"); + const markdown = renderMarkdownReport(report); + assertEqual(markdown.includes("Compatibility health p95: 900 ms"), true, "old report markdown compatibility p95"); + const comparison = compareReports(report, report, { thresholds: { healthP95Ms: 0 } }); + assertEqual(comparison.ok, true, "old report compare remains ok"); + return { + id: "old-health-report-compatibility", + status: "PASS", + command: "summarize and compare legacy health report shape", + durationMs: 0 + }; + } catch (error) { + return { + id: "old-health-report-compatibility", + status: "FAIL", + command: "summarize and compare legacy health report shape", + durationMs: 0, + message: error.message + }; + } +} + async function resourceRoleAttributionCheck(tmp) { const command = "node -e 'setTimeout(() => {}, 650)'"; const artifactPath = join(tmp, "resource-role-attribution.jsonl"); @@ -4739,6 +4923,7 @@ function scenarioCloneFirstValidationCheck() { id: "status", title: "Status", intent: "Unsafe durable source access.", + healthScope: "post-ready", commands: ["ocm service status {sourceEnv} --json"], evidence: ["status"] }] @@ -4762,6 +4947,7 @@ function scenarioCloneFirstValidationCheck() { id: "clone", title: "Clone", intent: "Clone source.", + healthScope: "none", commands: ["ocm env clone {sourceEnv} {env} --json", "ocm logs {sourceEnv} --tail 20"], evidence: ["clone"] }] @@ -4783,12 +4969,14 @@ function scenarioCloneFirstValidationCheck() { id: "clone", title: "Clone", intent: "Clone source.", + healthScope: "none", commands: ["ocm env clone {sourceEnv} {env} --json"], evidence: ["clone"] }, { id: "upgrade", title: "Upgrade", intent: "Upgrade disposable clone.", + healthScope: "readiness", commands: ["ocm upgrade {env} --channel beta --json"], evidence: ["upgrade"] }] @@ -4811,6 +4999,72 @@ function scenarioCloneFirstValidationCheck() { } } +function scenarioHealthScopeValidationCheck() { + try { + let rejectedMissing = false; + try { + validateScenarioShape({ + id: "missing-health-scope", + surface: "fresh-install", + title: "Missing Health Scope", + objective: "Scenario phase without an explicit health scope.", + tags: ["fresh-user"], + proves: ["baseline"], + thresholds: {}, + phases: [{ + id: "start", + title: "Start", + intent: "Start gateway.", + commands: ["ocm start {env} {startSelector} --json"], + evidence: ["start"] + }] + }, "missing-health-scope.json"); + } catch (error) { + rejectedMissing = /phases\[0\]\.healthScope must be a non-empty string/.test(error.message); + } + assertEqual(rejectedMissing, true, "missing healthScope rejected"); + + let rejectedInvalid = false; + try { + validateScenarioShape({ + id: "invalid-health-scope", + surface: "fresh-install", + title: "Invalid Health Scope", + objective: "Scenario phase with an invalid health scope.", + tags: ["fresh-user"], + proves: ["baseline"], + thresholds: {}, + phases: [{ + id: "start", + title: "Start", + intent: "Start gateway.", + healthScope: "startup", + commands: ["ocm start {env} {startSelector} --json"], + evidence: ["start"] + }] + }, "invalid-health-scope.json"); + } catch (error) { + rejectedInvalid = /healthScope must be one of/.test(error.message); + } + assertEqual(rejectedInvalid, true, "invalid healthScope rejected"); + + return { + id: "scenario-health-scope-validation", + status: "PASS", + command: "validate scenario health scope contracts", + durationMs: 0 + }; + } catch (error) { + return { + id: "scenario-health-scope-validation", + status: "FAIL", + command: "validate scenario health scope contracts", + durationMs: 0, + message: error.message + }; + } +} + function scenarioStateCompatibilityCheck() { try { let rejected = false; diff --git a/surfaces/agent-cli-local-turn.json b/surfaces/agent-cli-local-turn.json index 91b5ef1..030b4f1 100644 --- a/surfaces/agent-cli-local-turn.json +++ b/surfaces/agent-cli-local-turn.json @@ -18,8 +18,8 @@ "preProviderMs": 10000, "providerFinalMs": 3000, "agentCleanupMs": 5000, - "healthP95Ms": 1000, - "peakRssMb": 900 + "peakRssMb": 900, + "postReadyHealthP95Ms": 1000 }, "roleThresholds": { "gateway": { @@ -81,7 +81,7 @@ "warmPreProviderMs", "agentPreProviderP95Ms", "agentCleanupMaxMs", - "healthP95Ms", + "postReadyHealthP95Ms", "peakRssMb", "providerTimeoutMentions", "pluginLoadFailures" diff --git a/surfaces/agent-gateway-rpc-turn.json b/surfaces/agent-gateway-rpc-turn.json index 05439ff..5b87a0a 100644 --- a/surfaces/agent-gateway-rpc-turn.json +++ b/surfaces/agent-gateway-rpc-turn.json @@ -14,8 +14,8 @@ "agentTurnMs": 45000, "preProviderMs": 10000, "providerFinalMs": 3000, - "healthP95Ms": 1000, - "peakRssMb": 900 + "peakRssMb": 900, + "postReadyHealthP95Ms": 1000 }, "roleThresholds": { "gateway": { @@ -63,7 +63,7 @@ "agentTurnP95Ms", "agentTurnMaxMs", "coldPreProviderMs", - "healthP95Ms", + "postReadyHealthP95Ms", "peakRssMb", "pluginLoadFailures" ] diff --git a/surfaces/dashboard-session-send-turn.json b/surfaces/dashboard-session-send-turn.json index 6d0f912..9a73832 100644 --- a/surfaces/dashboard-session-send-turn.json +++ b/surfaces/dashboard-session-send-turn.json @@ -14,8 +14,8 @@ "agentTurnMs": 45000, "preProviderMs": 10000, "providerFinalMs": 3000, - "healthP95Ms": 1000, - "peakRssMb": 900 + "peakRssMb": 900, + "postReadyHealthP95Ms": 1000 }, "roleThresholds": { "gateway": { @@ -61,7 +61,7 @@ "agentTurnMs", "agentTurnP95Ms", "coldPreProviderMs", - "healthP95Ms", + "postReadyHealthP95Ms", "peakRssMb", "pluginLoadFailures" ] diff --git a/surfaces/gateway-performance.json b/surfaces/gateway-performance.json index c4595c3..7a79382 100644 --- a/surfaces/gateway-performance.json +++ b/surfaces/gateway-performance.json @@ -12,8 +12,8 @@ "thresholds": { "coldReadyMs": 30000, "warmReadyMs": 15000, - "healthP95Ms": 1000, - "peakRssMb": 900 + "peakRssMb": 900, + "postReadyHealthP95Ms": 1000 }, "roleThresholds": { "gateway": { @@ -60,7 +60,7 @@ "metrics": [ "coldReadyMs", "warmReadyMs", - "healthP95Ms", + "postReadyHealthP95Ms", "peakRssMb", "eventLoopMaxMs" ] diff --git a/surfaces/openai-compatible-turn.json b/surfaces/openai-compatible-turn.json index a16842d..fa18b37 100644 --- a/surfaces/openai-compatible-turn.json +++ b/surfaces/openai-compatible-turn.json @@ -14,8 +14,8 @@ "agentTurnMs": 45000, "preProviderMs": 10000, "providerFinalMs": 3000, - "healthP95Ms": 1000, - "peakRssMb": 900 + "peakRssMb": 900, + "postReadyHealthP95Ms": 1000 }, "roleThresholds": { "gateway": { @@ -62,7 +62,7 @@ "agentTurnMs", "agentTurnP95Ms", "coldPreProviderMs", - "healthP95Ms", + "postReadyHealthP95Ms", "peakRssMb", "pluginLoadFailures" ] diff --git a/surfaces/soak.json b/surfaces/soak.json index cb19e38..b3c947c 100644 --- a/surfaces/soak.json +++ b/surfaces/soak.json @@ -19,7 +19,7 @@ "rssGrowthMb": 300, "gatewayRssGrowthMb": 300, "soakHealthP95Ms": 1000, - "healthP95Ms": 1000 + "postReadyHealthP95Ms": 1000 }, "roleThresholds": { "gateway": { diff --git a/surfaces/tui-message-turn.json b/surfaces/tui-message-turn.json index 1542e25..59b20df 100644 --- a/surfaces/tui-message-turn.json +++ b/surfaces/tui-message-turn.json @@ -14,8 +14,8 @@ "agentTurnMs": 45000, "preProviderMs": 10000, "providerFinalMs": 3000, - "healthP95Ms": 1000, - "peakRssMb": 900 + "peakRssMb": 900, + "postReadyHealthP95Ms": 1000 }, "roleThresholds": { "gateway": { @@ -61,7 +61,7 @@ "agentTurnMs", "agentTurnP95Ms", "coldPreProviderMs", - "healthP95Ms", + "postReadyHealthP95Ms", "peakRssMb", "pluginLoadFailures" ] diff --git a/surfaces/workspace-scan.json b/surfaces/workspace-scan.json index f51dca9..9201dc4 100644 --- a/surfaces/workspace-scan.json +++ b/surfaces/workspace-scan.json @@ -18,9 +18,9 @@ "modelsListMs": 20000, "soakCommandP95Ms": 12000, "soakHealthP95Ms": 1000, - "healthP95Ms": 1000, "peakRssMb": 1000, - "eventLoopMaxMs": 500 + "eventLoopMaxMs": 500, + "postReadyHealthP95Ms": 1000 }, "roleThresholds": { "gateway": { @@ -74,7 +74,7 @@ "soakCommandP95Ms", "soakHealthP95Ms", "peakRssMb", - "healthP95Ms", + "postReadyHealthP95Ms", "eventLoopMaxMs" ] }