feat: add scoped health readiness model
This commit is contained in:
parent
ed3384487a
commit
b8b7c023eb
@ -224,6 +224,87 @@ Current metrics include:
|
||||
- runtime dependency staging grouped by bundled plugin when OpenClaw emits
|
||||
`runtimeDeps.stage` spans with `pluginId` attributes
|
||||
|
||||
## Health And Readiness
|
||||
|
||||
Records keep existing compatibility fields such as `timeToListeningMs`,
|
||||
`timeToHealthReadyMs`, `readinessClassification`, `healthFailures`, and
|
||||
`healthP95Ms`. New readers should use `records[*].measurements.health`:
|
||||
|
||||
```json
|
||||
{
|
||||
"schemaVersion": "kova.health.v1",
|
||||
"readiness": {
|
||||
"phaseId": "cold-start",
|
||||
"listeningReadyAtMs": 2536,
|
||||
"healthReadyAtMs": 3005,
|
||||
"classification": "ready",
|
||||
"severity": "pass",
|
||||
"reason": "gateway became healthy within the readiness threshold",
|
||||
"thresholdMs": 30000,
|
||||
"deadlineMs": 120000,
|
||||
"attempts": 4
|
||||
},
|
||||
"startupSamples": {
|
||||
"scope": "startup-sample",
|
||||
"count": 4,
|
||||
"okCount": 1,
|
||||
"failureCount": 3,
|
||||
"p95Ms": 120,
|
||||
"maxMs": 120,
|
||||
"slowestPhaseId": "cold-start"
|
||||
},
|
||||
"postReadySamples": {
|
||||
"scope": "post-ready",
|
||||
"count": 9,
|
||||
"okCount": 9,
|
||||
"failureCount": 0,
|
||||
"p95Ms": 469,
|
||||
"maxMs": 652,
|
||||
"slowestPhaseId": "api-latency"
|
||||
},
|
||||
"unknownSamples": {
|
||||
"scope": "unknown",
|
||||
"count": 0,
|
||||
"okCount": 0,
|
||||
"failureCount": 0,
|
||||
"p95Ms": null,
|
||||
"maxMs": null,
|
||||
"slowestPhaseId": null
|
||||
},
|
||||
"final": {
|
||||
"scope": "final",
|
||||
"gatewayState": "running",
|
||||
"ok": true,
|
||||
"healthOk": true,
|
||||
"failureCount": 0,
|
||||
"p95Ms": 90,
|
||||
"maxMs": 90,
|
||||
"slowestPhaseId": "final"
|
||||
},
|
||||
"slowestSample": {
|
||||
"scope": "post-ready",
|
||||
"phaseId": "api-latency",
|
||||
"durationMs": 652
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Scenario phases declare `healthScope` so the evaluator does not infer meaning
|
||||
from phase ids. Allowed values are `readiness`, `startup-sample`, `post-ready`,
|
||||
`final`, and `none`. Old or externally produced reports without phase scope are
|
||||
treated as `unknown` when summarized for compatibility.
|
||||
|
||||
Compatibility derivation:
|
||||
|
||||
- `timeToListeningMs`: `measurements.health.readiness.listeningReadyAtMs`
|
||||
- `timeToHealthReadyMs`: `measurements.health.readiness.healthReadyAtMs`
|
||||
- `readinessClassification`: `measurements.health.readiness.classification`
|
||||
- `healthFailures`: sum of the startup, post-ready, unknown, and final health failure counts
|
||||
- `healthP95Ms`: the larger of the startup and post-ready p95 values, falling back to the old aggregate p95
|
||||
for old reports
|
||||
- `startupHealthP95Ms`: `measurements.health.startupSamples.p95Ms`
|
||||
- `postReadyHealthP95Ms`: `measurements.health.postReadySamples.p95Ms`
|
||||
|
||||
Role-specific thresholds can fail a scenario separately from total process-tree
|
||||
thresholds. For example, a report can show that `gateway` exceeded memory while
|
||||
`package-manager` stayed normal, or that `package-manager` spiked during local
|
||||
@ -276,8 +357,8 @@ Aggregate metric fields include:
|
||||
- `samples`
|
||||
|
||||
Current aggregate metrics include startup readiness, TCP listening, RSS, CPU,
|
||||
event-loop delay, agent turn latency, health p95, and runtime dependency
|
||||
staging.
|
||||
event-loop delay, agent turn latency, compatibility health p95, startup health
|
||||
p95, post-ready health p95, and runtime dependency staging.
|
||||
|
||||
Baseline stores use schema `kova.baselines.v1`. Baseline read/write requires
|
||||
`--execute` so stored evidence comes from real OpenClaw runs, not dry-run plans.
|
||||
|
||||
@ -30,6 +30,7 @@
|
||||
"diagnosticPresent",
|
||||
"doctorFixMs",
|
||||
"eventLoopMaxMs",
|
||||
"finalHealthFailures",
|
||||
"gatewayReadyHardTimeoutMs",
|
||||
"gatewayReadyMs",
|
||||
"gatewayResponsive",
|
||||
@ -64,6 +65,8 @@
|
||||
"pluginIndexPresent",
|
||||
"pluginInstallMs",
|
||||
"pluginLoadFailures",
|
||||
"postReadyHealthFailures",
|
||||
"postReadyHealthP95Ms",
|
||||
"pluginUpdateDryRunMs",
|
||||
"pluginsListMs",
|
||||
"preProviderDominanceRatio",
|
||||
@ -89,6 +92,8 @@
|
||||
"statusAfterFailureMs",
|
||||
"statusAfterModelsMs",
|
||||
"statusMs",
|
||||
"startupHealthFailures",
|
||||
"startupHealthP95Ms",
|
||||
"syncFsStallDetected",
|
||||
"tuiSmokeMs",
|
||||
"upgradeMs",
|
||||
|
||||
@ -42,7 +42,8 @@
|
||||
"runtime binding",
|
||||
"startup readiness",
|
||||
"no Kova auth setup phase"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "missing-auth-agent-turn",
|
||||
@ -57,7 +58,8 @@
|
||||
"no provider request",
|
||||
"process leak snapshot",
|
||||
"role resource samples"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-auth-failure-health",
|
||||
@ -72,7 +74,8 @@
|
||||
"auth failure logs",
|
||||
"plugin errors",
|
||||
"memory after auth failure"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -43,7 +43,8 @@
|
||||
"gateway port",
|
||||
"runtime binding",
|
||||
"env created without service"
|
||||
]
|
||||
],
|
||||
"healthScope": "none"
|
||||
},
|
||||
{
|
||||
"id": "cold-agent-turn",
|
||||
@ -58,7 +59,8 @@
|
||||
"mock provider request timing",
|
||||
"gateway health after cold turn",
|
||||
"role resource samples"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "warm-agent-turn",
|
||||
@ -73,7 +75,8 @@
|
||||
"mock provider request timing",
|
||||
"cold/warm delta",
|
||||
"role resource samples"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-agent-health",
|
||||
@ -86,7 +89,8 @@
|
||||
"env status",
|
||||
"plugin errors",
|
||||
"memory after agent turns"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -37,7 +37,8 @@
|
||||
"gateway port",
|
||||
"runtime binding",
|
||||
"env created without service"
|
||||
]
|
||||
],
|
||||
"healthScope": "none"
|
||||
},
|
||||
{
|
||||
"id": "gateway-start",
|
||||
@ -51,7 +52,8 @@
|
||||
"gateway service installed",
|
||||
"gateway service started",
|
||||
"startup readiness"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "gateway-agent-turn",
|
||||
@ -66,7 +68,8 @@
|
||||
"mock provider request timing",
|
||||
"gateway health after turn",
|
||||
"role resource samples"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-agent-health",
|
||||
@ -81,7 +84,8 @@
|
||||
"provider logs",
|
||||
"plugin errors",
|
||||
"memory after agent turn"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -46,7 +46,8 @@
|
||||
"gateway port",
|
||||
"runtime binding",
|
||||
"startup readiness"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "cold-session-turn",
|
||||
@ -60,7 +61,8 @@
|
||||
"assistant text",
|
||||
"provider request timing",
|
||||
"role resource samples"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "warm-session-turn",
|
||||
@ -74,7 +76,8 @@
|
||||
"assistant text",
|
||||
"provider request timing",
|
||||
"cold/warm delta"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "session-turn-3",
|
||||
@ -88,7 +91,8 @@
|
||||
"assistant text",
|
||||
"provider request timing",
|
||||
"role resource samples"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "session-turn-4",
|
||||
@ -102,7 +106,8 @@
|
||||
"assistant text",
|
||||
"provider request timing",
|
||||
"role resource samples"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "session-turn-5",
|
||||
@ -116,7 +121,8 @@
|
||||
"assistant text",
|
||||
"provider request timing",
|
||||
"role resource samples"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "session-turn-6",
|
||||
@ -130,7 +136,8 @@
|
||||
"assistant text",
|
||||
"provider request timing",
|
||||
"role resource samples"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-session-health",
|
||||
@ -146,7 +153,8 @@
|
||||
"plugin errors",
|
||||
"memory after repeated turns",
|
||||
"process leak summary"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -36,7 +36,8 @@
|
||||
"gateway port",
|
||||
"runtime binding",
|
||||
"startup readiness"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "network-offline-turn",
|
||||
@ -49,7 +50,8 @@
|
||||
"bounded network failure",
|
||||
"gateway status after failure",
|
||||
"role resource samples"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-network-health",
|
||||
@ -64,7 +66,8 @@
|
||||
"network/provider failure logs",
|
||||
"plugin errors",
|
||||
"memory after network failure"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -47,7 +47,8 @@
|
||||
"gateway port",
|
||||
"runtime binding",
|
||||
"startup readiness"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "concurrent-provider-turns",
|
||||
@ -63,7 +64,8 @@
|
||||
"pre-provider timing",
|
||||
"role resource samples",
|
||||
"process leak snapshot"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-concurrency-health",
|
||||
@ -78,7 +80,8 @@
|
||||
"provider logs",
|
||||
"plugin errors",
|
||||
"memory after concurrent turns"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -38,7 +38,8 @@
|
||||
"gateway port",
|
||||
"runtime binding",
|
||||
"startup readiness"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "malformed-provider-turn",
|
||||
@ -53,7 +54,8 @@
|
||||
"malformed provider evidence",
|
||||
"gateway remains supervised",
|
||||
"role resource samples"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-failure-health",
|
||||
@ -68,7 +70,8 @@
|
||||
"provider logs",
|
||||
"plugin errors",
|
||||
"memory after malformed response"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -42,7 +42,8 @@
|
||||
"gateway port",
|
||||
"runtime binding",
|
||||
"startup readiness"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "transient-provider-failure-turn",
|
||||
@ -57,7 +58,8 @@
|
||||
"provider 200 recovery evidence",
|
||||
"gateway remains supervised",
|
||||
"role resource samples"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "recovery-provider-turn",
|
||||
@ -71,7 +73,8 @@
|
||||
"provider recovery timing",
|
||||
"gateway remains healthy",
|
||||
"role resource samples"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-failure-health",
|
||||
@ -86,7 +89,8 @@
|
||||
"provider logs",
|
||||
"plugin errors",
|
||||
"memory after recovery"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -44,7 +44,8 @@
|
||||
"gateway port",
|
||||
"runtime binding",
|
||||
"startup readiness"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "slow-provider-turn",
|
||||
@ -58,7 +59,8 @@
|
||||
"provider delay timing",
|
||||
"pre-provider timing",
|
||||
"role resource samples"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-failure-health",
|
||||
@ -73,7 +75,8 @@
|
||||
"provider logs",
|
||||
"plugin errors",
|
||||
"memory after provider delay"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -41,7 +41,8 @@
|
||||
"gateway port",
|
||||
"runtime binding",
|
||||
"startup readiness"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "streaming-stall-provider-turn",
|
||||
@ -57,7 +58,8 @@
|
||||
"process leak snapshot",
|
||||
"gateway remains supervised",
|
||||
"role resource samples"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-failure-health",
|
||||
@ -72,7 +74,8 @@
|
||||
"provider logs",
|
||||
"plugin errors",
|
||||
"memory after streaming stall"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -39,7 +39,8 @@
|
||||
"gateway port",
|
||||
"runtime binding",
|
||||
"startup readiness"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "timeout-provider-turn",
|
||||
@ -54,7 +55,8 @@
|
||||
"provider timeout/abort timing",
|
||||
"gateway remains supervised",
|
||||
"role resource samples"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-failure-health",
|
||||
@ -69,7 +71,8 @@
|
||||
"provider logs",
|
||||
"plugin errors",
|
||||
"memory after timeout"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -39,7 +39,8 @@
|
||||
"gateway status",
|
||||
"gateway port",
|
||||
"readiness classification"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "browser-smoke",
|
||||
@ -54,7 +55,8 @@
|
||||
"opened tab count",
|
||||
"snapshot timing",
|
||||
"browser stop timing"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-browser-health",
|
||||
@ -68,7 +70,8 @@
|
||||
"status after browser automation",
|
||||
"browser plugin errors",
|
||||
"gateway errors"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -30,7 +30,8 @@
|
||||
"bundled plugin count",
|
||||
"readiness classification",
|
||||
"dependency staging"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "inspect",
|
||||
@ -46,7 +47,8 @@
|
||||
"registry refresh",
|
||||
"missing package/module errors",
|
||||
"plugin service failures"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "restart",
|
||||
@ -61,7 +63,8 @@
|
||||
"warm readiness",
|
||||
"bundled plugin reload",
|
||||
"runtime dependency reuse"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -29,7 +29,8 @@
|
||||
"dependency staging duration",
|
||||
"installed dependency list",
|
||||
"missing dependency errors"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "warm-restart",
|
||||
@ -44,7 +45,8 @@
|
||||
"warm ready time",
|
||||
"dependency staging reuse",
|
||||
"missing dependency errors"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -37,7 +37,8 @@
|
||||
"Node version",
|
||||
"runtime version",
|
||||
"gateway port"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "core-smoke",
|
||||
@ -53,7 +54,8 @@
|
||||
"plugin list",
|
||||
"filesystem stall logs",
|
||||
"health latency"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -31,7 +31,8 @@
|
||||
"gateway status",
|
||||
"gateway port",
|
||||
"readiness classification"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "dashboard",
|
||||
@ -44,7 +45,8 @@
|
||||
"dashboard URL",
|
||||
"token handling",
|
||||
"command latency"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-dashboard-health",
|
||||
@ -58,7 +60,8 @@
|
||||
"status after dashboard command",
|
||||
"websocket disconnect logs",
|
||||
"gateway errors"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -41,7 +41,8 @@
|
||||
"source env",
|
||||
"clone root",
|
||||
"cloned OpenClaw config"
|
||||
]
|
||||
],
|
||||
"healthScope": "none"
|
||||
},
|
||||
{
|
||||
"id": "upgrade",
|
||||
@ -55,7 +56,8 @@
|
||||
"upgrade JSON",
|
||||
"runtime binding",
|
||||
"post-upgrade service state"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "gateway-start",
|
||||
@ -69,7 +71,8 @@
|
||||
"gateway service installed",
|
||||
"gateway service started",
|
||||
"startup readiness"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "dashboard-session-turn",
|
||||
@ -85,7 +88,8 @@
|
||||
"provider timing",
|
||||
"gateway health after turn",
|
||||
"role resource samples"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-dashboard-health",
|
||||
@ -101,7 +105,8 @@
|
||||
"liveness warnings",
|
||||
"plugin errors",
|
||||
"memory after dashboard turn"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -38,7 +38,8 @@
|
||||
"gateway port",
|
||||
"runtime binding",
|
||||
"env created without service"
|
||||
]
|
||||
],
|
||||
"healthScope": "none"
|
||||
},
|
||||
{
|
||||
"id": "gateway-start",
|
||||
@ -52,7 +53,8 @@
|
||||
"gateway service installed",
|
||||
"gateway service started",
|
||||
"startup readiness"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "dashboard-session-turn",
|
||||
@ -67,7 +69,8 @@
|
||||
"mock provider request timing",
|
||||
"gateway health after turn",
|
||||
"role resource samples"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-dashboard-health",
|
||||
@ -82,7 +85,8 @@
|
||||
"provider logs",
|
||||
"plugin errors",
|
||||
"memory after dashboard turn"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -27,7 +27,8 @@
|
||||
"evidence": [
|
||||
"baseline status",
|
||||
"gateway PID"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "diagnostics",
|
||||
@ -41,7 +42,8 @@
|
||||
"error classification",
|
||||
"gateway survival",
|
||||
"recovery guidance"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -30,7 +30,8 @@
|
||||
"env name",
|
||||
"runtime binding",
|
||||
"gateway port"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "readiness",
|
||||
@ -45,7 +46,8 @@
|
||||
"gateway state",
|
||||
"gateway PID",
|
||||
"health/status result"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "plugins",
|
||||
@ -59,7 +61,8 @@
|
||||
"plugins list output",
|
||||
"plugin update dry-run output",
|
||||
"missing dependency log scan"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "models",
|
||||
@ -72,7 +75,8 @@
|
||||
"models list duration",
|
||||
"timeout behavior",
|
||||
"gateway health after model list"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "logs",
|
||||
@ -85,7 +89,8 @@
|
||||
"startup logs",
|
||||
"missing dependency errors",
|
||||
"plugin metadata scan warnings"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "cleanup",
|
||||
@ -96,7 +101,8 @@
|
||||
],
|
||||
"evidence": [
|
||||
"destroy result"
|
||||
]
|
||||
],
|
||||
"healthScope": "none"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -13,9 +13,9 @@
|
||||
"thresholds": {
|
||||
"coldReadyMs": 30000,
|
||||
"warmReadyMs": 15000,
|
||||
"healthP95Ms": 1000,
|
||||
"peakRssMb": 900,
|
||||
"eventLoopMaxMs": 500
|
||||
"eventLoopMaxMs": 500,
|
||||
"postReadyHealthP95Ms": 1000
|
||||
},
|
||||
"phases": [
|
||||
{
|
||||
@ -32,7 +32,8 @@
|
||||
"RSS",
|
||||
"CPU",
|
||||
"startup logs"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "api-latency",
|
||||
@ -47,7 +48,8 @@
|
||||
"command durations",
|
||||
"health after each command",
|
||||
"logs"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "warm-restart",
|
||||
@ -61,7 +63,8 @@
|
||||
"warm ready time",
|
||||
"RSS delta",
|
||||
"startup log delta"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -37,7 +37,8 @@
|
||||
"gateway status",
|
||||
"gateway port",
|
||||
"readiness classification"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "mcp-bridge",
|
||||
@ -51,7 +52,8 @@
|
||||
"tools/list timing",
|
||||
"tool count",
|
||||
"bridge process exit"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-mcp-health",
|
||||
@ -65,7 +67,8 @@
|
||||
"status after MCP bridge",
|
||||
"MCP bridge errors",
|
||||
"gateway errors"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -39,7 +39,8 @@
|
||||
"gateway port",
|
||||
"runtime binding",
|
||||
"startup readiness"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "media-timeout",
|
||||
@ -53,7 +54,8 @@
|
||||
"provider timeout observed",
|
||||
"gateway status after timeout",
|
||||
"mock provider request log"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-media-health",
|
||||
@ -68,7 +70,8 @@
|
||||
"provider timeout logs",
|
||||
"plugin errors",
|
||||
"memory after media timeout"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -36,7 +36,8 @@
|
||||
"evidence": [
|
||||
"fresh env started",
|
||||
"baseline plugin list captured"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "install",
|
||||
@ -50,7 +51,8 @@
|
||||
"security scanner results",
|
||||
"plugins appear in list",
|
||||
"registry refresh succeeds"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "restart",
|
||||
@ -67,7 +69,8 @@
|
||||
"official plugin remains installed",
|
||||
"plugin load logs",
|
||||
"missing dependency scan"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -37,7 +37,8 @@
|
||||
"gateway port",
|
||||
"runtime binding",
|
||||
"env created without service"
|
||||
]
|
||||
],
|
||||
"healthScope": "none"
|
||||
},
|
||||
{
|
||||
"id": "gateway-start",
|
||||
@ -51,7 +52,8 @@
|
||||
"gateway service installed",
|
||||
"gateway service started",
|
||||
"startup readiness"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "openai-compatible-turn",
|
||||
@ -66,7 +68,8 @@
|
||||
"mock provider request timing",
|
||||
"gateway health after turn",
|
||||
"role resource samples"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-http-health",
|
||||
@ -81,7 +84,8 @@
|
||||
"provider logs",
|
||||
"plugin errors",
|
||||
"memory after HTTP turn"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -28,7 +28,8 @@
|
||||
"evidence": [
|
||||
"baseline gateway status",
|
||||
"readiness classification"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "reject-invalid-plugin",
|
||||
@ -41,7 +42,8 @@
|
||||
"install command rejected",
|
||||
"validation error",
|
||||
"no install record committed"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-failure-health",
|
||||
@ -56,7 +58,8 @@
|
||||
"gateway status",
|
||||
"plugin list",
|
||||
"logs after invalid install"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -29,7 +29,8 @@
|
||||
"evidence": [
|
||||
"baseline plugin list",
|
||||
"gateway readiness"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "install",
|
||||
@ -45,7 +46,8 @@
|
||||
"plugin index update",
|
||||
"registry refresh",
|
||||
"plugin appears in list"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "restart",
|
||||
@ -60,7 +62,8 @@
|
||||
"restart readiness",
|
||||
"plugin load logs",
|
||||
"missing dependency scan"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -28,7 +28,8 @@
|
||||
"plugin list",
|
||||
"update dry-run",
|
||||
"runtime dependency errors"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "restart",
|
||||
@ -43,7 +44,8 @@
|
||||
"restart status",
|
||||
"logs",
|
||||
"missing dependency scan"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -31,7 +31,8 @@
|
||||
"install result",
|
||||
"plugin entry registered",
|
||||
"gateway readiness before load"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "restart",
|
||||
@ -46,7 +47,8 @@
|
||||
"missing dependency diagnostic",
|
||||
"plugin load failure",
|
||||
"gateway remains supervised"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "survival",
|
||||
@ -59,7 +61,8 @@
|
||||
"evidence": [
|
||||
"status after plugin failure",
|
||||
"plugin list after failure"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -31,7 +31,8 @@
|
||||
"evidence": [
|
||||
"install record",
|
||||
"plugin appears before uninstall"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "remove",
|
||||
@ -46,7 +47,8 @@
|
||||
"uninstall output",
|
||||
"install index cleanup",
|
||||
"registry after removal"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "restart",
|
||||
@ -61,7 +63,8 @@
|
||||
"restart readiness",
|
||||
"removed plugin not loaded",
|
||||
"missing dependency scan"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -31,7 +31,8 @@
|
||||
"evidence": [
|
||||
"plugin install record",
|
||||
"plugin appears in list"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "update",
|
||||
@ -46,7 +47,8 @@
|
||||
"plugin update dry-run output",
|
||||
"tracked plugin metadata",
|
||||
"registry refresh"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-update-health",
|
||||
@ -60,7 +62,8 @@
|
||||
"status after update",
|
||||
"plugin lifecycle logs",
|
||||
"dependency errors"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -28,7 +28,8 @@
|
||||
"models list duration",
|
||||
"provider timeout warnings",
|
||||
"gateway status after model discovery"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "logs",
|
||||
@ -41,7 +42,8 @@
|
||||
"timeout logs",
|
||||
"auth skip logs",
|
||||
"gateway stall logs"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -36,7 +36,8 @@
|
||||
"time to listening",
|
||||
"time to health ready",
|
||||
"readiness classification"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "post-start",
|
||||
@ -52,7 +53,8 @@
|
||||
"status command latency",
|
||||
"plugin list",
|
||||
"plugin startup health"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "startup-logs",
|
||||
@ -66,7 +68,8 @@
|
||||
"missing dependency errors",
|
||||
"plugin service failures",
|
||||
"startup phase logs"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
"soakCommandFailures": 0,
|
||||
"soakHealthFailures": 0,
|
||||
"rssGrowthMb": 300,
|
||||
"healthP95Ms": 1000
|
||||
"postReadyHealthP95Ms": 1000
|
||||
},
|
||||
"phases": [
|
||||
{
|
||||
@ -31,7 +31,8 @@
|
||||
"baseline PID",
|
||||
"baseline RSS",
|
||||
"baseline health"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "loop",
|
||||
@ -44,7 +45,8 @@
|
||||
"latency trend",
|
||||
"RSS trend",
|
||||
"logs during loop"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -37,7 +37,8 @@
|
||||
"gateway port",
|
||||
"runtime binding",
|
||||
"env created without service"
|
||||
]
|
||||
],
|
||||
"healthScope": "none"
|
||||
},
|
||||
{
|
||||
"id": "gateway-start",
|
||||
@ -51,7 +52,8 @@
|
||||
"gateway service installed",
|
||||
"gateway service started",
|
||||
"startup readiness"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "tui-message-turn",
|
||||
@ -66,7 +68,8 @@
|
||||
"mock provider request timing",
|
||||
"gateway health after turn",
|
||||
"role resource samples"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-tui-health",
|
||||
@ -81,7 +84,8 @@
|
||||
"provider logs",
|
||||
"plugin errors",
|
||||
"memory after TUI turn"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -30,7 +30,8 @@
|
||||
"evidence": [
|
||||
"gateway status",
|
||||
"readiness classification"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "tui-smoke",
|
||||
@ -43,7 +44,8 @@
|
||||
"TUI render time",
|
||||
"connected screen",
|
||||
"clean interrupt"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
},
|
||||
{
|
||||
"id": "post-tui-health",
|
||||
@ -57,7 +59,8 @@
|
||||
"status after TUI",
|
||||
"TUI disconnect logs",
|
||||
"gateway errors"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -41,7 +41,8 @@
|
||||
"source env",
|
||||
"clone root",
|
||||
"pre-upgrade service status"
|
||||
]
|
||||
],
|
||||
"healthScope": "none"
|
||||
},
|
||||
{
|
||||
"id": "upgrade",
|
||||
@ -55,7 +56,8 @@
|
||||
"snapshot id",
|
||||
"doctor/update output",
|
||||
"rollback status"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "post-upgrade",
|
||||
@ -74,7 +76,8 @@
|
||||
"plugins install index",
|
||||
"doctor output",
|
||||
"gateway logs without missing dependency/plugin load failures"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -28,7 +28,8 @@
|
||||
"clone result",
|
||||
"source env",
|
||||
"clone root"
|
||||
]
|
||||
],
|
||||
"healthScope": "none"
|
||||
},
|
||||
{
|
||||
"id": "source-runtime",
|
||||
@ -41,7 +42,8 @@
|
||||
"evidence": [
|
||||
"pre-upgrade runtime",
|
||||
"pre-upgrade gateway status"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "upgrade",
|
||||
@ -55,7 +57,8 @@
|
||||
"snapshot id",
|
||||
"doctor/update output",
|
||||
"rollback status"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "post-upgrade",
|
||||
@ -72,7 +75,8 @@
|
||||
"plugins folder/index presence",
|
||||
"doctor output",
|
||||
"gateway logs"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -32,7 +32,8 @@
|
||||
"clone result",
|
||||
"source env",
|
||||
"clone root"
|
||||
]
|
||||
],
|
||||
"healthScope": "none"
|
||||
},
|
||||
{
|
||||
"id": "source-runtime",
|
||||
@ -47,7 +48,8 @@
|
||||
"2026.4.20 upgrade output",
|
||||
"pre-upgrade service status",
|
||||
"pre-upgrade OpenClaw status"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "upgrade",
|
||||
@ -61,7 +63,8 @@
|
||||
"snapshot id",
|
||||
"doctor/update output",
|
||||
"rollback status"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "post-upgrade",
|
||||
@ -78,7 +81,8 @@
|
||||
"plugins folder/index presence",
|
||||
"doctor output",
|
||||
"gateway logs"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -35,7 +35,8 @@
|
||||
"clone result",
|
||||
"source env",
|
||||
"clone root"
|
||||
]
|
||||
],
|
||||
"healthScope": "none"
|
||||
},
|
||||
{
|
||||
"id": "source-runtime",
|
||||
@ -52,7 +53,8 @@
|
||||
"pre-upgrade service status",
|
||||
"pre-upgrade OpenClaw status",
|
||||
"known 2026.4.24 plugin/runtime-deps logs"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "upgrade",
|
||||
@ -66,7 +68,8 @@
|
||||
"snapshot id",
|
||||
"doctor/update output",
|
||||
"rollback status"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "post-upgrade",
|
||||
@ -83,7 +86,8 @@
|
||||
"plugins install index",
|
||||
"doctor output",
|
||||
"gateway logs without missing dependency/plugin load failures"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -42,7 +42,8 @@
|
||||
"stable channel start output",
|
||||
"pre-upgrade gateway status",
|
||||
"pre-upgrade OpenClaw status"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "upgrade",
|
||||
@ -56,7 +57,8 @@
|
||||
"snapshot id",
|
||||
"doctor/update output",
|
||||
"rollback status"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "post-upgrade",
|
||||
@ -75,7 +77,8 @@
|
||||
"plugins install index",
|
||||
"doctor output",
|
||||
"gateway logs without missing dependency/plugin load failures"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -39,7 +39,8 @@
|
||||
"stable channel start output",
|
||||
"pre-upgrade gateway status",
|
||||
"pre-upgrade OpenClaw status"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "upgrade",
|
||||
@ -53,7 +54,8 @@
|
||||
"snapshot id",
|
||||
"doctor/update output",
|
||||
"rollback status"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "post-upgrade",
|
||||
@ -72,7 +74,8 @@
|
||||
"plugins install index",
|
||||
"doctor output",
|
||||
"gateway logs without missing dependency/plugin load failures"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -25,8 +25,8 @@
|
||||
"soakCommandFailures": 0,
|
||||
"soakHealthFailures": 0,
|
||||
"soakHealthP95Ms": 1000,
|
||||
"healthP95Ms": 1000,
|
||||
"peakRssMb": 1000
|
||||
"peakRssMb": 1000,
|
||||
"postReadyHealthP95Ms": 1000
|
||||
},
|
||||
"phases": [
|
||||
{
|
||||
@ -41,7 +41,8 @@
|
||||
"startup readiness",
|
||||
"gateway PID",
|
||||
"baseline RSS and CPU"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "restart-after-workspace",
|
||||
@ -55,7 +56,8 @@
|
||||
"restart readiness",
|
||||
"post-fixture gateway status",
|
||||
"resource samples during restart"
|
||||
]
|
||||
],
|
||||
"healthScope": "readiness"
|
||||
},
|
||||
{
|
||||
"id": "user-facing-commands",
|
||||
@ -74,7 +76,8 @@
|
||||
"short repeated command p95",
|
||||
"health p95",
|
||||
"RSS and CPU peaks"
|
||||
]
|
||||
],
|
||||
"healthScope": "post-ready"
|
||||
}
|
||||
],
|
||||
"proves": [
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
import { buildAgentTurnBreakdown } from "./collectors/agent-turns.mjs";
|
||||
import { computeProviderTurnAttribution } from "./collectors/provider.mjs";
|
||||
import { summarizeRuntimeDepsLogs } from "./collectors/logs.mjs";
|
||||
import { buildHealthMeasurement, deriveHealthCompatibility } from "./health.mjs";
|
||||
import { resolveThresholdPolicy } from "./evaluation/thresholds.mjs";
|
||||
import {
|
||||
checkAggregateThreshold,
|
||||
@ -91,8 +92,15 @@ export function evaluateRecord(record, scenario, options = {}) {
|
||||
providerSimulation: agentProviderSimulation
|
||||
});
|
||||
const finalGatewayState = record.finalMetrics?.service?.gatewayState ?? null;
|
||||
const healthFailures = countHealthFailures(record);
|
||||
const healthP95Ms = collectHealthP95(record);
|
||||
const health = buildHealthMeasurement(record, scenario);
|
||||
const healthCompatibility = deriveHealthCompatibility(health, record);
|
||||
const healthFailures = healthCompatibility.healthFailures;
|
||||
const healthP95Ms = healthCompatibility.healthP95Ms;
|
||||
const startupHealthP95Ms = healthCompatibility.startupHealthP95Ms;
|
||||
const postReadyHealthP95Ms = healthCompatibility.postReadyHealthP95Ms;
|
||||
const startupHealthFailures = healthCompatibility.startupHealthFailures;
|
||||
const postReadyHealthFailures = healthCompatibility.postReadyHealthFailures;
|
||||
const finalHealthFailures = healthCompatibility.finalHealthFailures;
|
||||
const soakEvidence = collectSoakEvidence(allResults);
|
||||
const mcpBridgeEvidence = collectMcpBridgeEvidence(allResults);
|
||||
const browserAutomationEvidence = collectBrowserAutomationEvidence(allResults);
|
||||
@ -101,10 +109,10 @@ export function evaluateRecord(record, scenario, options = {}) {
|
||||
const officialPluginEvidence = collectOfficialPluginEvidence(allResults);
|
||||
const listeningFailures = countListeningFailures(record);
|
||||
const tcpConnectMaxMs = collectTcpConnectMax(record);
|
||||
const timeToListeningMs = collectTimeToListening(record);
|
||||
const timeToHealthReadyMs = collectTimeToHealthReady(record);
|
||||
const timeToListeningMs = healthCompatibility.timeToListeningMs ?? collectTimeToListening(record);
|
||||
const timeToHealthReadyMs = healthCompatibility.timeToHealthReadyMs ?? collectTimeToHealthReady(record);
|
||||
const readinessFailures = countReadinessFailures(record);
|
||||
const readinessClassification = collectWorstReadinessClassification(record);
|
||||
const readinessClassification = healthCompatibility.readinessClassification ?? collectWorstReadinessClassification(record);
|
||||
const coldReadyMs = maxDurationWhere(allResults, (command) => command.startsWith("ocm start "));
|
||||
const warmReadyMs = maxDurationWhere(allResults, (command) => command.startsWith("ocm service restart "));
|
||||
const upgradeMs = maxDurationWhere(allResults, (command) => command.startsWith("ocm upgrade "));
|
||||
@ -203,6 +211,56 @@ export function evaluateRecord(record, scenario, options = {}) {
|
||||
});
|
||||
}
|
||||
|
||||
if (typeof thresholds.startupHealthFailures === "number" && startupHealthFailures > thresholds.startupHealthFailures) {
|
||||
violations.push({
|
||||
kind: "health",
|
||||
metric: "startupHealthFailures",
|
||||
expected: `<= ${thresholds.startupHealthFailures}`,
|
||||
actual: startupHealthFailures,
|
||||
message: `${startupHealthFailures} startup health check(s) failed, over threshold ${thresholds.startupHealthFailures}`
|
||||
});
|
||||
}
|
||||
|
||||
if (typeof thresholds.postReadyHealthFailures === "number" && postReadyHealthFailures > thresholds.postReadyHealthFailures) {
|
||||
violations.push({
|
||||
kind: "health",
|
||||
metric: "postReadyHealthFailures",
|
||||
expected: `<= ${thresholds.postReadyHealthFailures}`,
|
||||
actual: postReadyHealthFailures,
|
||||
message: `${postReadyHealthFailures} post-ready liveness check(s) failed, over threshold ${thresholds.postReadyHealthFailures}`
|
||||
});
|
||||
}
|
||||
|
||||
if (typeof thresholds.finalHealthFailures === "number" && finalHealthFailures > thresholds.finalHealthFailures) {
|
||||
violations.push({
|
||||
kind: "health",
|
||||
metric: "finalHealthFailures",
|
||||
expected: `<= ${thresholds.finalHealthFailures}`,
|
||||
actual: finalHealthFailures,
|
||||
message: `${finalHealthFailures} final health check(s) failed, over threshold ${thresholds.finalHealthFailures}`
|
||||
});
|
||||
}
|
||||
|
||||
if (typeof thresholds.startupHealthP95Ms === "number" && startupHealthP95Ms !== null && startupHealthP95Ms > thresholds.startupHealthP95Ms) {
|
||||
violations.push({
|
||||
kind: "health",
|
||||
metric: "startupHealthP95Ms",
|
||||
expected: `<= ${thresholds.startupHealthP95Ms}`,
|
||||
actual: startupHealthP95Ms,
|
||||
message: `startup health sample p95 ${startupHealthP95Ms}ms exceeded threshold ${thresholds.startupHealthP95Ms}ms`
|
||||
});
|
||||
}
|
||||
|
||||
if (typeof thresholds.postReadyHealthP95Ms === "number" && postReadyHealthP95Ms !== null && postReadyHealthP95Ms > thresholds.postReadyHealthP95Ms) {
|
||||
violations.push({
|
||||
kind: "health",
|
||||
metric: "postReadyHealthP95Ms",
|
||||
expected: `<= ${thresholds.postReadyHealthP95Ms}`,
|
||||
actual: postReadyHealthP95Ms,
|
||||
message: `post-ready liveness p95 ${postReadyHealthP95Ms}ms exceeded threshold ${thresholds.postReadyHealthP95Ms}ms`
|
||||
});
|
||||
}
|
||||
|
||||
if (typeof thresholds.soakMinDurationMs === "number" && soakEvidence.durationMs !== null && soakEvidence.durationMs < thresholds.soakMinDurationMs) {
|
||||
violations.push({
|
||||
kind: "soak",
|
||||
@ -747,6 +805,7 @@ export function evaluateRecord(record, scenario, options = {}) {
|
||||
agentProviderRequestCount: providerTurn?.requestCount ?? null,
|
||||
agentProviderRequestMissing: providerTurn?.missingProviderRequest ?? null,
|
||||
agentProviderAttribution: providerTurn,
|
||||
health,
|
||||
tcpConnectMaxMs,
|
||||
timeToListeningMs,
|
||||
timeToHealthReadyMs,
|
||||
@ -758,6 +817,11 @@ export function evaluateRecord(record, scenario, options = {}) {
|
||||
finalGatewayState,
|
||||
healthFailures,
|
||||
healthP95Ms,
|
||||
startupHealthP95Ms,
|
||||
postReadyHealthP95Ms,
|
||||
startupHealthFailures,
|
||||
postReadyHealthFailures,
|
||||
finalHealthFailures,
|
||||
soakEvidence,
|
||||
mcpBridgeEvidence,
|
||||
mcpInitializeMs: mcpBridgeEvidence.initializeMs,
|
||||
@ -1882,26 +1946,6 @@ function countGatewayRestarts(record) {
|
||||
return commandRestarts + countLogMetric(record, "gatewayRestartMentions");
|
||||
}
|
||||
|
||||
/**
 * Returns the largest health p95 reported by any phase summary or by the
 * final metrics summary, or null when no numeric p95 is present.
 *
 * @param {object} record Run record with optional `phases` and `finalMetrics`.
 * @returns {number|null} Highest p95 in milliseconds, or null.
 */
function collectHealthP95(record) {
  const phaseP95s = [...(record.phases ?? [])].map((phase) => phase.metrics?.healthSummary?.p95Ms);
  const numericP95s = [...phaseP95s, record.finalMetrics?.healthSummary?.p95Ms]
    .filter((value) => typeof value === "number");
  return numericP95s.length === 0 ? null : Math.max(...numericP95s);
}
|
||||
|
||||
function collectSoakEvidence(results) {
|
||||
const loops = results
|
||||
.filter((result) => result.command?.includes("run-soak-loop.mjs"))
|
||||
|
||||
319
src/health.mjs
Normal file
319
src/health.mjs
Normal file
@ -0,0 +1,319 @@
|
||||
export const HEALTH_SCHEMA = "kova.health.v1";
|
||||
export const HEALTH_SCOPES = ["readiness", "startup-sample", "post-ready", "final", "none", "unknown"];
|
||||
|
||||
const startupScopes = new Set(["readiness", "startup-sample"]);
|
||||
|
||||
/**
 * Builds the scoped health measurement (`HEALTH_SCHEMA`) for a run record.
 *
 * Each phase becomes an entry whose scope comes from the phase itself or,
 * when absent, from the scenario phase with the same id; the value is passed
 * through normalizeHealthScope. The record's final metrics are always added
 * as a dedicated "final" entry.
 *
 * @param {object} record Run record with optional `phases` and `finalMetrics`.
 * @param {object|null} [scenario] Scenario whose `phases[*].healthScope` is used as a fallback.
 * @returns {object} Readiness, per-scope sample summaries, final health and the slowest sample.
 */
export function buildHealthMeasurement(record, scenario = null) {
  const contractsById = new Map((scenario?.phases ?? []).map((phase) => [phase.id, phase]));

  const phaseEntries = [...(record.phases ?? [])].map((phase) => ({
    source: "phase",
    phaseId: phase.id ?? null,
    scope: normalizeHealthScope(phase.healthScope ?? contractsById.get(phase.id)?.healthScope),
    metrics: phase.metrics ?? null
  }));
  const finalEntry = {
    source: "final",
    phaseId: "final",
    scope: "final",
    metrics: record.finalMetrics ?? null
  };
  const entries = [...phaseEntries, finalEntry];
  const entriesWhere = (matchesScope) => entries.filter((entry) => matchesScope(entry.scope));

  const readiness = selectReadiness(entries);
  const startupSamples = summarizeScopedSamples(
    entriesWhere((scope) => startupScopes.has(scope)),
    "startup-sample",
    startupSamplesForEntry
  );
  const postReadySamples = summarizeScopedSamples(
    entriesWhere((scope) => scope === "post-ready"),
    "post-ready",
    postReadySamplesForEntry
  );
  const unknownSamples = summarizeScopedSamples(
    entriesWhere((scope) => scope === "unknown"),
    "unknown",
    postReadySamplesForEntry
  );
  const final = summarizeFinalHealth(finalEntry.metrics);

  return {
    schemaVersion: HEALTH_SCHEMA,
    readiness,
    startupSamples,
    postReadySamples,
    unknownSamples,
    final,
    // Only startup, post-ready and final summaries compete for the slowest sample.
    slowestSample: selectSlowestSample([startupSamples, postReadySamples, final])
  };
}
|
||||
|
||||
/**
 * Derives the flat compatibility fields from a scoped health measurement.
 *
 * `healthP95Ms` is the larger of the startup and post-ready p95 values and
 * falls back to the p95 collected from the record's health summaries when
 * neither is available. `healthFailures` is the sum of the startup,
 * post-ready, unknown and final failure counts.
 *
 * @param {object|null|undefined} health Result of buildHealthMeasurement.
 * @param {object|null} [record] Run record used only for the p95 fallback.
 * @returns {object} Flat readiness, failure-count and p95 fields.
 */
export function deriveHealthCompatibility(health, record = null) {
  const readiness = health?.readiness;
  const startupHealthP95Ms = health?.startupSamples?.p95Ms ?? null;
  const postReadyHealthP95Ms = health?.postReadySamples?.p95Ms ?? null;
  const legacyP95Ms = record ? collectOldHealthP95(record) : null;

  const startupHealthFailures = health?.startupSamples?.failureCount ?? 0;
  const postReadyHealthFailures = health?.postReadySamples?.failureCount ?? 0;
  const unknownHealthFailures = health?.unknownSamples?.failureCount ?? 0;
  const finalHealthFailures = health?.final?.failureCount ?? 0;

  let readinessClassification = null;
  if (readiness) {
    readinessClassification = {
      phaseId: readiness.phaseId,
      state: readiness.classification,
      severity: readiness.severity,
      reason: readiness.reason,
      thresholdMs: readiness.thresholdMs,
      deadlineMs: readiness.deadlineMs,
      listeningReadyAtMs: readiness.listeningReadyAtMs,
      healthReadyAtMs: readiness.healthReadyAtMs
    };
  }

  return {
    timeToListeningMs: readiness?.listeningReadyAtMs ?? null,
    timeToHealthReadyMs: readiness?.healthReadyAtMs ?? null,
    readinessClassification,
    healthFailures: startupHealthFailures + postReadyHealthFailures + unknownHealthFailures + finalHealthFailures,
    healthP95Ms: maxNullable(startupHealthP95Ms, postReadyHealthP95Ms) ?? legacyP95Ms,
    startupHealthP95Ms,
    postReadyHealthP95Ms,
    startupHealthFailures,
    postReadyHealthFailures,
    finalHealthFailures
  };
}
|
||||
|
||||
/**
 * Maps any value to a member of HEALTH_SCOPES, using "unknown" for values
 * that are not strings or are not listed there.
 *
 * @param {unknown} scope Candidate scope value.
 * @returns {string} A scope listed in HEALTH_SCOPES, or "unknown".
 */
function normalizeHealthScope(scope) {
  if (typeof scope !== "string") {
    return "unknown";
  }
  return HEALTH_SCOPES.includes(scope) ? scope : "unknown";
}
|
||||
|
||||
/**
 * Picks the readiness result to report for a run.
 *
 * Entries in a startup scope are preferred; when none of them yields a
 * readiness value, every entry is considered. Among candidates the highest
 * readinessRank wins, with ties broken by the larger `healthReadyAtMs`
 * (missing values count as 0).
 *
 * @param {Array<object>} entries Health entries with `scope`, `phaseId` and `metrics`.
 * @returns {object|null} The selected readiness value, or null when none exists.
 */
function selectReadiness(entries) {
  const toReadiness = (entry) => readinessValue(entry.metrics?.readiness, entry.phaseId);
  const worstThenSlowest = (left, right) =>
    (readinessRank(right.classification) - readinessRank(left.classification))
    || ((right.healthReadyAtMs ?? 0) - (left.healthReadyAtMs ?? 0));

  const fromStartupScopes = entries
    .filter((entry) => startupScopes.has(entry.scope))
    .map(toReadiness)
    .filter(Boolean);
  const pool = fromStartupScopes.length > 0
    ? fromStartupScopes
    : entries.map(toReadiness).filter(Boolean);

  if (pool.length === 0) {
    return null;
  }
  pool.sort(worstThenSlowest);
  return pool[0];
}
|
||||
|
||||
/**
 * Flattens a phase's readiness metrics into the reported readiness shape.
 *
 * @param {object|null|undefined} readiness Readiness metrics with a nested `classification`.
 * @param {string|null} phaseId Id of the phase the metrics belong to.
 * @returns {object|null} Flattened readiness, or null when there is no
 *   classification or `deadlineMs` is not greater than zero.
 */
function readinessValue(readiness, phaseId) {
  const classification = readiness?.classification;
  const hasDeadline = readiness?.deadlineMs > 0;
  if (!classification || !hasDeadline) {
    return null;
  }

  const { listeningReadyAtMs, healthReadyAtMs, thresholdMs, deadlineMs } = readiness;
  return {
    phaseId,
    listeningReadyAtMs,
    healthReadyAtMs,
    classification: classification.state,
    severity: classification.severity,
    reason: classification.reason,
    thresholdMs,
    deadlineMs,
    attempts: readiness.attempts ?? null
  };
}
|
||||
|
||||
/**
 * Ranks a readiness classification state; a higher rank is a worse outcome.
 *
 * @param {unknown} state Classification state.
 * @returns {number} 4 for "hard-failure", 3 for "unhealthy", 2 for
 *   "slow-startup", 1 for "ready", and 0 for anything else.
 */
function readinessRank(state) {
  switch (state) {
    case "hard-failure":
      return 4;
    case "unhealthy":
      return 3;
    case "slow-startup":
      return 2;
    case "ready":
      return 1;
    default:
      return 0;
  }
}
|
||||
|
||||
/**
 * Selects the health samples that describe startup for one entry.
 *
 * Readiness health attempts are preferred when present and non-empty;
 * otherwise the entry's `healthSamples` are used. A `healthSamples` value
 * that is not an array is treated as absent, so malformed metrics cannot make
 * the caller's iteration throw or be iterated character by character. This
 * matches the Array.isArray guard in summarizeFinalHealth.
 *
 * @param {object} entry Health entry with optional `metrics`.
 * @returns {Array<object>} Samples to summarize (possibly empty).
 */
function startupSamplesForEntry(entry) {
  const attempts = entry.metrics?.readiness?.healthAttempts;
  if (Array.isArray(attempts) && attempts.length > 0) {
    return attempts;
  }
  const samples = entry.metrics?.healthSamples;
  return Array.isArray(samples) ? samples : [];
}
|
||||
|
||||
/**
 * Selects the health samples recorded for one entry.
 *
 * A `healthSamples` value that is not an array is treated as absent, so
 * malformed metrics cannot make the caller's iteration throw or be iterated
 * character by character. This matches the Array.isArray guard in
 * summarizeFinalHealth.
 *
 * @param {object} entry Health entry with optional `metrics`.
 * @returns {Array<object>} Samples to summarize (possibly empty).
 */
function postReadySamplesForEntry(entry) {
  const samples = entry.metrics?.healthSamples;
  return Array.isArray(samples) ? samples : [];
}
|
||||
|
||||
/**
 * Summarizes the health data of all entries belonging to one scope.
 *
 * When any entry yields raw samples, the summary is computed from those
 * samples (each tagged with its entry's phase id). Otherwise the entries'
 * pre-computed `healthSummary` objects are combined: counts are added up,
 * `minMs` takes the minimum, `p50Ms`/`p95Ms`/`maxMs` take the maximum across
 * summaries. With neither samples nor summaries an empty summary is returned.
 *
 * @param {Array<object>} entries Entries already filtered to the scope.
 * @param {string} scope Scope label stored on the summary.
 * @param {(entry: object) => Iterable<object>} sampleSelector Extracts an entry's samples.
 * @returns {object} Summary with counts, latency figures and `slowestPhaseId`.
 */
function summarizeScopedSamples(entries, scope, sampleSelector) {
  const taggedSamples = entries.flatMap((entry) =>
    [...sampleSelector(entry)].map((sample) => ({ ...sample, phaseId: entry.phaseId }))
  );
  if (taggedSamples.length > 0) {
    return summarizeSamples(taggedSamples, scope);
  }

  const summarized = entries
    .filter((entry) => entry.metrics?.healthSummary)
    .map((entry) => ({ phaseId: entry.phaseId, summary: entry.metrics.healthSummary }));
  if (summarized.length === 0) {
    return emptyHealthSummary(scope);
  }

  // Track which phase reported the largest maxMs; the first one wins ties.
  let slowest = { maxMs: null, phaseId: null };
  for (const { phaseId, summary } of summarized) {
    const candidate = summary.maxMs;
    if (typeof candidate !== "number") {
      continue;
    }
    if (slowest.maxMs === null || candidate > slowest.maxMs) {
      slowest = { maxMs: candidate, phaseId };
    }
  }

  const pluck = (key) => summarized.map(({ summary }) => summary[key]);
  return {
    scope,
    count: sum(summarized, "count"),
    okCount: sum(summarized, "okCount"),
    failureCount: sum(summarized, "failureCount"),
    minMs: minNullable(...pluck("minMs")),
    p50Ms: maxNullable(...pluck("p50Ms")),
    p95Ms: maxNullable(...pluck("p95Ms")),
    maxMs: slowest.maxMs,
    slowestPhaseId: slowest.phaseId
  };
}
|
||||
|
||||
/**
 * Computes counts and latency figures for a list of health samples.
 *
 * Only numeric `durationMs` values contribute to the latency figures. A
 * sample counts as a failure unless its `ok` is exactly `true`.
 *
 * @param {Array<object>} samples Samples with `ok`, `durationMs` and optional `phaseId`.
 * @param {string} scope Scope label stored on the summary.
 * @returns {object} Summary with counts, min/p50/p95/max and `slowestPhaseId`.
 */
function summarizeSamples(samples, scope) {
  const isOk = (sample) => sample.ok === true;
  const sortedDurations = samples
    .map(({ durationMs }) => durationMs)
    .filter((value) => typeof value === "number")
    .sort((a, b) => a - b);

  // The first sample holding the strictly largest duration names the slowest phase.
  let peakMs = null;
  let slowestPhaseId = null;
  samples.forEach((sample) => {
    const { durationMs } = sample;
    if (typeof durationMs !== "number") {
      return;
    }
    if (peakMs === null || durationMs > peakMs) {
      peakMs = durationMs;
      slowestPhaseId = sample.phaseId ?? null;
    }
  });

  return {
    scope,
    count: samples.length,
    okCount: samples.filter(isOk).length,
    failureCount: samples.filter((sample) => !isOk(sample)).length,
    minMs: sortedDurations[0] ?? null,
    p50Ms: percentile(sortedDurations, 0.5),
    p95Ms: percentile(sortedDurations, 0.95),
    maxMs: sortedDurations[sortedDurations.length - 1] ?? null,
    slowestPhaseId
  };
}
|
||||
|
||||
/**
 * Creates the summary used when a scope has no samples and no summaries:
 * all counts are 0 and all latency fields and `slowestPhaseId` are null.
 *
 * @param {string} scope Scope label stored on the summary.
 * @returns {object} Empty summary for the scope.
 */
function emptyHealthSummary(scope) {
  const zeroed = ["count", "okCount", "failureCount"].map((key) => [key, 0]);
  const absent = ["minMs", "p50Ms", "p95Ms", "maxMs", "slowestPhaseId"].map((key) => [key, null]);
  return Object.fromEntries([["scope", scope], ...zeroed, ...absent]);
}
|
||||
|
||||
/**
 * Summarizes the health captured in a record's final metrics.
 *
 * Figures come from raw `healthSamples` when there are any, then from
 * `healthSummary`, and finally from the single `health` probe. `ok` is null
 * without metrics; otherwise it requires zero failures and, when a gateway
 * state is reported, that the state is "running".
 *
 * @param {object|null|undefined} metrics The record's final metrics.
 * @returns {object} Final-scope health summary.
 */
function summarizeFinalHealth(metrics) {
  const rawSamples = Array.isArray(metrics?.healthSamples) ? metrics.healthSamples : [];
  const sampled = rawSamples.length > 0
    ? summarizeSamples(rawSamples.map((sample) => ({ ...sample, phaseId: "final" })), "final")
    : null;
  const reported = metrics?.healthSummary;
  const probe = metrics?.health;

  const failureCount = sampled?.failureCount ?? reported?.failureCount ?? healthFailureCount([probe]);
  const maxMs = sampled?.maxMs ?? reported?.maxMs ?? probe?.durationMs ?? null;
  const p95Ms = sampled?.p95Ms ?? reported?.p95Ms ?? null;
  const gatewayState = metrics?.service?.gatewayState ?? null;

  let ok = null;
  if (metrics) {
    // An unreported gateway state does not block `ok`; a reported one must be "running".
    const gatewayAcceptable = gatewayState === null || gatewayState === "running";
    ok = gatewayAcceptable && failureCount === 0;
  }

  return {
    scope: "final",
    gatewayState,
    ok,
    healthOk: probe?.ok ?? null,
    failureCount,
    p95Ms,
    maxMs,
    slowestPhaseId: maxMs === null ? null : "final"
  };
}
|
||||
|
||||
/**
 * Finds the summary with the largest numeric `maxMs`.
 *
 * Missing summaries and summaries without a numeric `maxMs` are ignored; the
 * earliest summary wins ties.
 *
 * @param {Iterable<object|null|undefined>} summaries Scope summaries to compare.
 * @returns {{scope: string, phaseId: (string|null), durationMs: number}|null}
 *   The slowest sample descriptor, or null when nothing qualifies.
 */
function selectSlowestSample(summaries) {
  let winner = null;
  for (const candidate of summaries) {
    const measurable = Boolean(candidate) && typeof candidate.maxMs === "number";
    if (measurable && (winner === null || candidate.maxMs > winner.durationMs)) {
      winner = {
        scope: candidate.scope,
        phaseId: candidate.slowestPhaseId ?? null,
        durationMs: candidate.maxMs
      };
    }
  }
  return winner;
}
|
||||
|
||||
/**
 * Collects the unscoped health p95: the largest numeric `healthSummary.p95Ms`
 * found on any phase or on the final metrics.
 *
 * @param {object|null|undefined} record Run record.
 * @returns {number|null} Highest p95 in milliseconds, or null when none is numeric.
 */
function collectOldHealthP95(record) {
  const phaseP95s = [...(record?.phases ?? [])].map((phase) => phase.metrics?.healthSummary?.p95Ms);
  const numericP95s = [...phaseP95s, record?.finalMetrics?.healthSummary?.p95Ms]
    .filter((value) => typeof value === "number");
  if (numericP95s.length === 0) {
    return null;
  }
  return Math.max(...numericP95s);
}
|
||||
|
||||
/**
 * Counts the samples whose `ok` is exactly `false`; missing samples and
 * samples without an explicit `ok: false` are not counted.
 *
 * @param {Array<object|null|undefined>} samples Health samples.
 * @returns {number} Number of explicitly failed samples.
 */
function healthFailureCount(samples) {
  return samples.reduce(
    (failures, sample) => (sample && sample.ok === false ? failures + 1 : failures),
    0
  );
}
|
||||
|
||||
/**
 * Adds up one field across the `summary` objects of the given entries,
 * treating a missing summary or a null/undefined field as 0.
 *
 * @param {Array<object>} entries Entries carrying an optional `summary`.
 * @param {string} key Name of the summary field to total.
 * @returns {number} Total of the field.
 */
function sum(entries, key) {
  let total = 0;
  entries.forEach((entry) => {
    total = total + (entry.summary?.[key] ?? 0);
  });
  return total;
}
|
||||
|
||||
/**
 * Returns the largest of the arguments that are numbers, ignoring every
 * other value.
 *
 * @param {...unknown} values Candidate values.
 * @returns {number|null} The maximum, or null when no argument is a number.
 */
function maxNullable(...values) {
  let largest = null;
  for (const value of values) {
    if (typeof value !== "number") {
      continue;
    }
    largest = largest === null ? value : Math.max(largest, value);
  }
  return largest;
}
|
||||
|
||||
/**
 * Returns the smallest of the arguments that are numbers, ignoring every
 * other value.
 *
 * @param {...unknown} values Candidate values.
 * @returns {number|null} The minimum, or null when no argument is a number.
 */
function minNullable(...values) {
  let smallest = null;
  for (const value of values) {
    if (typeof value !== "number") {
      continue;
    }
    smallest = smallest === null ? value : Math.min(smallest, value);
  }
  return smallest;
}
|
||||
|
||||
/**
 * Nearest-rank percentile over an ascending-sorted list.
 *
 * The rank is `ceil(length * percentileValue) - 1`, clamped to the valid
 * index range.
 *
 * @param {Array<number>} values Values sorted in ascending order.
 * @param {number} percentileValue Fraction between 0 and 1 (e.g. 0.95).
 * @returns {number|null} The value at the computed rank, or null for an empty list.
 */
function percentile(values, percentileValue) {
  const size = values.length;
  if (size === 0) {
    return null;
  }
  const rank = Math.ceil(size * percentileValue) - 1;
  if (rank < 0) {
    return values[0];
  }
  if (rank > size - 1) {
    return values[size - 1];
  }
  return values[rank];
}
|
||||
@ -20,6 +20,8 @@ export const PERFORMANCE_METRICS = [
|
||||
{ id: "coldPreProviderMs", title: "Cold Pre-Provider", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
|
||||
{ id: "warmPreProviderMs", title: "Warm Pre-Provider", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
|
||||
{ id: "healthP95Ms", title: "Health p95", unit: "ms", regressionKey: "startupRegressionPercent" },
|
||||
{ id: "startupHealthP95Ms", title: "Startup Health p95", unit: "ms", regressionKey: "startupRegressionPercent" },
|
||||
{ id: "postReadyHealthP95Ms", title: "Post-Ready Health p95", unit: "ms", regressionKey: "startupRegressionPercent" },
|
||||
{ id: "runtimeDepsStagingMs", title: "Runtime Deps Staging", unit: "ms", regressionKey: "startupRegressionPercent" }
|
||||
];
|
||||
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
import { scenariosDir } from "../paths.mjs";
|
||||
import { assertNoShapeErrors, loadJsonRegistry, requireArray, requireKebabId, requireObject, requireString } from "./validate.mjs";
|
||||
|
||||
export const HEALTH_SCOPES = ["readiness", "startup-sample", "post-ready", "final", "none"];
|
||||
|
||||
export async function loadScenarios(selectedId) {
|
||||
return loadJsonRegistry({
|
||||
dir: scenariosDir,
|
||||
@ -108,6 +110,7 @@ function validatePhases(phases, errors) {
|
||||
requireKebabId(phase, "id", errors, prefix);
|
||||
requireString(phase, "title", errors, prefix);
|
||||
requireString(phase, "intent", errors, prefix);
|
||||
requireString(phase, "healthScope", errors, prefix);
|
||||
requireArray(phase, "commands", errors, prefix);
|
||||
requireArray(phase, "evidence", errors, prefix);
|
||||
|
||||
@ -120,6 +123,9 @@ function validatePhases(phases, errors) {
|
||||
|
||||
validateStringArray(phase.commands, `${prefix}.commands`, errors);
|
||||
validateStringArray(phase.evidence, `${prefix}.evidence`, errors);
|
||||
if (typeof phase.healthScope === "string" && !HEALTH_SCOPES.includes(phase.healthScope)) {
|
||||
errors.push(`${prefix}.healthScope must be one of ${HEALTH_SCOPES.join(", ")}`);
|
||||
}
|
||||
if (phase.expectedAgentFailure !== undefined && typeof phase.expectedAgentFailure !== "boolean") {
|
||||
errors.push(`${prefix}.expectedAgentFailure must be a boolean when set`);
|
||||
}
|
||||
|
||||
@ -21,6 +21,11 @@ const defaultThresholds = {
|
||||
timeToHealthReadyMs: 5000,
|
||||
readinessFailures: 0,
|
||||
healthP95Ms: 1000,
|
||||
startupHealthFailures: 0,
|
||||
postReadyHealthFailures: 0,
|
||||
finalHealthFailures: 0,
|
||||
startupHealthP95Ms: 1000,
|
||||
postReadyHealthP95Ms: 1000,
|
||||
gatewayRestartCount: 0,
|
||||
providerTimeoutMentions: 0,
|
||||
eventLoopDelayMentions: 0,
|
||||
@ -317,6 +322,8 @@ function diagnosticRecordSummary(record) {
|
||||
providerFinalMs: measurements.agentProviderFinalMs ?? measurements.coldProviderFinalMs ?? null,
|
||||
runtimeDepsStagingMs: measurements.runtimeDepsStagingMs ?? null,
|
||||
timeToHealthReadyMs: measurements.timeToHealthReadyMs ?? null,
|
||||
startupHealthP95Ms: measurements.startupHealthP95Ms ?? null,
|
||||
postReadyHealthP95Ms: measurements.postReadyHealthP95Ms ?? null,
|
||||
peakRssMb: measurements.peakRssMb ?? null
|
||||
};
|
||||
}
|
||||
@ -406,6 +413,11 @@ function metricDeltas(baseline, current) {
|
||||
"timeToHealthReadyMs",
|
||||
"healthP95Ms",
|
||||
"healthFailures",
|
||||
"startupHealthP95Ms",
|
||||
"postReadyHealthP95Ms",
|
||||
"startupHealthFailures",
|
||||
"postReadyHealthFailures",
|
||||
"finalHealthFailures",
|
||||
"readinessFailures",
|
||||
"missingDependencyErrors",
|
||||
"pluginLoadFailures",
|
||||
|
||||
@ -114,8 +114,7 @@ export function renderMarkdownReport(report) {
|
||||
lines.push(`- TCP connect max: ${record.measurements.tcpConnectMaxMs ?? "unknown"} ms`);
|
||||
lines.push(`- Missing dependency errors: ${record.measurements.missingDependencyErrors ?? "unknown"}`);
|
||||
lines.push(`- Final gateway state: ${record.measurements.finalGatewayState ?? "unknown"}`);
|
||||
lines.push(`- Health failures: ${record.measurements.healthFailures ?? "unknown"}`);
|
||||
lines.push(`- Health p95: ${record.measurements.healthP95Ms ?? "unknown"} ms`);
|
||||
lines.push(...formatHealthMeasurementLines(record.measurements));
|
||||
if (record.measurements.soakEvidence?.available) {
|
||||
lines.push(`- Soak trend: duration ${record.measurements.soakDurationMs ?? "unknown"} ms; iterations ${record.measurements.soakIterations ?? "unknown"}; command p95 ${record.measurements.soakCommandP95Ms ?? "unknown"} ms; health p95 ${record.measurements.soakHealthP95Ms ?? "unknown"} ms; RSS growth ${record.measurements.rssGrowthMb ?? "unknown"} MB; gateway RSS growth ${record.measurements.gatewayRssGrowthMb ?? "unknown"} MB`);
|
||||
}
|
||||
@ -499,6 +498,26 @@ function formatMetrics(metrics) {
|
||||
return lines.length > 0 ? lines : ["- unavailable"];
|
||||
}
|
||||
|
||||
/**
 * Renders the health bullet lines of a record's markdown report.
 *
 * Flat measurement fields are preferred and the scoped `health` object is
 * the fallback; values missing from both render as "unknown". When no
 * slowest sample is available but a `healthP95Ms` value exists, that value
 * is printed as the compatibility p95 instead.
 *
 * @param {object} measurements Record measurements, optionally with `health`.
 * @returns {Array<string>} Markdown bullet lines.
 */
function formatHealthMeasurementLines(measurements) {
  const { health } = measurements;
  const firstKnown = (...candidates) =>
    candidates.find((value) => value !== null && value !== undefined) ?? "unknown";

  const lines = [];
  lines.push(`- Health failures: ${firstKnown(measurements.healthFailures)}`);
  lines.push(`- Startup health p95: ${firstKnown(measurements.startupHealthP95Ms, health?.startupSamples?.p95Ms)} ms`);
  lines.push(`- Post-ready liveness p95: ${firstKnown(measurements.postReadyHealthP95Ms, health?.postReadySamples?.p95Ms)} ms`);
  lines.push(`- Final health failures: ${firstKnown(measurements.finalHealthFailures, health?.final?.failureCount)}`);

  if (health?.final) {
    const { healthOk, gatewayState } = health.final;
    let healthState = "not-ok";
    if (healthOk === null) {
      healthState = "unknown";
    } else if (healthOk) {
      healthState = "ok";
    }
    lines.push(`- Final health state: gateway ${gatewayState ?? "unknown"}; health ${healthState}`);
  }

  const slowest = health?.slowestSample;
  if (slowest) {
    lines.push(`- Slowest health sample: ${slowest.scope} ${slowest.phaseId ?? "unknown"} ${slowest.durationMs} ms`);
  } else if (measurements.healthP95Ms != null) {
    lines.push(`- Compatibility health p95: ${measurements.healthP95Ms} ms`);
  }
  return lines;
}
|
||||
|
||||
function formatRecordFailureCards(records = []) {
|
||||
const cards = records
|
||||
.filter((record) => !["PASS", "DRY-RUN"].includes(record.status))
|
||||
@ -670,7 +689,14 @@ function summarizeMeasurements(measurements) {
|
||||
timeToHealthReadyMs: measurements.timeToHealthReadyMs ?? null,
|
||||
readinessClassification: measurements.readinessClassification ?? null,
|
||||
readinessClassificationReason: measurements.readinessClassificationReason ?? null,
|
||||
health: measurements.health ?? null,
|
||||
healthFailures: measurements.healthFailures ?? null,
|
||||
healthP95Ms: measurements.healthP95Ms ?? null,
|
||||
startupHealthP95Ms: measurements.startupHealthP95Ms ?? null,
|
||||
postReadyHealthP95Ms: measurements.postReadyHealthP95Ms ?? null,
|
||||
startupHealthFailures: measurements.startupHealthFailures ?? null,
|
||||
postReadyHealthFailures: measurements.postReadyHealthFailures ?? null,
|
||||
finalHealthFailures: measurements.finalHealthFailures ?? null,
|
||||
missingDependencyErrors: measurements.missingDependencyErrors ?? null,
|
||||
pluginLoadFailures: measurements.pluginLoadFailures ?? null,
|
||||
officialPluginEvidence: measurements.officialPluginEvidence ?? null,
|
||||
@ -1150,6 +1176,7 @@ function compactRolePeaks(measurements) {
|
||||
function pushMeasurementBrief(lines, measurements, { compact }) {
|
||||
lines.push("Measurements:");
|
||||
lines.push(`- startup: listening ${valueMs(measurements.timeToListeningMs)}; health ${valueMs(measurements.timeToHealthReadyMs)}; readiness ${measurements.readinessClassification ?? "unknown"}; gateway ${measurements.finalGatewayState ?? "unknown"}; restarts ${measurements.gatewayRestartCount ?? "unknown"}`);
|
||||
lines.push(`- health: startup p95 ${valueMs(measurements.startupHealthP95Ms)}; post-ready p95 ${valueMs(measurements.postReadyHealthP95Ms)}; failures ${measurements.healthFailures ?? "unknown"}; final failures ${measurements.finalHealthFailures ?? "unknown"}${healthSlowestText(measurements)}`);
|
||||
lines.push(`- resources: peak RSS ${valueMb(measurements.peakRssMb)}; max CPU ${valuePercent(measurements.cpuPercentMax)}; samples ${measurements.resourceSampleCount ?? "unknown"}; roles ${rolePeakText(measurements)}`);
|
||||
lines.push(`- agent: turn ${valueMs(measurements.agentTurnMs, "not-run")}; cold/warm ${valueMs(measurements.coldAgentTurnMs)}/${valueMs(measurements.warmAgentTurnMs)}; cold-warm delta ${valueMs(measurements.agentColdWarmDeltaMs)}; pre-provider ${valueMs(measurements.agentPreProviderMs)}; provider ${valueMs(measurements.agentProviderFinalMs)}; cleanup ${valueMs(measurements.agentCleanupMaxMs)}; diagnosis ${measurements.agentLatencyDiagnosis?.kind ?? "unknown"}; leaks ${measurements.agentProcessLeakCount ?? "unknown"}`);
|
||||
lines.push(`- plugins/runtime: missing deps ${measurements.missingDependencyErrors ?? "unknown"}; plugin failures ${measurements.pluginLoadFailures ?? "unknown"}; runtime deps ${valueMs(measurements.runtimeDepsStagingMs)}${runtimeDepsPluginText(measurements)}; warm restages ${measurements.warmRuntimeDepsRestageCount ?? "unknown"}; warm reuse ${measurements.runtimeDepsWarmReuseOk ?? "unknown"}`);
|
||||
@ -1223,6 +1250,14 @@ function valuePercent(value) {
|
||||
return value === null || value === undefined ? "unknown" : `${value}%`;
|
||||
}
|
||||
|
||||
/**
 * Builds the optional "; slowest <scope>/<phase> <duration>" suffix that is
 * appended to the health line of the measurement brief.
 *
 * @param {object} measurements - Record measurements; only
 *   `measurements.health.slowestSample` is read.
 * @returns {string} An empty string when no slowest sample is present,
 *   otherwise the formatted suffix beginning with "; ".
 */
function healthSlowestText(measurements) {
  const slowest = measurements.health?.slowestSample;
  if (!slowest) {
    return "";
  }
  // Both identifiers fall back to "unknown" so a partially populated sample
  // never prints the literal text "undefined" in the brief.
  const scope = slowest.scope ?? "unknown";
  const phaseId = slowest.phaseId ?? "unknown";
  return `; slowest ${scope}/${phaseId} ${valueMs(slowest.durationMs)}`;
}
|
||||
|
||||
function buildFixerPrompt({ report, primaryBlocker, why, measurements, evidence, likelyOwner }) {
|
||||
const parts = [
|
||||
`Investigate OpenClaw release gate failure ${primaryBlocker}.`,
|
||||
|
||||
@ -142,6 +142,7 @@ export async function executeScenario(scenario, context) {
|
||||
id: phase.id,
|
||||
title: phase.title,
|
||||
intent: phase.intent,
|
||||
healthScope: phase.healthScope,
|
||||
expectedAgentFailure: phase.expectedAgentFailure === true,
|
||||
commands,
|
||||
evidence: phase.evidence ?? [],
|
||||
@ -347,6 +348,7 @@ function buildPlannedPhases(scenario, context, envName, artifactDir, authPolicy)
|
||||
id: phase.id,
|
||||
title: phase.title,
|
||||
intent: phase.intent,
|
||||
healthScope: phase.healthScope,
|
||||
expectedAgentFailure: phase.expectedAgentFailure === true,
|
||||
commands: materializeScenarioPhaseCommands(phase, context, envName, artifactDir),
|
||||
evidence: phase.evidence ?? []
|
||||
|
||||
@ -356,6 +356,8 @@ export async function runSelfCheck(flags = {}) {
|
||||
checks.push(markdownFailureCardsCheck());
|
||||
checks.push(reportRecommendedNextScenarioCheck());
|
||||
checks.push(readinessClassificationCheck());
|
||||
checks.push(healthReadinessModelCheck());
|
||||
checks.push(oldHealthReportCompatibilityCheck());
|
||||
checks.push(await resourceRoleAttributionCheck(tmp));
|
||||
checks.push(await resourceRootCommandRoleBoundaryCheck());
|
||||
checks.push(await resourceRolePollutionCheck());
|
||||
@ -365,6 +367,7 @@ export async function runSelfCheck(flags = {}) {
|
||||
checks.push(await cleanupRetryCheck(tmp));
|
||||
checks.push(stateRegistryValidationCheck());
|
||||
checks.push(scenarioCloneFirstValidationCheck());
|
||||
checks.push(scenarioHealthScopeValidationCheck());
|
||||
checks.push(scenarioStateCompatibilityCheck());
|
||||
checks.push(await cpuProfileParserCheck());
|
||||
checks.push(await heapProfileParserCheck());
|
||||
@ -3880,6 +3883,7 @@ function readinessClassificationCheck() {
|
||||
phases: [
|
||||
{
|
||||
id: "provision",
|
||||
healthScope: "readiness",
|
||||
results: [],
|
||||
metrics: {
|
||||
readiness: {
|
||||
@ -3951,6 +3955,186 @@ function readinessClassificationCheck() {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Self-check for the scoped health/readiness model.
 *
 * Feeds a synthetic two-phase record (a "readiness" cold start followed by a
 * slow "post-ready" phase) through `evaluateRecord` and asserts the derived
 * measurements and violations.
 *
 * @returns {{id: string, status: string, command: string, durationMs: number, message?: string}}
 *   A PASS result when every assertion holds, otherwise a FAIL result carrying
 *   the message of the error that was thrown.
 */
function healthReadinessModelCheck() {
  // Shared shape for both outcomes; `message` is only attached on failure.
  const outcome = (status, extra = {}) => ({
    id: "health-readiness-model",
    status,
    command: "evaluate synthetic scoped health record",
    durationMs: 0,
    ...extra
  });

  try {
    // Summary for a scope that holds exactly one successful sample.
    const singleSampleSummary = (ms) => ({
      count: 1,
      okCount: 1,
      failureCount: 0,
      minMs: ms,
      p50Ms: ms,
      p95Ms: ms,
      maxMs: ms
    });

    const coldStartPhase = {
      id: "cold-start",
      healthScope: "readiness",
      results: [],
      metrics: {
        readiness: {
          deadlineMs: 90000,
          thresholdMs: 30000,
          ready: true,
          listeningReady: true,
          listeningReadyAtMs: 120,
          healthReadyAtMs: 200,
          attempts: 2,
          classification: {
            state: "ready",
            severity: "pass",
            reason: "gateway became healthy within the readiness threshold"
          },
          healthAttempts: [
            { ok: false, durationMs: 25 },
            { ok: true, durationMs: 30 }
          ]
        },
        healthSamples: [{ ok: true, durationMs: 40 }],
        healthSummary: singleSampleSummary(40)
      }
    };

    // The 1500 ms sample exceeds the 1000 ms post-ready threshold set below.
    const apiLatencyPhase = {
      id: "api-latency",
      healthScope: "post-ready",
      results: [],
      metrics: {
        healthSamples: [
          { ok: true, durationMs: 10 },
          { ok: true, durationMs: 1500 }
        ],
        healthSummary: {
          count: 2,
          okCount: 2,
          failureCount: 0,
          minMs: 10,
          p50Ms: 10,
          p95Ms: 1500,
          maxMs: 1500
        }
      }
    };

    const record = {
      status: "PASS",
      phases: [coldStartPhase, apiLatencyPhase],
      finalMetrics: {
        service: { gatewayState: "running" },
        healthSamples: [{ ok: true, durationMs: 50 }],
        healthSummary: singleSampleSummary(50),
        health: { ok: true, durationMs: 50 }
      }
    };

    const scenario = {
      phases: [
        { id: "cold-start", healthScope: "readiness" },
        { id: "api-latency", healthScope: "post-ready" }
      ],
      thresholds: {
        gatewayReadyMs: 30000,
        postReadyHealthP95Ms: 1000
      }
    };

    evaluateRecord(record, scenario);

    const derived = record.measurements;
    const hasViolationFor = (metric) =>
      record.violations.some((violation) => violation.metric === metric);

    assertEqual(record.status, "FAIL", "post-ready health threshold fails");
    assertEqual(derived.health.schemaVersion, "kova.health.v1", "health schema");
    assertEqual(derived.timeToHealthReadyMs, 200, "readiness health ready derived");
    assertEqual(derived.startupHealthP95Ms, 30, "startup health p95 derived from readiness attempts");
    assertEqual(derived.postReadyHealthP95Ms, 1500, "post-ready health p95 derived from post-ready samples");
    assertEqual(derived.healthP95Ms, 1500, "compatibility health p95 derived");
    assertEqual(derived.health.slowestSample.scope, "post-ready", "slowest health scope");
    assertEqual(hasViolationFor("postReadyHealthP95Ms"), true, "post-ready health violation");
    assertEqual(
      hasViolationFor("timeToHealthReadyMs"),
      false,
      "post-ready liveness does not masquerade as readiness"
    );

    return outcome("PASS");
  } catch (error) {
    return outcome("FAIL", { message: error.message });
  }
}
|
||||
|
||||
/**
 * Self-check that a report written before the scoped health model (no
 * `measurements.health` object, only the flat compatibility fields) is still
 * accepted by the summary, markdown and comparison code paths.
 *
 * @returns {{id: string, status: string, command: string, durationMs: number, message?: string}}
 *   A PASS result when every assertion holds, otherwise a FAIL result carrying
 *   the message of the error that was thrown.
 */
function oldHealthReportCompatibilityCheck() {
  // Shared shape for both outcomes; `message` is only attached on failure.
  const outcome = (status, extra = {}) => ({
    id: "old-health-report-compatibility",
    status,
    command: "summarize and compare legacy health report shape",
    durationMs: 0,
    ...extra
  });

  try {
    // Flat legacy measurements: deliberately no nested `health` object.
    const legacyMeasurements = {
      peakRssMb: 100,
      cpuPercentMax: 10,
      timeToListeningMs: 100,
      timeToHealthReadyMs: 200,
      readinessClassification: "ready",
      healthFailures: 0,
      healthP95Ms: 900,
      finalGatewayState: "running"
    };

    const legacyRecord = {
      scenario: "fresh-install",
      title: "Fresh Install",
      status: "PASS",
      target: "runtime:stable",
      state: { id: "fresh", title: "Fresh" },
      envName: "kova-old-health",
      likelyOwner: "OpenClaw",
      objective: "Old report compatibility.",
      measurements: legacyMeasurements,
      phases: [],
      violations: []
    };

    const report = {
      schemaVersion: "kova.report.v1",
      generatedAt: "2026-05-05T00:00:00.000Z",
      runId: "old-health-report",
      mode: "execution",
      target: "runtime:stable",
      platform: { os: "darwin", release: "25.0.0", arch: "arm64", node: process.version },
      summary: { total: 1, statuses: { PASS: 1 } },
      records: [legacyRecord]
    };

    const summary = renderReportSummary(report, { structured: true });
    const summarized = summary.scenarios[0].measurements;
    assertEqual(summarized.health, null, "old report health object absent");
    assertEqual(summarized.healthP95Ms, 900, "old report health p95 summarized");

    const markdown = renderMarkdownReport(report);
    const mentionsCompatibilityP95 = markdown.includes("Compatibility health p95: 900 ms");
    assertEqual(mentionsCompatibilityP95, true, "old report markdown compatibility p95");

    // Comparing the report against itself must stay ok even with a zero threshold.
    const comparison = compareReports(report, report, { thresholds: { healthP95Ms: 0 } });
    assertEqual(comparison.ok, true, "old report compare remains ok");

    return outcome("PASS");
  } catch (error) {
    return outcome("FAIL", { message: error.message });
  }
}
|
||||
|
||||
async function resourceRoleAttributionCheck(tmp) {
|
||||
const command = "node -e 'setTimeout(() => {}, 650)'";
|
||||
const artifactPath = join(tmp, "resource-role-attribution.jsonl");
|
||||
@ -4739,6 +4923,7 @@ function scenarioCloneFirstValidationCheck() {
|
||||
id: "status",
|
||||
title: "Status",
|
||||
intent: "Unsafe durable source access.",
|
||||
healthScope: "post-ready",
|
||||
commands: ["ocm service status {sourceEnv} --json"],
|
||||
evidence: ["status"]
|
||||
}]
|
||||
@ -4762,6 +4947,7 @@ function scenarioCloneFirstValidationCheck() {
|
||||
id: "clone",
|
||||
title: "Clone",
|
||||
intent: "Clone source.",
|
||||
healthScope: "none",
|
||||
commands: ["ocm env clone {sourceEnv} {env} --json", "ocm logs {sourceEnv} --tail 20"],
|
||||
evidence: ["clone"]
|
||||
}]
|
||||
@ -4783,12 +4969,14 @@ function scenarioCloneFirstValidationCheck() {
|
||||
id: "clone",
|
||||
title: "Clone",
|
||||
intent: "Clone source.",
|
||||
healthScope: "none",
|
||||
commands: ["ocm env clone {sourceEnv} {env} --json"],
|
||||
evidence: ["clone"]
|
||||
}, {
|
||||
id: "upgrade",
|
||||
title: "Upgrade",
|
||||
intent: "Upgrade disposable clone.",
|
||||
healthScope: "readiness",
|
||||
commands: ["ocm upgrade {env} --channel beta --json"],
|
||||
evidence: ["upgrade"]
|
||||
}]
|
||||
@ -4811,6 +4999,72 @@ function scenarioCloneFirstValidationCheck() {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Self-check for the scenario `healthScope` contract: a phase without a
 * `healthScope`, and a phase whose `healthScope` is the value "startup", must
 * both be rejected by `validateScenarioShape` with the expected error text.
 *
 * @returns {{id: string, status: string, command: string, durationMs: number, message?: string}}
 *   A PASS result when both rejections are observed, otherwise a FAIL result
 *   carrying the message of the error that was thrown.
 */
function scenarioHealthScopeValidationCheck() {
  // Shared shape for both outcomes; `message` is only attached on failure.
  const outcome = (status, extra = {}) => ({
    id: "scenario-health-scope-validation",
    status,
    command: "validate scenario health scope contracts",
    durationMs: 0,
    ...extra
  });

  // Wraps a single phase in the scenario envelope passed to validateScenarioShape.
  const scenarioWith = (id, title, objective, phase) => ({
    id,
    surface: "fresh-install",
    title,
    objective,
    tags: ["fresh-user"],
    proves: ["baseline"],
    thresholds: {},
    phases: [phase]
  });

  // True only when validation throws AND the error message matches `pattern`.
  const rejectedWith = (scenario, fileName, pattern) => {
    try {
      validateScenarioShape(scenario, fileName);
    } catch (error) {
      return pattern.test(error.message);
    }
    return false;
  };

  try {
    const withoutScope = scenarioWith(
      "missing-health-scope",
      "Missing Health Scope",
      "Scenario phase without an explicit health scope.",
      {
        id: "start",
        title: "Start",
        intent: "Start gateway.",
        commands: ["ocm start {env} {startSelector} --json"],
        evidence: ["start"]
      }
    );
    assertEqual(
      rejectedWith(
        withoutScope,
        "missing-health-scope.json",
        /phases\[0\]\.healthScope must be a non-empty string/
      ),
      true,
      "missing healthScope rejected"
    );

    const withInvalidScope = scenarioWith(
      "invalid-health-scope",
      "Invalid Health Scope",
      "Scenario phase with an invalid health scope.",
      {
        id: "start",
        title: "Start",
        intent: "Start gateway.",
        healthScope: "startup",
        commands: ["ocm start {env} {startSelector} --json"],
        evidence: ["start"]
      }
    );
    assertEqual(
      rejectedWith(withInvalidScope, "invalid-health-scope.json", /healthScope must be one of/),
      true,
      "invalid healthScope rejected"
    );

    return outcome("PASS");
  } catch (error) {
    return outcome("FAIL", { message: error.message });
  }
}
|
||||
|
||||
function scenarioStateCompatibilityCheck() {
|
||||
try {
|
||||
let rejected = false;
|
||||
|
||||
@ -18,8 +18,8 @@
|
||||
"preProviderMs": 10000,
|
||||
"providerFinalMs": 3000,
|
||||
"agentCleanupMs": 5000,
|
||||
"healthP95Ms": 1000,
|
||||
"peakRssMb": 900
|
||||
"peakRssMb": 900,
|
||||
"postReadyHealthP95Ms": 1000
|
||||
},
|
||||
"roleThresholds": {
|
||||
"gateway": {
|
||||
@ -81,7 +81,7 @@
|
||||
"warmPreProviderMs",
|
||||
"agentPreProviderP95Ms",
|
||||
"agentCleanupMaxMs",
|
||||
"healthP95Ms",
|
||||
"postReadyHealthP95Ms",
|
||||
"peakRssMb",
|
||||
"providerTimeoutMentions",
|
||||
"pluginLoadFailures"
|
||||
|
||||
@ -14,8 +14,8 @@
|
||||
"agentTurnMs": 45000,
|
||||
"preProviderMs": 10000,
|
||||
"providerFinalMs": 3000,
|
||||
"healthP95Ms": 1000,
|
||||
"peakRssMb": 900
|
||||
"peakRssMb": 900,
|
||||
"postReadyHealthP95Ms": 1000
|
||||
},
|
||||
"roleThresholds": {
|
||||
"gateway": {
|
||||
@ -63,7 +63,7 @@
|
||||
"agentTurnP95Ms",
|
||||
"agentTurnMaxMs",
|
||||
"coldPreProviderMs",
|
||||
"healthP95Ms",
|
||||
"postReadyHealthP95Ms",
|
||||
"peakRssMb",
|
||||
"pluginLoadFailures"
|
||||
]
|
||||
|
||||
@ -14,8 +14,8 @@
|
||||
"agentTurnMs": 45000,
|
||||
"preProviderMs": 10000,
|
||||
"providerFinalMs": 3000,
|
||||
"healthP95Ms": 1000,
|
||||
"peakRssMb": 900
|
||||
"peakRssMb": 900,
|
||||
"postReadyHealthP95Ms": 1000
|
||||
},
|
||||
"roleThresholds": {
|
||||
"gateway": {
|
||||
@ -61,7 +61,7 @@
|
||||
"agentTurnMs",
|
||||
"agentTurnP95Ms",
|
||||
"coldPreProviderMs",
|
||||
"healthP95Ms",
|
||||
"postReadyHealthP95Ms",
|
||||
"peakRssMb",
|
||||
"pluginLoadFailures"
|
||||
]
|
||||
|
||||
@ -12,8 +12,8 @@
|
||||
"thresholds": {
|
||||
"coldReadyMs": 30000,
|
||||
"warmReadyMs": 15000,
|
||||
"healthP95Ms": 1000,
|
||||
"peakRssMb": 900
|
||||
"peakRssMb": 900,
|
||||
"postReadyHealthP95Ms": 1000
|
||||
},
|
||||
"roleThresholds": {
|
||||
"gateway": {
|
||||
@ -60,7 +60,7 @@
|
||||
"metrics": [
|
||||
"coldReadyMs",
|
||||
"warmReadyMs",
|
||||
"healthP95Ms",
|
||||
"postReadyHealthP95Ms",
|
||||
"peakRssMb",
|
||||
"eventLoopMaxMs"
|
||||
]
|
||||
|
||||
@ -14,8 +14,8 @@
|
||||
"agentTurnMs": 45000,
|
||||
"preProviderMs": 10000,
|
||||
"providerFinalMs": 3000,
|
||||
"healthP95Ms": 1000,
|
||||
"peakRssMb": 900
|
||||
"peakRssMb": 900,
|
||||
"postReadyHealthP95Ms": 1000
|
||||
},
|
||||
"roleThresholds": {
|
||||
"gateway": {
|
||||
@ -62,7 +62,7 @@
|
||||
"agentTurnMs",
|
||||
"agentTurnP95Ms",
|
||||
"coldPreProviderMs",
|
||||
"healthP95Ms",
|
||||
"postReadyHealthP95Ms",
|
||||
"peakRssMb",
|
||||
"pluginLoadFailures"
|
||||
]
|
||||
|
||||
@ -19,7 +19,7 @@
|
||||
"rssGrowthMb": 300,
|
||||
"gatewayRssGrowthMb": 300,
|
||||
"soakHealthP95Ms": 1000,
|
||||
"healthP95Ms": 1000
|
||||
"postReadyHealthP95Ms": 1000
|
||||
},
|
||||
"roleThresholds": {
|
||||
"gateway": {
|
||||
|
||||
@ -14,8 +14,8 @@
|
||||
"agentTurnMs": 45000,
|
||||
"preProviderMs": 10000,
|
||||
"providerFinalMs": 3000,
|
||||
"healthP95Ms": 1000,
|
||||
"peakRssMb": 900
|
||||
"peakRssMb": 900,
|
||||
"postReadyHealthP95Ms": 1000
|
||||
},
|
||||
"roleThresholds": {
|
||||
"gateway": {
|
||||
@ -61,7 +61,7 @@
|
||||
"agentTurnMs",
|
||||
"agentTurnP95Ms",
|
||||
"coldPreProviderMs",
|
||||
"healthP95Ms",
|
||||
"postReadyHealthP95Ms",
|
||||
"peakRssMb",
|
||||
"pluginLoadFailures"
|
||||
]
|
||||
|
||||
@ -18,9 +18,9 @@
|
||||
"modelsListMs": 20000,
|
||||
"soakCommandP95Ms": 12000,
|
||||
"soakHealthP95Ms": 1000,
|
||||
"healthP95Ms": 1000,
|
||||
"peakRssMb": 1000,
|
||||
"eventLoopMaxMs": 500
|
||||
"eventLoopMaxMs": 500,
|
||||
"postReadyHealthP95Ms": 1000
|
||||
},
|
||||
"roleThresholds": {
|
||||
"gateway": {
|
||||
@ -74,7 +74,7 @@
|
||||
"soakCommandP95Ms",
|
||||
"soakHealthP95Ms",
|
||||
"peakRssMb",
|
||||
"healthP95Ms",
|
||||
"postReadyHealthP95Ms",
|
||||
"eventLoopMaxMs"
|
||||
]
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user