fix: narrow dashboard health probe

This commit is contained in:
Shakker 2026-05-07 08:29:54 +01:00
parent 740e957974
commit fe3d40552f
No known key found for this signature in database
3 changed files with 18 additions and 7 deletions

View File

@ -96,11 +96,11 @@
"title": "Post-Dashboard Gateway Health",
"intent": "Verify the cloned gateway remains responsive after the dashboard-style turn and collect logs for embedded-run/liveness evidence.",
"commands": [
"ocm @{env} -- status",
"ocm @{env} -- gateway status --json --require-rpc",
"ocm logs {env} --tail 300 --raw"
],
"evidence": [
"gateway status",
"gateway status probe",
"embedded-run traces",
"liveness warnings",
"plugin errors",

View File

@ -104,11 +104,11 @@
"title": "Post-Dashboard Gateway Health",
"intent": "Verify the gateway remains responsive after dashboard-style cold and warm message turns.",
"commands": [
"ocm @{env} -- status",
"ocm @{env} -- gateway status --json --require-rpc",
"ocm logs {env} --tail 300 --raw"
],
"evidence": [
"gateway status",
"gateway status probe",
"provider logs",
"plugin errors",
"memory after dashboard turn"

View File

@ -127,14 +127,14 @@ export function evaluateRecord(record, scenario, options = {}) {
const coldReadyMs = maxDurationWhere(allResults, (command) => command.startsWith("ocm start "));
const warmReadyMs = maxDurationWhere(allResults, (command) => command.startsWith("ocm service restart "));
const upgradeMs = maxDurationWhere(allResults, (command) => command.startsWith("ocm upgrade "));
const statusMs = maxDurationWhere(allResults, (command) => command.includes(" -- status"));
const statusMs = maxDurationWhere(allResults, isPostAgentStatusCommand);
const pluginsListMs = maxDurationWhere(allResults, (command) => command.includes(" -- plugins list"));
const pluginInstallMs = maxDurationWhere(allResults, (command) => command.includes("run-official-plugin-install.mjs") || command.includes(" -- plugins install"));
const modelsListMs = maxDurationWhere(allResults, (command) => command.includes(" -- models list"));
const rssGrowthMb = maxNullable(resourceSummary.maxTotalRssGrowthMb);
const gatewayRssGrowthMb = maxNullable(resourceSummary.maxGatewayRssGrowthMb);
checkDuration(violations, allResults, "statusMs", thresholds.statusMs, (command) => command.includes(" -- status"));
checkDuration(violations, allResults, "statusMs", thresholds.statusMs, isPostAgentStatusCommand);
checkDuration(violations, allResults, "pluginsListMs", thresholds.pluginsListMs, (command) => command.includes(" -- plugins list"));
checkDuration(violations, allResults, "pluginUpdateDryRunMs", thresholds.pluginUpdateDryRunMs, (command) =>
command.includes(" -- plugins update") && command.includes("--dry-run")
@ -1263,7 +1263,9 @@ function evaluateAgentFailureContainment({ turns, record, thresholds, gatewayExp
? thresholds.agentContainmentHealthFailures
: (typeof thresholds.providerFailureHealthFailures === "number" ? thresholds.providerFailureHealthFailures : 0);
const finalGatewayState = record.finalMetrics?.service?.gatewayState ?? null;
const statusCommands = collectResults(record).filter((result) => /\s--\sstatus\b|@\S+\s+--\s+status\b/.test(result.command) || result.command.includes(" -- status"));
const statusCommands = collectResults(record).filter((result) =>
isPostAgentStatusCommand(result.command)
);
const statusWorks = statusCommands.length === 0 ? null : statusCommands.some((result) => result.status === 0 && result.timedOut !== true);
return {
@ -1284,6 +1286,15 @@ function evaluateAgentFailureContainment({ turns, record, thresholds, gatewayExp
};
}
/**
 * Tell whether a command line is a status probe issued after the `--`
 * separator, in either the bare `-- status` form or the `-- gateway status`
 * form.
 *
 * @param {string} command - Full command line text to inspect.
 * @returns {boolean} `true` when any of the status patterns matches.
 */
function isPostAgentStatusCommand(command) {
  // Bare status: exactly one whitespace char around `--`, or an `@env`
  // token followed by flexible whitespace.
  const bareStatusPattern = /\s--\sstatus\b|@\S+\s+--\s+status\b/;
  if (bareStatusPattern.test(command)) {
    return true;
  }

  // Plain substring check; unlike the regexes it has no trailing word
  // boundary, so it also accepts longer words that begin with "status".
  if (command.includes(" -- status")) {
    return true;
  }

  // Gateway-scoped status, with or without a preceding `@env` token.
  const gatewayStatusPattern = /\s--\s+gateway\s+status\b/;
  const envGatewayStatusPattern = /@\S+\s+--\s+gateway\s+status\b/;
  return gatewayStatusPattern.test(command) || envGatewayStatusPattern.test(command);
}
function checkAgentFailureContainment(violations, containment) {
if (containment.processLeaksOk !== true) {
const first = containment.leakedProcesses[0];