fix: avoid false readiness and live provider failures

This commit is contained in:
Shakker 2026-05-01 17:51:45 +01:00
parent 3b805dbc3a
commit 67a79b376c
No known key found for this signature in database
2 changed files with 62 additions and 1 deletions

View File

@ -1496,6 +1496,14 @@ function diagnoseAgentLatency({ coldAgentTurn, warmAgentTurn, providerTurn, thre
likelyOwner: "agent-runtime/auth"
};
}
if (authMode === "live") {
return {
kind: "live-provider-timing-unavailable",
severity: "info",
summary: "Live provider request timing was not captured; use OpenClaw timeline spans or a deterministic mock provider lane for provider boundary attribution.",
likelyOwner: "Kova/OpenClaw diagnostics integration"
};
}
return {
kind: "no-provider-request",
severity: "fail",

View File

@ -80,7 +80,7 @@ export async function collectEnvMetrics(envName, options = {}) {
recordCollector(collectors, "process", metrics.process);
}
if (serviceJson.gatewayPort) {
if (serviceJson.gatewayPort && shouldProbeReadiness(serviceJson, readinessTimeoutMs)) {
await collectReadinessAndHealth(metrics, collectors, serviceJson.gatewayPort, {
readinessTimeoutMs,
readinessThresholdMs: options.readinessThresholdMs,
@ -91,6 +91,20 @@ export async function collectEnvMetrics(envName, options = {}) {
timeoutMs,
sampleHealthAfterReady: Boolean(serviceJson.childPid)
});
} else if (serviceJson.gatewayPort) {
metrics.readiness = skippedReadinessMetrics(serviceJson.gatewayPort, {
thresholdMs: options.readinessThresholdMs,
deadlineMs: readinessTimeoutMs,
reason: serviceJson.childPid
? "readiness probe disabled for this phase"
: "gateway process is not expected to be running for this phase"
});
recordCollector(collectors, "readiness", {
commandStatus: 0,
durationMs: 0,
statusLabel: "INFO",
error: null
});
}
await collectLogAndTimelineMetrics(metrics, collectors, envName, timeoutMs, options);
@ -110,6 +124,45 @@ export async function collectEnvMetrics(envName, options = {}) {
return metrics;
}
/**
 * Decide whether the gateway readiness probe should run for this collection pass.
 *
 * @param {object} serviceJson - Service status record; reads `childPid`,
 *   `running`, `desiredRunning` and `gatewayState`.
 * @param {number} readinessTimeoutMs - Readiness probe budget in milliseconds.
 * @returns {boolean} `true` when the probe should run, `false` when it should be skipped.
 */
function shouldProbeReadiness(serviceJson, readinessTimeoutMs) {
  // A truthy child PID always wins, even when the timeout budget is zero or negative.
  if (serviceJson.childPid) {
    return true;
  }

  // Without a child PID, a non-positive budget disables probing outright.
  if (readinessTimeoutMs <= 0) {
    return false;
  }

  // Only a strict boolean `true` counts for the running flags.
  if (serviceJson.running === true || serviceJson.desiredRunning === true) {
    return true;
  }

  return ["running", "backoff"].includes(serviceJson.gatewayState);
}
/**
 * Build a placeholder `kova.readiness.v1` record for a readiness probe that was
 * not run. Nothing is probed here: attempt counters are zero or empty and every
 * measured field is `null`.
 *
 * @param {number|string} port - Gateway port; coerced with `Number()`.
 * @param {object} details
 * @param {number} [details.thresholdMs] - Readiness threshold; nullish becomes 0
 *   and negative values are clamped to 0.
 * @param {number} details.deadlineMs - Copied through unchanged as `deadlineMs`.
 * @param {string} details.reason - Why the probe was skipped; stored both as the
 *   classification reason and as the listening error text.
 * @returns {object} The readiness record with classification state "not-applicable".
 */
function skippedReadinessMetrics(port, { thresholdMs, deadlineMs, reason }) {
  const normalizedThresholdMs = Math.max(0, Number(thresholdMs ?? 0));

  const classification = {
    state: "not-applicable",
    severity: "info",
    reason
  };

  const listening = {
    host: "127.0.0.1",
    port: Number(port),
    ok: null,
    durationMs: null,
    error: reason
  };

  // Key order mirrors the schema layout so serialized output stays stable.
  return {
    schemaVersion: "kova.readiness.v1",
    deadlineMs,
    thresholdMs: normalizedThresholdMs,
    intervalMs: null,
    attempts: 0,
    ready: null,
    listeningReady: null,
    listeningReadyAtMs: null,
    healthReadyAtMs: null,
    classification,
    listening,
    health: null,
    listeningAttempts: [],
    healthAttempts: []
  };
}
async function collectReadinessAndHealth(metrics, collectors, port, options) {
const readinessStarted = Date.now();
metrics.readiness = await collectReadinessMetrics(port, {