From d601bde7f8b85d77db565b6846133adea6aae14c Mon Sep 17 00:00:00 2001 From: Shakker Date: Fri, 1 May 2026 08:26:39 +0100 Subject: [PATCH] feat: add long agent session coverage --- profiles/exhaustive.json | 1 + profiles/release.json | 6 ++ profiles/soak.json | 5 ++ scenarios/agent-long-session.json | 100 ++++++++++++++++++++++++++++++ src/evaluator.mjs | 89 ++++++++++++++++++++++++++ src/performance/stats.mjs | 3 + src/report.mjs | 8 +++ src/selfcheck.mjs | 4 ++ surfaces/agent-message.json | 4 +- 9 files changed, 218 insertions(+), 2 deletions(-) create mode 100644 scenarios/agent-long-session.json diff --git a/profiles/exhaustive.json b/profiles/exhaustive.json index c73e76f..2865da1 100644 --- a/profiles/exhaustive.json +++ b/profiles/exhaustive.json @@ -26,6 +26,7 @@ { "scenario": "provider-models", "state": "model-auth-configured" }, { "scenario": "provider-models", "state": "model-auth-missing" }, { "scenario": "agent-cold-warm-message", "state": "mock-openai-provider", "timeoutMs": 180000 }, + { "scenario": "agent-long-session", "state": "mock-openai-provider", "timeoutMs": 360000 }, { "scenario": "agent-provider-slow", "state": "mock-openai-provider", "timeoutMs": 180000 }, { "scenario": "agent-provider-timeout", "state": "mock-openai-provider", "timeoutMs": 180000 }, { "scenario": "agent-provider-malformed", "state": "mock-openai-provider", "timeoutMs": 180000 }, diff --git a/profiles/release.json b/profiles/release.json index a5ab40a..eabc1cc 100644 --- a/profiles/release.json +++ b/profiles/release.json @@ -98,6 +98,7 @@ { "scenario": "agent-provider-malformed", "state": "mock-openai-provider" }, { "scenario": "agent-provider-streaming-stall", "state": "mock-openai-provider" }, { "scenario": "agent-provider-recovery", "state": "mock-openai-provider" }, + { "scenario": "agent-long-session", "state": "mock-openai-provider" }, { "scenario": "failure-injection", "state": "broken-plugin-deps" }, { "scenario": "soak", "state": "large-workspace" }, { "scenario": "cross-platform-smoke", "state": "slow-filesystem" } @@ -191,6 +192,11 @@ "state": "mock-openai-provider", "timeoutMs": 240000 }, + { + "scenario": "agent-long-session", + "state": "mock-openai-provider", + "timeoutMs": 360000 + }, { "scenario": "dashboard-readiness", "state": "fresh" diff --git a/profiles/soak.json b/profiles/soak.json index a252ba9..00416a4 100644 --- a/profiles/soak.json +++ b/profiles/soak.json @@ -32,6 +32,11 @@ "scenario": "agent-cold-warm-message", "state": "mock-openai-provider", "timeoutMs": 180000 + }, + { + "scenario": "agent-long-session", + "state": "mock-openai-provider", + "timeoutMs": 360000 } ] } diff --git a/scenarios/agent-long-session.json b/scenarios/agent-long-session.json new file mode 100644 index 0000000..1046a91 --- /dev/null +++ b/scenarios/agent-long-session.json @@ -0,0 +1,100 @@ +{ + "id": "agent-long-session", + "surface": "agent-message", + "title": "Agent Long Session", + "objective": "Send repeated simple messages through one OpenClaw session to catch latency drift, provider routing drift, resource growth, health degradation, and child-process leaks during normal assistant use.", + "tags": ["agent", "message", "latency", "providers", "soak", "long-session"], + "timeoutMs": 360000, + "agent": { + "expectedText": "KOVA_AGENT_OK" + }, + "thresholds": { + "gatewayReadyMs": 30000, + "agentTurnMs": 45000, + "agentTurnP95Ms": 30000, + "agentTurnMaxMs": 45000, + "preProviderMs": 10000, + "agentPreProviderP95Ms": 8000, + "agentPreProviderMaxMs": 12000, + "providerFinalMs": 3000, + "agentProviderFinalP95Ms": 3000, + "preProviderDominanceRatio": 0.8, + "agentContainmentHealthFailures": 0, + "agentProcessLeaks": 0, + "statusMs": 10000, + "peakRssMb": 900, + "missingDependencyErrors": 0, + "pluginLoadFailures": 0, + "providerTimeoutMentions": 0 + }, + "phases": [ + { + "id": "provision", + "title": "Provision Long Session Env", + "intent": "Start a disposable OpenClaw gateway before wiring the model provider and sending repeated messages.", + "commands": ["ocm start {env} {startSelector} --json"], + "evidence": ["gateway port", "runtime binding", "startup readiness"] + }, + { + "id": "cold-session-turn", + "title": "Cold Session Turn", + "intent": "Send the first simple message through OpenClaw's real local embedded agent CLI command in a fresh session.", + "commands": [ + "ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json" + ], + "evidence": ["cold command duration", "assistant text", "provider request timing", "role resource samples"] + }, + { + "id": "warm-session-turn", + "title": "Warm Session Turn", + "intent": "Send a warm follow-up in the same session to establish cache behavior after cold discovery work.", + "commands": [ + "ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json" + ], + "evidence": ["warm command duration", "assistant text", "provider request timing", "cold/warm delta"] + }, + { + "id": "session-turn-3", + "title": "Session Turn 3", + "intent": "Continue the same OpenClaw session to catch repeated-turn latency or resource drift.", + "commands": [ + "ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json" + ], + "evidence": ["turn duration", "assistant text", "provider request timing", "role resource samples"] + }, + { + "id": "session-turn-4", + "title": "Session Turn 4", + "intent": "Continue the same OpenClaw session and verify the assistant path remains stable.", + "commands": [ + "ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json" + ], + "evidence": ["turn duration", "assistant text", "provider request timing", "role resource samples"] + }, + { + "id": "session-turn-5", + "title": "Session Turn 5", + "intent": "Continue repeated assistant use and catch process or memory growth between turns.", + "commands": [ + "ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json" + ], + "evidence": ["turn duration", "assistant text", "provider request timing", "role resource samples"] + }, + { + "id": "session-turn-6", + "title": "Session Turn 6", + "intent": "Send a final repeated assistant message before checking gateway health and process cleanup.", + "commands": [ + "ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json" + ], + "evidence": ["turn duration", "assistant text", "provider request timing", "role resource samples"] + }, + { + "id": "post-session-health", + "title": "Post-Session Gateway Health", + "intent": "Verify the gateway remains responsive after repeated agent turns and capture provider/plugin diagnostics.", + "commands": ["ocm @{env} -- status", "ocm logs {env} --tail 500 --raw"], + "evidence": ["gateway status", "provider logs", "plugin errors", "memory after repeated turns", "process leak summary"] + } + ] +} diff --git a/src/evaluator.mjs b/src/evaluator.mjs index 09211b9..0d59fbe 100644 --- a/src/evaluator.mjs +++ b/src/evaluator.mjs @@ -44,6 +44,7 @@ export function evaluateRecord(record, scenario, options = {}) { const coldAgentTurn = selectAgentTurn(agentTurns, "cold") ?? agentTurns[0] ?? null; const warmAgentTurn = selectAgentTurn(agentTurns, "warm") ?? agentTurns[1] ?? null; const providerTurn = collectSlowestProviderTurn(agentTurns); + const agentTurnStats = summarizeAgentTurnStats(agentTurns); const agentTurnMs = maxNullable(maxDurationWhere(allResults, isAgentMessageCommand), maxTurnDuration(agentTurns)); const agentResponseOk = agentTurns.length === 0 ? null : agentTurns.every((turn) => turn.responseOk === true); const agentProviderSimulation = evaluateProviderSimulation({ turns: agentTurns, scenario, record, thresholds }); @@ -302,6 +303,7 @@ export function evaluateRecord(record, scenario, options = {}) { } checkAgentTurnCorrectness(violations, agentTurns, scenario.agent?.expectedText ?? null); checkAgentTurnThresholds(violations, agentTurns, { coldAgentTurn, warmAgentTurn, providerTurn, agentLatencyDiagnosis }, thresholds, record); + checkAgentTurnAggregateThresholds(violations, agentTurnStats, thresholds); checkProviderSimulation(violations, agentProviderSimulation); checkAgentFailureContainment(violations, agentFailureContainment); @@ -318,6 +320,16 @@ export function evaluateRecord(record, scenario, options = {}) { agentResponseOk, agentTurnCount: agentTurns.length, agentTurns, + agentTurnStats, + agentTurnMedianMs: agentTurnStats.totalTurnMs.median, + agentTurnP95Ms: agentTurnStats.totalTurnMs.p95, + agentTurnMaxMs: agentTurnStats.totalTurnMs.max, + agentPreProviderMedianMs: agentTurnStats.preProviderMs.median, + agentPreProviderP95Ms: agentTurnStats.preProviderMs.p95, + agentPreProviderMaxMs: agentTurnStats.preProviderMs.max, + agentProviderFinalMedianMs: agentTurnStats.providerFinalMs.median, + agentProviderFinalP95Ms: agentTurnStats.providerFinalMs.p95, + agentProviderFinalMaxMs: agentTurnStats.providerFinalMs.max, coldAgentTurnMs: coldAgentTurn?.totalTurnMs ?? null, warmAgentTurnMs: warmAgentTurn?.totalTurnMs ?? null, agentColdWarmDeltaMs: delta(coldAgentTurn?.totalTurnMs, warmAgentTurn?.totalTurnMs), @@ -606,6 +618,61 @@ function maxTurnDuration(turns) { return durations.length === 0 ? null : Math.max(...durations); } +function summarizeAgentTurnStats(turns) { + return { + schemaVersion: "kova.agentTurnStats.v1", + count: turns.length, + totalTurnMs: summarizeNumericField(turns, "totalTurnMs"), + preProviderMs: summarizeNumericField(turns, "preProviderMs"), + providerFinalMs: summarizeNumericField(turns, "providerFinalMs"), + postProviderMs: summarizeNumericField(turns, "postProviderMs"), + firstByteLatencyMs: summarizeNumericField(turns, "firstByteLatencyMs"), + processLeakCount: turns.reduce((sum, turn) => sum + (turn.processLeakCount ?? 0), 0), + missingProviderRequestCount: turns.filter((turn) => turn.missingProviderRequest === true).length, + responseOkCount: turns.filter((turn) => turn.responseOk === true).length + }; +} + +function summarizeNumericField(items, field) { + const values = items + .map((item) => item?.[field]) + .filter((value) => typeof value === "number" && Number.isFinite(value)) + .toSorted((left, right) => left - right); + if (values.length === 0) { + return { + count: 0, + min: null, + median: null, + p95: null, + max: null + }; + } + return { + count: values.length, + min: values[0], + median: percentile(values, 50), + p95: percentile(values, 95), + max: values.at(-1) + }; +} + +function percentile(sortedValues, percentileValue) { + if (sortedValues.length === 0) { + return null; + } + if (sortedValues.length === 1) { + return sortedValues[0]; + } + const position = (percentileValue / 100) * (sortedValues.length - 1); + const lower = Math.floor(position); + const upper = Math.ceil(position); + if (lower === upper) { + return sortedValues[lower]; + } + const weight = position - lower; + return Math.round((sortedValues[lower] * (1 - weight) + sortedValues[upper] * weight) * 1000) / 1000; +} + function checkAgentTurnCorrectness(violations, turns, expectedText) { for (const turn of turns) { if (turn.expectedFailure === true) { @@ -860,6 +927,28 @@ function checkAgentTurnThresholds(violations, turns, selected, thresholds, recor } } +function checkAgentTurnAggregateThresholds(violations, stats, thresholds) { + checkAggregateThreshold(violations, stats.totalTurnMs.p95, "agentTurnP95Ms", thresholds.agentTurnP95Ms); + checkAggregateThreshold(violations, stats.totalTurnMs.max, "agentTurnMaxMs", thresholds.agentTurnMaxMs); + checkAggregateThreshold(violations, stats.preProviderMs.p95, "agentPreProviderP95Ms", thresholds.agentPreProviderP95Ms); + checkAggregateThreshold(violations, stats.preProviderMs.max, "agentPreProviderMaxMs", thresholds.agentPreProviderMaxMs); + checkAggregateThreshold(violations, stats.providerFinalMs.p95, "agentProviderFinalP95Ms", thresholds.agentProviderFinalP95Ms); + checkAggregateThreshold(violations, stats.providerFinalMs.max, "agentProviderFinalMaxMs", thresholds.agentProviderFinalMaxMs); +} + +function checkAggregateThreshold(violations, actual, metric, threshold) { + if (typeof threshold !== "number" || typeof actual !== "number" || actual <= threshold) { + return; + } + violations.push({ + kind: "agent-latency", + metric, + expected: `<= ${threshold}`, + actual, + message: `${metric} ${actual}ms exceeded threshold ${threshold}ms` + }); +} + function checkTurnThreshold(violations, turn, metric, threshold, message) { if (!turn || typeof threshold !== "number" || typeof turn[metric] !== "number" || turn[metric] <= threshold) { return; diff --git a/src/performance/stats.mjs b/src/performance/stats.mjs index bc0402e..361439b 100644 --- a/src/performance/stats.mjs +++ b/src/performance/stats.mjs @@ -9,9 +9,12 @@ export const PERFORMANCE_METRICS = [ { id: "openclawEventLoopMaxMs", title: "Event Loop Max", unit: "ms", regressionKey: "eventLoopRegressionPercent" }, { id: "eventLoopDelayMs", title: "Event Loop Delay", unit: "ms", regressionKey: "eventLoopRegressionPercent" }, { id: "agentTurnMs", title: "Agent Turn", unit: "ms", regressionKey: "agentLatencyRegressionPercent" }, + { id: "agentTurnP95Ms", title: "Agent Turn p95", unit: "ms", regressionKey: "agentLatencyRegressionPercent" }, + { id: "agentTurnMaxMs", title: "Agent Turn Max", unit: "ms", regressionKey: "agentLatencyRegressionPercent" }, { id: "coldAgentTurnMs", title: "Cold Agent Turn", unit: "ms", regressionKey: "agentLatencyRegressionPercent" }, { id: "warmAgentTurnMs", title: "Warm Agent Turn", unit: "ms", regressionKey: "agentLatencyRegressionPercent" }, { id: "agentColdWarmDeltaMs", title: "Cold/Warm Agent Delta", unit: "ms", regressionKey: "agentLatencyRegressionPercent" }, + { id: "agentPreProviderP95Ms", title: "Pre-Provider p95", unit: "ms", regressionKey: "agentLatencyRegressionPercent" }, { id: "coldPreProviderMs", title: "Cold Pre-Provider", unit: "ms", regressionKey: "agentLatencyRegressionPercent" }, { id: "warmPreProviderMs", title: "Warm Pre-Provider", unit: "ms", regressionKey: "agentLatencyRegressionPercent" }, { id: "healthP95Ms", title: "Health p95", unit: "ms", regressionKey: "startupRegressionPercent" }, diff --git a/src/report.mjs b/src/report.mjs index 86f97ef..43f784c 100644 --- a/src/report.mjs +++ b/src/report.mjs @@ -144,6 +144,7 @@ export function renderMarkdownReport(report) { lines.push(`- Agent cold/warm: cold ${record.measurements.coldAgentTurnMs ?? "unknown"} ms; warm ${record.measurements.warmAgentTurnMs ?? "unknown"} ms; delta ${record.measurements.agentColdWarmDeltaMs ?? "unknown"} ms`); lines.push(`- Agent pre-provider: cold ${record.measurements.coldPreProviderMs ?? "unknown"} ms; warm ${record.measurements.warmPreProviderMs ?? "unknown"} ms; delta ${record.measurements.agentColdWarmPreProviderDeltaMs ?? "unknown"} ms`); lines.push(`- Agent provider final: cold ${record.measurements.coldProviderFinalMs ?? "unknown"} ms; warm ${record.measurements.warmProviderFinalMs ?? "unknown"} ms`); + lines.push(`- Agent turn stats: count ${record.measurements.agentTurnCount}; p95 ${record.measurements.agentTurnP95Ms ?? "unknown"} ms; max ${record.measurements.agentTurnMaxMs ?? "unknown"} ms; pre-provider p95 ${record.measurements.agentPreProviderP95Ms ?? "unknown"} ms`); } if (record.measurements.agentProviderAttribution) { lines.push(`- Provider evidence: ${record.measurements.agentProviderRequestCount ?? 0} request(s); provider work ${record.measurements.agentProviderFinalMs ?? "unknown"} ms; pre-provider ${record.measurements.agentPreProviderMs ?? "unknown"} ms; post-provider ${record.measurements.agentPostProviderMs ?? "unknown"} ms`); @@ -549,6 +550,13 @@ function summarizeMeasurements(measurements) { providerFirstByteLatencyMs: measurements.providerFirstByteLatencyMs ?? null, agentTurnCount: measurements.agentTurnCount ?? null, agentTurns: measurements.agentTurns ?? null, + agentTurnStats: measurements.agentTurnStats ?? null, + agentTurnP95Ms: measurements.agentTurnP95Ms ?? null, + agentTurnMaxMs: measurements.agentTurnMaxMs ?? null, + agentPreProviderP95Ms: measurements.agentPreProviderP95Ms ?? null, + agentPreProviderMaxMs: measurements.agentPreProviderMaxMs ?? null, + agentProviderFinalP95Ms: measurements.agentProviderFinalP95Ms ?? null, + agentProviderFinalMaxMs: measurements.agentProviderFinalMaxMs ?? null, coldAgentTurnMs: measurements.coldAgentTurnMs ?? null, warmAgentTurnMs: measurements.warmAgentTurnMs ?? null, agentColdWarmDeltaMs: measurements.agentColdWarmDeltaMs ?? null, diff --git a/src/selfcheck.mjs b/src/selfcheck.mjs index 1dbbd15..1c2917a 100644 --- a/src/selfcheck.mjs +++ b/src/selfcheck.mjs @@ -1226,6 +1226,9 @@ function agentTurnBreakdownCheck() { agent: { expectedText: "KOVA_AGENT_OK" }, thresholds: {} }, { surface: { thresholds: {} }, targetPlan: { kind: "local-build" } }); + assertEqual(record.measurements.agentTurnStats?.count, 1, "agent turn stats count"); + assertEqual(record.measurements.agentTurnP95Ms, 1000, "agent turn p95"); + assertEqual(record.measurements.agentPreProviderP95Ms, 200, "agent pre-provider p95"); const rendered = renderMarkdownReport({ generatedAt: "2026-05-01T00:00:00.000Z", runId: "self-check-agent-turn-breakdown", @@ -1237,6 +1240,7 @@ function agentTurnBreakdownCheck() { }); assertEqual(rendered.includes("breakdown:"), true, "markdown includes agent turn breakdown"); assertEqual(rendered.includes("models.catalog.* 70ms"), true, "markdown includes source span evidence"); + assertEqual(rendered.includes("Agent turn stats:"), true, "markdown includes agent turn stats"); assertEqual( summarizeAgentTurnBreakdownForMarkdown(normal.breakdown).includes("unknown 15ms"), true, diff --git a/surfaces/agent-message.json b/surfaces/agent-message.json index 6cca384..03931e2 100644 --- a/surfaces/agent-message.json +++ b/surfaces/agent-message.json @@ -2,10 +2,10 @@ "id": "agent-message", "title": "Agent Message", "ownerArea": "agent-runtime", - "description": "Send cold and warm local OpenClaw agent messages and verify response latency, provider routing, gateway health, memory, and logs.", + "description": "Send cold, warm, and repeated local OpenClaw agent messages and verify response latency, provider routing, gateway health, memory, and logs.", "requiredStates": ["mock-openai-provider"], "targetKinds": ["npm", "channel", "runtime", "local-build"], - "requiredMetrics": ["agentTurnMs", "coldAgentTurnMs", "warmAgentTurnMs", "agentColdWarmDeltaMs", "coldPreProviderMs", "warmPreProviderMs", "healthP95Ms", "peakRssMb", "providerTimeoutMentions", "pluginLoadFailures"], + "requiredMetrics": ["agentTurnMs", "agentTurnP95Ms", "agentTurnMaxMs", "coldAgentTurnMs", "warmAgentTurnMs", "agentColdWarmDeltaMs", "coldPreProviderMs", "warmPreProviderMs", "agentPreProviderP95Ms", "healthP95Ms", "peakRssMb", "providerTimeoutMentions", "pluginLoadFailures"], "processRoles": ["gateway", "command-tree", "agent-cli", "agent-process", "mock-provider"], "thresholds": { "agentTurnMs": 45000, "coldAgentTurnMs": 45000, "warmAgentTurnMs": 15000, "coldWarmDeltaMs": 30000, "preProviderMs": 10000, "providerFinalMs": 3000, "healthP95Ms": 1000, "peakRssMb": 900 }, "roleThresholds": {