feat: add long agent session coverage

This commit is contained in:
Shakker 2026-05-01 08:26:39 +01:00
parent 97454f3f99
commit d601bde7f8
No known key found for this signature in database
9 changed files with 218 additions and 2 deletions

View File

@ -26,6 +26,7 @@
{ "scenario": "provider-models", "state": "model-auth-configured" },
{ "scenario": "provider-models", "state": "model-auth-missing" },
{ "scenario": "agent-cold-warm-message", "state": "mock-openai-provider", "timeoutMs": 180000 },
{ "scenario": "agent-long-session", "state": "mock-openai-provider", "timeoutMs": 360000 },
{ "scenario": "agent-provider-slow", "state": "mock-openai-provider", "timeoutMs": 180000 },
{ "scenario": "agent-provider-timeout", "state": "mock-openai-provider", "timeoutMs": 180000 },
{ "scenario": "agent-provider-malformed", "state": "mock-openai-provider", "timeoutMs": 180000 },

View File

@ -98,6 +98,7 @@
{ "scenario": "agent-provider-malformed", "state": "mock-openai-provider" },
{ "scenario": "agent-provider-streaming-stall", "state": "mock-openai-provider" },
{ "scenario": "agent-provider-recovery", "state": "mock-openai-provider" },
{ "scenario": "agent-long-session", "state": "mock-openai-provider" },
{ "scenario": "failure-injection", "state": "broken-plugin-deps" },
{ "scenario": "soak", "state": "large-workspace" },
{ "scenario": "cross-platform-smoke", "state": "slow-filesystem" }
@ -191,6 +192,11 @@
"state": "mock-openai-provider",
"timeoutMs": 240000
},
{
"scenario": "agent-long-session",
"state": "mock-openai-provider",
"timeoutMs": 360000
},
{
"scenario": "dashboard-readiness",
"state": "fresh"

View File

@ -32,6 +32,11 @@
"scenario": "agent-cold-warm-message",
"state": "mock-openai-provider",
"timeoutMs": 180000
},
{
"scenario": "agent-long-session",
"state": "mock-openai-provider",
"timeoutMs": 360000
}
]
}

View File

@ -0,0 +1,100 @@
{
"id": "agent-long-session",
"surface": "agent-message",
"title": "Agent Long Session",
"objective": "Send repeated simple messages through one OpenClaw session to catch latency drift, provider routing drift, resource growth, health degradation, and child-process leaks during normal assistant use.",
"tags": ["agent", "message", "latency", "providers", "soak", "long-session"],
"timeoutMs": 360000,
"agent": {
"expectedText": "KOVA_AGENT_OK"
},
"thresholds": {
"gatewayReadyMs": 30000,
"agentTurnMs": 45000,
"agentTurnP95Ms": 30000,
"agentTurnMaxMs": 45000,
"preProviderMs": 10000,
"agentPreProviderP95Ms": 8000,
"agentPreProviderMaxMs": 12000,
"providerFinalMs": 3000,
"agentProviderFinalP95Ms": 3000,
"preProviderDominanceRatio": 0.8,
"agentContainmentHealthFailures": 0,
"agentProcessLeaks": 0,
"statusMs": 10000,
"peakRssMb": 900,
"missingDependencyErrors": 0,
"pluginLoadFailures": 0,
"providerTimeoutMentions": 0
},
"phases": [
{
"id": "provision",
"title": "Provision Long Session Env",
"intent": "Start a disposable OpenClaw gateway before wiring the model provider and sending repeated messages.",
"commands": ["ocm start {env} {startSelector} --json"],
"evidence": ["gateway port", "runtime binding", "startup readiness"]
},
{
"id": "cold-session-turn",
"title": "Cold Session Turn",
"intent": "Send the first simple message through OpenClaw's real local embedded agent CLI command in a fresh session.",
"commands": [
"ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
],
"evidence": ["cold command duration", "assistant text", "provider request timing", "role resource samples"]
},
{
"id": "warm-session-turn",
"title": "Warm Session Turn",
"intent": "Send a warm follow-up in the same session to establish cache behavior after cold discovery work.",
"commands": [
"ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
],
"evidence": ["warm command duration", "assistant text", "provider request timing", "cold/warm delta"]
},
{
"id": "session-turn-3",
"title": "Session Turn 3",
"intent": "Continue the same OpenClaw session to catch repeated-turn latency or resource drift.",
"commands": [
"ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
],
"evidence": ["turn duration", "assistant text", "provider request timing", "role resource samples"]
},
{
"id": "session-turn-4",
"title": "Session Turn 4",
"intent": "Continue the same OpenClaw session and verify the assistant path remains stable.",
"commands": [
"ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
],
"evidence": ["turn duration", "assistant text", "provider request timing", "role resource samples"]
},
{
"id": "session-turn-5",
"title": "Session Turn 5",
"intent": "Continue repeated assistant use and catch process or memory growth between turns.",
"commands": [
"ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
],
"evidence": ["turn duration", "assistant text", "provider request timing", "role resource samples"]
},
{
"id": "session-turn-6",
"title": "Session Turn 6",
"intent": "Send a final repeated assistant message before checking gateway health and process cleanup.",
"commands": [
"ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
],
"evidence": ["turn duration", "assistant text", "provider request timing", "role resource samples"]
},
{
"id": "post-session-health",
"title": "Post-Session Gateway Health",
"intent": "Verify the gateway remains responsive after repeated agent turns and capture provider/plugin diagnostics.",
"commands": ["ocm @{env} -- status", "ocm logs {env} --tail 500 --raw"],
"evidence": ["gateway status", "provider logs", "plugin errors", "memory after repeated turns", "process leak summary"]
}
]
}

View File

@ -44,6 +44,7 @@ export function evaluateRecord(record, scenario, options = {}) {
const coldAgentTurn = selectAgentTurn(agentTurns, "cold") ?? agentTurns[0] ?? null;
const warmAgentTurn = selectAgentTurn(agentTurns, "warm") ?? agentTurns[1] ?? null;
const providerTurn = collectSlowestProviderTurn(agentTurns);
const agentTurnStats = summarizeAgentTurnStats(agentTurns);
const agentTurnMs = maxNullable(maxDurationWhere(allResults, isAgentMessageCommand), maxTurnDuration(agentTurns));
const agentResponseOk = agentTurns.length === 0 ? null : agentTurns.every((turn) => turn.responseOk === true);
const agentProviderSimulation = evaluateProviderSimulation({ turns: agentTurns, scenario, record, thresholds });
@ -302,6 +303,7 @@ export function evaluateRecord(record, scenario, options = {}) {
}
checkAgentTurnCorrectness(violations, agentTurns, scenario.agent?.expectedText ?? null);
checkAgentTurnThresholds(violations, agentTurns, { coldAgentTurn, warmAgentTurn, providerTurn, agentLatencyDiagnosis }, thresholds, record);
checkAgentTurnAggregateThresholds(violations, agentTurnStats, thresholds);
checkProviderSimulation(violations, agentProviderSimulation);
checkAgentFailureContainment(violations, agentFailureContainment);
@ -318,6 +320,16 @@ export function evaluateRecord(record, scenario, options = {}) {
agentResponseOk,
agentTurnCount: agentTurns.length,
agentTurns,
agentTurnStats,
agentTurnMedianMs: agentTurnStats.totalTurnMs.median,
agentTurnP95Ms: agentTurnStats.totalTurnMs.p95,
agentTurnMaxMs: agentTurnStats.totalTurnMs.max,
agentPreProviderMedianMs: agentTurnStats.preProviderMs.median,
agentPreProviderP95Ms: agentTurnStats.preProviderMs.p95,
agentPreProviderMaxMs: agentTurnStats.preProviderMs.max,
agentProviderFinalMedianMs: agentTurnStats.providerFinalMs.median,
agentProviderFinalP95Ms: agentTurnStats.providerFinalMs.p95,
agentProviderFinalMaxMs: agentTurnStats.providerFinalMs.max,
coldAgentTurnMs: coldAgentTurn?.totalTurnMs ?? null,
warmAgentTurnMs: warmAgentTurn?.totalTurnMs ?? null,
agentColdWarmDeltaMs: delta(coldAgentTurn?.totalTurnMs, warmAgentTurn?.totalTurnMs),
@ -606,6 +618,61 @@ function maxTurnDuration(turns) {
return durations.length === 0 ? null : Math.max(...durations);
}
/**
 * Build the aggregate statistics record for a list of agent turns.
 *
 * Each timing field is summarized with summarizeNumericField; the remaining
 * fields are tallies accumulated across the turns.
 *
 * @param {Array<object>} turns - Agent turn records to aggregate.
 * @returns {object} Stats record tagged with schemaVersion "kova.agentTurnStats.v1".
 */
function summarizeAgentTurnStats(turns) {
  const summarize = (field) => summarizeNumericField(turns, field);

  const totalTurnMs = summarize("totalTurnMs");
  const preProviderMs = summarize("preProviderMs");
  const providerFinalMs = summarize("providerFinalMs");
  const postProviderMs = summarize("postProviderMs");
  const firstByteLatencyMs = summarize("firstByteLatencyMs");

  let processLeakCount = 0;
  let missingProviderRequestCount = 0;
  let responseOkCount = 0;
  turns.forEach((turn) => {
    // A turn without a processLeakCount contributes zero to the total.
    processLeakCount = processLeakCount + (turn.processLeakCount ?? 0);
    // Only a strict `true` counts for the two boolean tallies.
    if (turn.missingProviderRequest === true) {
      missingProviderRequestCount += 1;
    }
    if (turn.responseOk === true) {
      responseOkCount += 1;
    }
  });

  return {
    schemaVersion: "kova.agentTurnStats.v1",
    count: turns.length,
    totalTurnMs,
    preProviderMs,
    providerFinalMs,
    postProviderMs,
    firstByteLatencyMs,
    processLeakCount,
    missingProviderRequestCount,
    responseOkCount
  };
}
/**
 * Summarize one numeric field across a list of records.
 *
 * Only finite numbers are considered; missing items, missing fields and
 * non-numeric values are ignored.
 *
 * @param {Array<object>} items - Records to read the field from.
 * @param {string} field - Property name to summarize.
 * @returns {{count: number, min: ?number, median: ?number, p95: ?number, max: ?number}}
 *   Summary of the finite values; every statistic is null when there are none.
 */
function summarizeNumericField(items, field) {
  const values = [];
  items.forEach((item) => {
    const candidate = item?.[field];
    if (typeof candidate === "number" && Number.isFinite(candidate)) {
      values.push(candidate);
    }
  });
  // `values` is a fresh local array, so sorting it in place leaves `items` untouched.
  values.sort((a, b) => a - b);

  const count = values.length;
  if (count > 0) {
    return {
      count,
      min: values[0],
      median: percentile(values, 50),
      p95: percentile(values, 95),
      max: values[count - 1]
    };
  }
  return {
    count: 0,
    min: null,
    median: null,
    p95: null,
    max: null
  };
}
/**
 * Compute a percentile of an ascending-sorted list using linear interpolation
 * between the two nearest ranks.
 *
 * @param {number[]} sortedValues - Values already sorted in ascending order.
 * @param {number} percentileValue - Requested percentile; values outside
 *   [0, 100] are clamped to that range.
 * @returns {?number} The percentile (interpolated results are rounded to three
 *   decimals), or null when the list is empty or the percentile is NaN.
 */
function percentile(sortedValues, percentileValue) {
  if (sortedValues.length === 0) {
    return null;
  }
  if (sortedValues.length === 1) {
    return sortedValues[0];
  }
  const requested = Number(percentileValue);
  if (Number.isNaN(requested)) {
    // No meaningful rank exists; report "no value" instead of leaking NaN.
    return null;
  }
  // Clamp so the computed rank can never index outside the array.
  const clamped = Math.min(100, Math.max(0, requested));
  const position = (clamped / 100) * (sortedValues.length - 1);
  const lower = Math.floor(position);
  const upper = Math.ceil(position);
  if (lower === upper) {
    return sortedValues[lower];
  }
  const weight = position - lower;
  const interpolated = sortedValues[lower] * (1 - weight) + sortedValues[upper] * weight;
  return Math.round(interpolated * 1000) / 1000;
}
function checkAgentTurnCorrectness(violations, turns, expectedText) {
for (const turn of turns) {
if (turn.expectedFailure === true) {
@ -860,6 +927,28 @@ function checkAgentTurnThresholds(violations, turns, selected, thresholds, recor
}
}
/**
 * Check the aggregate agent-turn statistics against their thresholds.
 *
 * Each entry pairs a stats field and statistic with a metric name; the same
 * metric name is used as the key into `thresholds`. Checks run in table order.
 *
 * @param {Array<object>} violations - Violation list passed through to checkAggregateThreshold.
 * @param {object} stats - Stats record with totalTurnMs, preProviderMs and providerFinalMs summaries.
 * @param {object} thresholds - Threshold values keyed by metric name.
 */
function checkAgentTurnAggregateThresholds(violations, stats, thresholds) {
  const aggregateChecks = [
    ["totalTurnMs", "p95", "agentTurnP95Ms"],
    ["totalTurnMs", "max", "agentTurnMaxMs"],
    ["preProviderMs", "p95", "agentPreProviderP95Ms"],
    ["preProviderMs", "max", "agentPreProviderMaxMs"],
    ["providerFinalMs", "p95", "agentProviderFinalP95Ms"],
    ["providerFinalMs", "max", "agentProviderFinalMaxMs"]
  ];
  for (const [field, statistic, metric] of aggregateChecks) {
    checkAggregateThreshold(violations, stats[field][statistic], metric, thresholds[metric]);
  }
}
/**
 * Record an "agent-latency" violation when an aggregate value exceeds its threshold.
 *
 * Nothing is recorded when either the threshold or the actual value is not a
 * number, or when the actual value is within the threshold.
 *
 * @param {Array<object>} violations - List that receives the violation; mutated in place.
 * @param {*} actual - Measured aggregate value.
 * @param {string} metric - Metric name reported in the violation.
 * @param {*} threshold - Upper bound (inclusive) for the metric.
 * @returns {void}
 */
function checkAggregateThreshold(violations, actual, metric, threshold) {
  const isComparable = typeof threshold === "number" && typeof actual === "number";
  if (!isComparable) {
    return;
  }
  if (actual <= threshold) {
    return;
  }
  const violation = {
    kind: "agent-latency",
    metric,
    expected: `<= ${threshold}`,
    actual,
    message: `${metric} ${actual}ms exceeded threshold ${threshold}ms`
  };
  violations.push(violation);
}
function checkTurnThreshold(violations, turn, metric, threshold, message) {
if (!turn || typeof threshold !== "number" || typeof turn[metric] !== "number" || turn[metric] <= threshold) {
return;

View File

@ -9,9 +9,12 @@ export const PERFORMANCE_METRICS = [
{ id: "openclawEventLoopMaxMs", title: "Event Loop Max", unit: "ms", regressionKey: "eventLoopRegressionPercent" },
{ id: "eventLoopDelayMs", title: "Event Loop Delay", unit: "ms", regressionKey: "eventLoopRegressionPercent" },
{ id: "agentTurnMs", title: "Agent Turn", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
{ id: "agentTurnP95Ms", title: "Agent Turn p95", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
{ id: "agentTurnMaxMs", title: "Agent Turn Max", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
{ id: "coldAgentTurnMs", title: "Cold Agent Turn", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
{ id: "warmAgentTurnMs", title: "Warm Agent Turn", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
{ id: "agentColdWarmDeltaMs", title: "Cold/Warm Agent Delta", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
{ id: "agentPreProviderP95Ms", title: "Pre-Provider p95", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
{ id: "coldPreProviderMs", title: "Cold Pre-Provider", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
{ id: "warmPreProviderMs", title: "Warm Pre-Provider", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
{ id: "healthP95Ms", title: "Health p95", unit: "ms", regressionKey: "startupRegressionPercent" },

View File

@ -144,6 +144,7 @@ export function renderMarkdownReport(report) {
lines.push(`- Agent cold/warm: cold ${record.measurements.coldAgentTurnMs ?? "unknown"} ms; warm ${record.measurements.warmAgentTurnMs ?? "unknown"} ms; delta ${record.measurements.agentColdWarmDeltaMs ?? "unknown"} ms`);
lines.push(`- Agent pre-provider: cold ${record.measurements.coldPreProviderMs ?? "unknown"} ms; warm ${record.measurements.warmPreProviderMs ?? "unknown"} ms; delta ${record.measurements.agentColdWarmPreProviderDeltaMs ?? "unknown"} ms`);
lines.push(`- Agent provider final: cold ${record.measurements.coldProviderFinalMs ?? "unknown"} ms; warm ${record.measurements.warmProviderFinalMs ?? "unknown"} ms`);
lines.push(`- Agent turn stats: count ${record.measurements.agentTurnCount}; p95 ${record.measurements.agentTurnP95Ms ?? "unknown"} ms; max ${record.measurements.agentTurnMaxMs ?? "unknown"} ms; pre-provider p95 ${record.measurements.agentPreProviderP95Ms ?? "unknown"} ms`);
}
if (record.measurements.agentProviderAttribution) {
lines.push(`- Provider evidence: ${record.measurements.agentProviderRequestCount ?? 0} request(s); provider work ${record.measurements.agentProviderFinalMs ?? "unknown"} ms; pre-provider ${record.measurements.agentPreProviderMs ?? "unknown"} ms; post-provider ${record.measurements.agentPostProviderMs ?? "unknown"} ms`);
@ -549,6 +550,13 @@ function summarizeMeasurements(measurements) {
providerFirstByteLatencyMs: measurements.providerFirstByteLatencyMs ?? null,
agentTurnCount: measurements.agentTurnCount ?? null,
agentTurns: measurements.agentTurns ?? null,
agentTurnStats: measurements.agentTurnStats ?? null,
agentTurnP95Ms: measurements.agentTurnP95Ms ?? null,
agentTurnMaxMs: measurements.agentTurnMaxMs ?? null,
agentPreProviderP95Ms: measurements.agentPreProviderP95Ms ?? null,
agentPreProviderMaxMs: measurements.agentPreProviderMaxMs ?? null,
agentProviderFinalP95Ms: measurements.agentProviderFinalP95Ms ?? null,
agentProviderFinalMaxMs: measurements.agentProviderFinalMaxMs ?? null,
coldAgentTurnMs: measurements.coldAgentTurnMs ?? null,
warmAgentTurnMs: measurements.warmAgentTurnMs ?? null,
agentColdWarmDeltaMs: measurements.agentColdWarmDeltaMs ?? null,

View File

@ -1226,6 +1226,9 @@ function agentTurnBreakdownCheck() {
agent: { expectedText: "KOVA_AGENT_OK" },
thresholds: {}
}, { surface: { thresholds: {} }, targetPlan: { kind: "local-build" } });
assertEqual(record.measurements.agentTurnStats?.count, 1, "agent turn stats count");
assertEqual(record.measurements.agentTurnP95Ms, 1000, "agent turn p95");
assertEqual(record.measurements.agentPreProviderP95Ms, 200, "agent pre-provider p95");
const rendered = renderMarkdownReport({
generatedAt: "2026-05-01T00:00:00.000Z",
runId: "self-check-agent-turn-breakdown",
@ -1237,6 +1240,7 @@ function agentTurnBreakdownCheck() {
});
assertEqual(rendered.includes("breakdown:"), true, "markdown includes agent turn breakdown");
assertEqual(rendered.includes("models.catalog.* 70ms"), true, "markdown includes source span evidence");
assertEqual(rendered.includes("Agent turn stats:"), true, "markdown includes agent turn stats");
assertEqual(
summarizeAgentTurnBreakdownForMarkdown(normal.breakdown).includes("unknown 15ms"),
true,

View File

@ -2,10 +2,10 @@
"id": "agent-message",
"title": "Agent Message",
"ownerArea": "agent-runtime",
"description": "Send cold and warm local OpenClaw agent messages and verify response latency, provider routing, gateway health, memory, and logs.",
"description": "Send cold, warm, and repeated local OpenClaw agent messages and verify response latency, provider routing, gateway health, memory, and logs.",
"requiredStates": ["mock-openai-provider"],
"targetKinds": ["npm", "channel", "runtime", "local-build"],
"requiredMetrics": ["agentTurnMs", "coldAgentTurnMs", "warmAgentTurnMs", "agentColdWarmDeltaMs", "coldPreProviderMs", "warmPreProviderMs", "healthP95Ms", "peakRssMb", "providerTimeoutMentions", "pluginLoadFailures"],
"requiredMetrics": ["agentTurnMs", "agentTurnP95Ms", "agentTurnMaxMs", "coldAgentTurnMs", "warmAgentTurnMs", "agentColdWarmDeltaMs", "coldPreProviderMs", "warmPreProviderMs", "agentPreProviderP95Ms", "healthP95Ms", "peakRssMb", "providerTimeoutMentions", "pluginLoadFailures"],
"processRoles": ["gateway", "command-tree", "agent-cli", "agent-process", "mock-provider"],
"thresholds": { "agentTurnMs": 45000, "coldAgentTurnMs": 45000, "warmAgentTurnMs": 15000, "coldWarmDeltaMs": 30000, "preProviderMs": 10000, "providerFinalMs": 3000, "healthP95Ms": 1000, "peakRssMb": 900 },
"roleThresholds": {