feat: add long agent session coverage
This commit is contained in:
parent
97454f3f99
commit
d601bde7f8
@ -26,6 +26,7 @@
|
||||
{ "scenario": "provider-models", "state": "model-auth-configured" },
|
||||
{ "scenario": "provider-models", "state": "model-auth-missing" },
|
||||
{ "scenario": "agent-cold-warm-message", "state": "mock-openai-provider", "timeoutMs": 180000 },
|
||||
{ "scenario": "agent-long-session", "state": "mock-openai-provider", "timeoutMs": 360000 },
|
||||
{ "scenario": "agent-provider-slow", "state": "mock-openai-provider", "timeoutMs": 180000 },
|
||||
{ "scenario": "agent-provider-timeout", "state": "mock-openai-provider", "timeoutMs": 180000 },
|
||||
{ "scenario": "agent-provider-malformed", "state": "mock-openai-provider", "timeoutMs": 180000 },
|
||||
|
||||
@ -98,6 +98,7 @@
|
||||
{ "scenario": "agent-provider-malformed", "state": "mock-openai-provider" },
|
||||
{ "scenario": "agent-provider-streaming-stall", "state": "mock-openai-provider" },
|
||||
{ "scenario": "agent-provider-recovery", "state": "mock-openai-provider" },
|
||||
{ "scenario": "agent-long-session", "state": "mock-openai-provider" },
|
||||
{ "scenario": "failure-injection", "state": "broken-plugin-deps" },
|
||||
{ "scenario": "soak", "state": "large-workspace" },
|
||||
{ "scenario": "cross-platform-smoke", "state": "slow-filesystem" }
|
||||
@ -191,6 +192,11 @@
|
||||
"state": "mock-openai-provider",
|
||||
"timeoutMs": 240000
|
||||
},
|
||||
{
|
||||
"scenario": "agent-long-session",
|
||||
"state": "mock-openai-provider",
|
||||
"timeoutMs": 360000
|
||||
},
|
||||
{
|
||||
"scenario": "dashboard-readiness",
|
||||
"state": "fresh"
|
||||
|
||||
@ -32,6 +32,11 @@
|
||||
"scenario": "agent-cold-warm-message",
|
||||
"state": "mock-openai-provider",
|
||||
"timeoutMs": 180000
|
||||
},
|
||||
{
|
||||
"scenario": "agent-long-session",
|
||||
"state": "mock-openai-provider",
|
||||
"timeoutMs": 360000
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
100
scenarios/agent-long-session.json
Normal file
100
scenarios/agent-long-session.json
Normal file
@ -0,0 +1,100 @@
|
||||
{
|
||||
"id": "agent-long-session",
|
||||
"surface": "agent-message",
|
||||
"title": "Agent Long Session",
|
||||
"objective": "Send repeated simple messages through one OpenClaw session to catch latency drift, provider routing drift, resource growth, health degradation, and child-process leaks during normal assistant use.",
|
||||
"tags": ["agent", "message", "latency", "providers", "soak", "long-session"],
|
||||
"timeoutMs": 360000,
|
||||
"agent": {
|
||||
"expectedText": "KOVA_AGENT_OK"
|
||||
},
|
||||
"thresholds": {
|
||||
"gatewayReadyMs": 30000,
|
||||
"agentTurnMs": 45000,
|
||||
"agentTurnP95Ms": 30000,
|
||||
"agentTurnMaxMs": 45000,
|
||||
"preProviderMs": 10000,
|
||||
"agentPreProviderP95Ms": 8000,
|
||||
"agentPreProviderMaxMs": 12000,
|
||||
"providerFinalMs": 3000,
|
||||
"agentProviderFinalP95Ms": 3000,
|
||||
"preProviderDominanceRatio": 0.8,
|
||||
"agentContainmentHealthFailures": 0,
|
||||
"agentProcessLeaks": 0,
|
||||
"statusMs": 10000,
|
||||
"peakRssMb": 900,
|
||||
"missingDependencyErrors": 0,
|
||||
"pluginLoadFailures": 0,
|
||||
"providerTimeoutMentions": 0
|
||||
},
|
||||
"phases": [
|
||||
{
|
||||
"id": "provision",
|
||||
"title": "Provision Long Session Env",
|
||||
"intent": "Start a disposable OpenClaw gateway before wiring the model provider and sending repeated messages.",
|
||||
"commands": ["ocm start {env} {startSelector} --json"],
|
||||
"evidence": ["gateway port", "runtime binding", "startup readiness"]
|
||||
},
|
||||
{
|
||||
"id": "cold-session-turn",
|
||||
"title": "Cold Session Turn",
|
||||
"intent": "Send the first simple message through OpenClaw's real local embedded agent CLI command in a fresh session.",
|
||||
"commands": [
|
||||
"ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
|
||||
],
|
||||
"evidence": ["cold command duration", "assistant text", "provider request timing", "role resource samples"]
|
||||
},
|
||||
{
|
||||
"id": "warm-session-turn",
|
||||
"title": "Warm Session Turn",
|
||||
"intent": "Send a warm follow-up in the same session to establish cache behavior after cold discovery work.",
|
||||
"commands": [
|
||||
"ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
|
||||
],
|
||||
"evidence": ["warm command duration", "assistant text", "provider request timing", "cold/warm delta"]
|
||||
},
|
||||
{
|
||||
"id": "session-turn-3",
|
||||
"title": "Session Turn 3",
|
||||
"intent": "Continue the same OpenClaw session to catch repeated-turn latency or resource drift.",
|
||||
"commands": [
|
||||
"ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
|
||||
],
|
||||
"evidence": ["turn duration", "assistant text", "provider request timing", "role resource samples"]
|
||||
},
|
||||
{
|
||||
"id": "session-turn-4",
|
||||
"title": "Session Turn 4",
|
||||
"intent": "Continue the same OpenClaw session and verify the assistant path remains stable.",
|
||||
"commands": [
|
||||
"ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
|
||||
],
|
||||
"evidence": ["turn duration", "assistant text", "provider request timing", "role resource samples"]
|
||||
},
|
||||
{
|
||||
"id": "session-turn-5",
|
||||
"title": "Session Turn 5",
|
||||
"intent": "Continue repeated assistant use and catch process or memory growth between turns.",
|
||||
"commands": [
|
||||
"ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
|
||||
],
|
||||
"evidence": ["turn duration", "assistant text", "provider request timing", "role resource samples"]
|
||||
},
|
||||
{
|
||||
"id": "session-turn-6",
|
||||
"title": "Session Turn 6",
|
||||
"intent": "Send a final repeated assistant message before checking gateway health and process cleanup.",
|
||||
"commands": [
|
||||
"ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
|
||||
],
|
||||
"evidence": ["turn duration", "assistant text", "provider request timing", "role resource samples"]
|
||||
},
|
||||
{
|
||||
"id": "post-session-health",
|
||||
"title": "Post-Session Gateway Health",
|
||||
"intent": "Verify the gateway remains responsive after repeated agent turns and capture provider/plugin diagnostics.",
|
||||
"commands": ["ocm @{env} -- status", "ocm logs {env} --tail 500 --raw"],
|
||||
"evidence": ["gateway status", "provider logs", "plugin errors", "memory after repeated turns", "process leak summary"]
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -44,6 +44,7 @@ export function evaluateRecord(record, scenario, options = {}) {
|
||||
const coldAgentTurn = selectAgentTurn(agentTurns, "cold") ?? agentTurns[0] ?? null;
|
||||
const warmAgentTurn = selectAgentTurn(agentTurns, "warm") ?? agentTurns[1] ?? null;
|
||||
const providerTurn = collectSlowestProviderTurn(agentTurns);
|
||||
const agentTurnStats = summarizeAgentTurnStats(agentTurns);
|
||||
const agentTurnMs = maxNullable(maxDurationWhere(allResults, isAgentMessageCommand), maxTurnDuration(agentTurns));
|
||||
const agentResponseOk = agentTurns.length === 0 ? null : agentTurns.every((turn) => turn.responseOk === true);
|
||||
const agentProviderSimulation = evaluateProviderSimulation({ turns: agentTurns, scenario, record, thresholds });
|
||||
@ -302,6 +303,7 @@ export function evaluateRecord(record, scenario, options = {}) {
|
||||
}
|
||||
checkAgentTurnCorrectness(violations, agentTurns, scenario.agent?.expectedText ?? null);
|
||||
checkAgentTurnThresholds(violations, agentTurns, { coldAgentTurn, warmAgentTurn, providerTurn, agentLatencyDiagnosis }, thresholds, record);
|
||||
checkAgentTurnAggregateThresholds(violations, agentTurnStats, thresholds);
|
||||
checkProviderSimulation(violations, agentProviderSimulation);
|
||||
checkAgentFailureContainment(violations, agentFailureContainment);
|
||||
|
||||
@ -318,6 +320,16 @@ export function evaluateRecord(record, scenario, options = {}) {
|
||||
agentResponseOk,
|
||||
agentTurnCount: agentTurns.length,
|
||||
agentTurns,
|
||||
agentTurnStats,
|
||||
agentTurnMedianMs: agentTurnStats.totalTurnMs.median,
|
||||
agentTurnP95Ms: agentTurnStats.totalTurnMs.p95,
|
||||
agentTurnMaxMs: agentTurnStats.totalTurnMs.max,
|
||||
agentPreProviderMedianMs: agentTurnStats.preProviderMs.median,
|
||||
agentPreProviderP95Ms: agentTurnStats.preProviderMs.p95,
|
||||
agentPreProviderMaxMs: agentTurnStats.preProviderMs.max,
|
||||
agentProviderFinalMedianMs: agentTurnStats.providerFinalMs.median,
|
||||
agentProviderFinalP95Ms: agentTurnStats.providerFinalMs.p95,
|
||||
agentProviderFinalMaxMs: agentTurnStats.providerFinalMs.max,
|
||||
coldAgentTurnMs: coldAgentTurn?.totalTurnMs ?? null,
|
||||
warmAgentTurnMs: warmAgentTurn?.totalTurnMs ?? null,
|
||||
agentColdWarmDeltaMs: delta(coldAgentTurn?.totalTurnMs, warmAgentTurn?.totalTurnMs),
|
||||
@ -606,6 +618,61 @@ function maxTurnDuration(turns) {
|
||||
return durations.length === 0 ? null : Math.max(...durations);
|
||||
}
|
||||
|
||||
/**
 * Build the aggregate statistics record for a list of agent turns.
 *
 * Each timing field is summarized via summarizeNumericField; the remaining
 * counters are tallied in a single pass over the turns.
 *
 * @param {Array<object>} turns Agent turn records to aggregate.
 * @returns {object} A "kova.agentTurnStats.v1" record holding the turn count,
 *   one numeric summary per timing field, the summed processLeakCount
 *   (a missing count contributes 0), and the number of turns flagged with
 *   missingProviderRequest === true and responseOk === true.
 */
function summarizeAgentTurnStats(turns) {
  let leakedProcessTotal = 0;
  let missingRequestTurns = 0;
  let okResponseTurns = 0;

  for (const turn of turns) {
    leakedProcessTotal += turn.processLeakCount ?? 0;
    if (turn.missingProviderRequest === true) {
      missingRequestTurns += 1;
    }
    if (turn.responseOk === true) {
      okResponseTurns += 1;
    }
  }

  const summaryFor = (field) => summarizeNumericField(turns, field);

  return {
    schemaVersion: "kova.agentTurnStats.v1",
    count: turns.length,
    totalTurnMs: summaryFor("totalTurnMs"),
    preProviderMs: summaryFor("preProviderMs"),
    providerFinalMs: summaryFor("providerFinalMs"),
    postProviderMs: summaryFor("postProviderMs"),
    firstByteLatencyMs: summaryFor("firstByteLatencyMs"),
    processLeakCount: leakedProcessTotal,
    missingProviderRequestCount: missingRequestTurns,
    responseOkCount: okResponseTurns
  };
}
|
||||
|
||||
/**
 * Summarize one numeric field across a list of records.
 *
 * Only values that are finite numbers take part; null/undefined items and
 * non-numeric or non-finite field values are ignored.
 *
 * @param {Array<object|null|undefined>} items Records to read the field from.
 * @param {string} field Name of the property to summarize.
 * @returns {{count: number, min: (number|null), median: (number|null), p95: (number|null), max: (number|null)}}
 *   Summary of the usable values; every statistic is null when none exist.
 */
function summarizeNumericField(items, field) {
  const finiteValues = [];
  for (const item of items) {
    const candidate = item?.[field];
    if (typeof candidate === "number" && Number.isFinite(candidate)) {
      finiteValues.push(candidate);
    }
  }

  const usable = finiteValues.length;
  if (usable === 0) {
    return { count: 0, min: null, median: null, p95: null, max: null };
  }

  // finiteValues is a private copy, so sorting it in place is safe.
  finiteValues.sort((a, b) => a - b);

  return {
    count: usable,
    min: finiteValues[0],
    median: percentile(finiteValues, 50),
    p95: percentile(finiteValues, 95),
    max: finiteValues[usable - 1]
  };
}
|
||||
|
||||
/**
 * Compute a percentile from an ascending-sorted array of numbers, linearly
 * interpolating between the two neighbouring samples when the rank falls
 * between indices.
 *
 * @param {number[]} sortedValues Values already sorted in ascending order.
 * @param {number} percentileValue Percentile on a 0-100 scale (e.g. 50, 95).
 * @returns {number|null} null for an empty array, the only element for a
 *   single-element array, the exact sample when the rank is an integer, or
 *   the interpolated value rounded to three decimal places otherwise.
 */
function percentile(sortedValues, percentileValue) {
  const size = sortedValues.length;
  if (size === 0) {
    return null;
  }
  if (size === 1) {
    return sortedValues[0];
  }

  // Fractional index into the sorted array for the requested percentile.
  const rank = (percentileValue / 100) * (size - 1);
  const below = Math.floor(rank);
  const above = Math.ceil(rank);
  if (below === above) {
    return sortedValues[below];
  }

  const fraction = rank - below;
  const blended = sortedValues[below] * (1 - fraction) + sortedValues[above] * fraction;
  return Math.round(blended * 1000) / 1000;
}
|
||||
|
||||
function checkAgentTurnCorrectness(violations, turns, expectedText) {
|
||||
for (const turn of turns) {
|
||||
if (turn.expectedFailure === true) {
|
||||
@ -860,6 +927,28 @@ function checkAgentTurnThresholds(violations, turns, selected, thresholds, recor
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Compare the aggregate agent-turn statistics against their thresholds.
 *
 * For each of totalTurnMs, preProviderMs and providerFinalMs, the p95 value
 * is checked first and the max value second; each check is delegated to
 * checkAggregateThreshold, using the metric name as the key into thresholds.
 *
 * @param {Array<object>} violations Collector passed through to checkAggregateThreshold.
 * @param {object} stats Aggregate stats exposing totalTurnMs, preProviderMs and
 *   providerFinalMs summaries, each with p95 and max.
 * @param {object} thresholds Threshold values keyed by metric name.
 * @returns {void}
 */
function checkAgentTurnAggregateThresholds(violations, stats, thresholds) {
  // [stats group, statistic, metric/threshold key] in reporting order.
  const aggregateChecks = [
    ["totalTurnMs", "p95", "agentTurnP95Ms"],
    ["totalTurnMs", "max", "agentTurnMaxMs"],
    ["preProviderMs", "p95", "agentPreProviderP95Ms"],
    ["preProviderMs", "max", "agentPreProviderMaxMs"],
    ["providerFinalMs", "p95", "agentProviderFinalP95Ms"],
    ["providerFinalMs", "max", "agentProviderFinalMaxMs"]
  ];

  for (const [group, statistic, metric] of aggregateChecks) {
    checkAggregateThreshold(violations, stats[group][statistic], metric, thresholds[metric]);
  }
}
|
||||
|
||||
/**
 * Record an "agent-latency" violation when an aggregate metric exceeds its
 * threshold.
 *
 * Nothing is recorded when either the threshold or the measured value is not
 * a number, or when the measured value is less than or equal to the threshold.
 *
 * @param {Array<object>} violations Collector the violation is appended to.
 * @param {*} actual Measured value for the metric.
 * @param {string} metric Metric name used in the violation and its message.
 * @param {*} threshold Upper bound configured for the metric.
 * @returns {void}
 */
function checkAggregateThreshold(violations, actual, metric, threshold) {
  const comparable = typeof threshold === "number" && typeof actual === "number";
  if (!comparable || actual <= threshold) {
    return;
  }

  const violation = {
    kind: "agent-latency",
    metric,
    expected: `<= ${threshold}`,
    actual,
    message: `${metric} ${actual}ms exceeded threshold ${threshold}ms`
  };
  violations.push(violation);
}
|
||||
|
||||
function checkTurnThreshold(violations, turn, metric, threshold, message) {
|
||||
if (!turn || typeof threshold !== "number" || typeof turn[metric] !== "number" || turn[metric] <= threshold) {
|
||||
return;
|
||||
|
||||
@ -9,9 +9,12 @@ export const PERFORMANCE_METRICS = [
|
||||
{ id: "openclawEventLoopMaxMs", title: "Event Loop Max", unit: "ms", regressionKey: "eventLoopRegressionPercent" },
|
||||
{ id: "eventLoopDelayMs", title: "Event Loop Delay", unit: "ms", regressionKey: "eventLoopRegressionPercent" },
|
||||
{ id: "agentTurnMs", title: "Agent Turn", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
|
||||
{ id: "agentTurnP95Ms", title: "Agent Turn p95", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
|
||||
{ id: "agentTurnMaxMs", title: "Agent Turn Max", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
|
||||
{ id: "coldAgentTurnMs", title: "Cold Agent Turn", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
|
||||
{ id: "warmAgentTurnMs", title: "Warm Agent Turn", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
|
||||
{ id: "agentColdWarmDeltaMs", title: "Cold/Warm Agent Delta", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
|
||||
{ id: "agentPreProviderP95Ms", title: "Pre-Provider p95", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
|
||||
{ id: "coldPreProviderMs", title: "Cold Pre-Provider", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
|
||||
{ id: "warmPreProviderMs", title: "Warm Pre-Provider", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
|
||||
{ id: "healthP95Ms", title: "Health p95", unit: "ms", regressionKey: "startupRegressionPercent" },
|
||||
|
||||
@ -144,6 +144,7 @@ export function renderMarkdownReport(report) {
|
||||
lines.push(`- Agent cold/warm: cold ${record.measurements.coldAgentTurnMs ?? "unknown"} ms; warm ${record.measurements.warmAgentTurnMs ?? "unknown"} ms; delta ${record.measurements.agentColdWarmDeltaMs ?? "unknown"} ms`);
|
||||
lines.push(`- Agent pre-provider: cold ${record.measurements.coldPreProviderMs ?? "unknown"} ms; warm ${record.measurements.warmPreProviderMs ?? "unknown"} ms; delta ${record.measurements.agentColdWarmPreProviderDeltaMs ?? "unknown"} ms`);
|
||||
lines.push(`- Agent provider final: cold ${record.measurements.coldProviderFinalMs ?? "unknown"} ms; warm ${record.measurements.warmProviderFinalMs ?? "unknown"} ms`);
|
||||
lines.push(`- Agent turn stats: count ${record.measurements.agentTurnCount}; p95 ${record.measurements.agentTurnP95Ms ?? "unknown"} ms; max ${record.measurements.agentTurnMaxMs ?? "unknown"} ms; pre-provider p95 ${record.measurements.agentPreProviderP95Ms ?? "unknown"} ms`);
|
||||
}
|
||||
if (record.measurements.agentProviderAttribution) {
|
||||
lines.push(`- Provider evidence: ${record.measurements.agentProviderRequestCount ?? 0} request(s); provider work ${record.measurements.agentProviderFinalMs ?? "unknown"} ms; pre-provider ${record.measurements.agentPreProviderMs ?? "unknown"} ms; post-provider ${record.measurements.agentPostProviderMs ?? "unknown"} ms`);
|
||||
@ -549,6 +550,13 @@ function summarizeMeasurements(measurements) {
|
||||
providerFirstByteLatencyMs: measurements.providerFirstByteLatencyMs ?? null,
|
||||
agentTurnCount: measurements.agentTurnCount ?? null,
|
||||
agentTurns: measurements.agentTurns ?? null,
|
||||
agentTurnStats: measurements.agentTurnStats ?? null,
|
||||
agentTurnP95Ms: measurements.agentTurnP95Ms ?? null,
|
||||
agentTurnMaxMs: measurements.agentTurnMaxMs ?? null,
|
||||
agentPreProviderP95Ms: measurements.agentPreProviderP95Ms ?? null,
|
||||
agentPreProviderMaxMs: measurements.agentPreProviderMaxMs ?? null,
|
||||
agentProviderFinalP95Ms: measurements.agentProviderFinalP95Ms ?? null,
|
||||
agentProviderFinalMaxMs: measurements.agentProviderFinalMaxMs ?? null,
|
||||
coldAgentTurnMs: measurements.coldAgentTurnMs ?? null,
|
||||
warmAgentTurnMs: measurements.warmAgentTurnMs ?? null,
|
||||
agentColdWarmDeltaMs: measurements.agentColdWarmDeltaMs ?? null,
|
||||
|
||||
@ -1226,6 +1226,9 @@ function agentTurnBreakdownCheck() {
|
||||
agent: { expectedText: "KOVA_AGENT_OK" },
|
||||
thresholds: {}
|
||||
}, { surface: { thresholds: {} }, targetPlan: { kind: "local-build" } });
|
||||
assertEqual(record.measurements.agentTurnStats?.count, 1, "agent turn stats count");
|
||||
assertEqual(record.measurements.agentTurnP95Ms, 1000, "agent turn p95");
|
||||
assertEqual(record.measurements.agentPreProviderP95Ms, 200, "agent pre-provider p95");
|
||||
const rendered = renderMarkdownReport({
|
||||
generatedAt: "2026-05-01T00:00:00.000Z",
|
||||
runId: "self-check-agent-turn-breakdown",
|
||||
@ -1237,6 +1240,7 @@ function agentTurnBreakdownCheck() {
|
||||
});
|
||||
assertEqual(rendered.includes("breakdown:"), true, "markdown includes agent turn breakdown");
|
||||
assertEqual(rendered.includes("models.catalog.* 70ms"), true, "markdown includes source span evidence");
|
||||
assertEqual(rendered.includes("Agent turn stats:"), true, "markdown includes agent turn stats");
|
||||
assertEqual(
|
||||
summarizeAgentTurnBreakdownForMarkdown(normal.breakdown).includes("unknown 15ms"),
|
||||
true,
|
||||
|
||||
@ -2,10 +2,10 @@
|
||||
"id": "agent-message",
|
||||
"title": "Agent Message",
|
||||
"ownerArea": "agent-runtime",
|
||||
"description": "Send cold and warm local OpenClaw agent messages and verify response latency, provider routing, gateway health, memory, and logs.",
|
||||
"description": "Send cold, warm, and repeated local OpenClaw agent messages and verify response latency, provider routing, gateway health, memory, and logs.",
|
||||
"requiredStates": ["mock-openai-provider"],
|
||||
"targetKinds": ["npm", "channel", "runtime", "local-build"],
|
||||
"requiredMetrics": ["agentTurnMs", "coldAgentTurnMs", "warmAgentTurnMs", "agentColdWarmDeltaMs", "coldPreProviderMs", "warmPreProviderMs", "healthP95Ms", "peakRssMb", "providerTimeoutMentions", "pluginLoadFailures"],
|
||||
"requiredMetrics": ["agentTurnMs", "agentTurnP95Ms", "agentTurnMaxMs", "coldAgentTurnMs", "warmAgentTurnMs", "agentColdWarmDeltaMs", "coldPreProviderMs", "warmPreProviderMs", "agentPreProviderP95Ms", "healthP95Ms", "peakRssMb", "providerTimeoutMentions", "pluginLoadFailures"],
|
||||
"processRoles": ["gateway", "command-tree", "agent-cli", "agent-process", "mock-provider"],
|
||||
"thresholds": { "agentTurnMs": 45000, "coldAgentTurnMs": 45000, "warmAgentTurnMs": 15000, "coldWarmDeltaMs": 30000, "preProviderMs": 10000, "providerFinalMs": 3000, "healthP95Ms": 1000, "peakRssMb": 900 },
|
||||
"roleThresholds": {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user