feat: add long agent session coverage

2026-05-01 08:26:39 +01:00 · 2026-05-01 08:26:39 +01:00 · d601bde7f8
commit d601bde7f8
parent 97454f3f99
9 changed files with 218 additions and 2 deletions
--- a/profiles/exhaustive.json
+++ b/profiles/exhaustive.json
@ -26,6 +26,7 @@
    { "scenario": "provider-models", "state": "model-auth-configured" },
    { "scenario": "provider-models", "state": "model-auth-missing" },
    { "scenario": "agent-cold-warm-message", "state": "mock-openai-provider", "timeoutMs": 180000 },
+    { "scenario": "agent-long-session", "state": "mock-openai-provider", "timeoutMs": 360000 },
    { "scenario": "agent-provider-slow", "state": "mock-openai-provider", "timeoutMs": 180000 },
    { "scenario": "agent-provider-timeout", "state": "mock-openai-provider", "timeoutMs": 180000 },
    { "scenario": "agent-provider-malformed", "state": "mock-openai-provider", "timeoutMs": 180000 },
--- a/profiles/release.json
+++ b/profiles/release.json
@ -98,6 +98,7 @@
      { "scenario": "agent-provider-malformed", "state": "mock-openai-provider" },
      { "scenario": "agent-provider-streaming-stall", "state": "mock-openai-provider" },
      { "scenario": "agent-provider-recovery", "state": "mock-openai-provider" },
+      { "scenario": "agent-long-session", "state": "mock-openai-provider" },
      { "scenario": "failure-injection", "state": "broken-plugin-deps" },
      { "scenario": "soak", "state": "large-workspace" },
      { "scenario": "cross-platform-smoke", "state": "slow-filesystem" }
@ -191,6 +192,11 @@
      "state": "mock-openai-provider",
      "timeoutMs": 240000
    },
+    {
+      "scenario": "agent-long-session",
+      "state": "mock-openai-provider",
+      "timeoutMs": 360000
+    },
    {
      "scenario": "dashboard-readiness",
      "state": "fresh"
--- a/profiles/soak.json
+++ b/profiles/soak.json
@ -32,6 +32,11 @@
      "scenario": "agent-cold-warm-message",
      "state": "mock-openai-provider",
      "timeoutMs": 180000
+    },
+    {
+      "scenario": "agent-long-session",
+      "state": "mock-openai-provider",
+      "timeoutMs": 360000
    }
  ]
 }
--- a/scenarios/agent-long-session.json
+++ b/scenarios/agent-long-session.json
@ -0,0 +1,100 @@
+{
+  "id": "agent-long-session",
+  "surface": "agent-message",
+  "title": "Agent Long Session",
+  "objective": "Send repeated simple messages through one OpenClaw session to catch latency drift, provider routing drift, resource growth, health degradation, and child-process leaks during normal assistant use.",
+  "tags": ["agent", "message", "latency", "providers", "soak", "long-session"],
+  "timeoutMs": 360000,
+  "agent": {
+    "expectedText": "KOVA_AGENT_OK"
+  },
+  "thresholds": {
+    "gatewayReadyMs": 30000,
+    "agentTurnMs": 45000,
+    "agentTurnP95Ms": 30000,
+    "agentTurnMaxMs": 45000,
+    "preProviderMs": 10000,
+    "agentPreProviderP95Ms": 8000,
+    "agentPreProviderMaxMs": 12000,
+    "providerFinalMs": 3000,
+    "agentProviderFinalP95Ms": 3000,
+    "preProviderDominanceRatio": 0.8,
+    "agentContainmentHealthFailures": 0,
+    "agentProcessLeaks": 0,
+    "statusMs": 10000,
+    "peakRssMb": 900,
+    "missingDependencyErrors": 0,
+    "pluginLoadFailures": 0,
+    "providerTimeoutMentions": 0
+  },
+  "phases": [
+    {
+      "id": "provision",
+      "title": "Provision Long Session Env",
+      "intent": "Start a disposable OpenClaw gateway before wiring the model provider and sending repeated messages.",
+      "commands": ["ocm start {env} {startSelector} --json"],
+      "evidence": ["gateway port", "runtime binding", "startup readiness"]
+    },
+    {
+      "id": "cold-session-turn",
+      "title": "Cold Session Turn",
+      "intent": "Send the first simple message through OpenClaw's real local embedded agent CLI command in a fresh session.",
+      "commands": [
+        "ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
+      ],
+      "evidence": ["cold command duration", "assistant text", "provider request timing", "role resource samples"]
+    },
+    {
+      "id": "warm-session-turn",
+      "title": "Warm Session Turn",
+      "intent": "Send a warm follow-up in the same session to establish cache behavior after cold discovery work.",
+      "commands": [
+        "ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
+      ],
+      "evidence": ["warm command duration", "assistant text", "provider request timing", "cold/warm delta"]
+    },
+    {
+      "id": "session-turn-3",
+      "title": "Session Turn 3",
+      "intent": "Continue the same OpenClaw session to catch repeated-turn latency or resource drift.",
+      "commands": [
+        "ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
+      ],
+      "evidence": ["turn duration", "assistant text", "provider request timing", "role resource samples"]
+    },
+    {
+      "id": "session-turn-4",
+      "title": "Session Turn 4",
+      "intent": "Continue the same OpenClaw session and verify the assistant path remains stable.",
+      "commands": [
+        "ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
+      ],
+      "evidence": ["turn duration", "assistant text", "provider request timing", "role resource samples"]
+    },
+    {
+      "id": "session-turn-5",
+      "title": "Session Turn 5",
+      "intent": "Continue repeated assistant use and catch process or memory growth between turns.",
+      "commands": [
+        "ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
+      ],
+      "evidence": ["turn duration", "assistant text", "provider request timing", "role resource samples"]
+    },
+    {
+      "id": "session-turn-6",
+      "title": "Session Turn 6",
+      "intent": "Send a final repeated assistant message before checking gateway health and process cleanup.",
+      "commands": [
+        "ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
+      ],
+      "evidence": ["turn duration", "assistant text", "provider request timing", "role resource samples"]
+    },
+    {
+      "id": "post-session-health",
+      "title": "Post-Session Gateway Health",
+      "intent": "Verify the gateway remains responsive after repeated agent turns and capture provider/plugin diagnostics.",
+      "commands": ["ocm @{env} -- status", "ocm logs {env} --tail 500 --raw"],
+      "evidence": ["gateway status", "provider logs", "plugin errors", "memory after repeated turns", "process leak summary"]
+    }
+  ]
+}
--- a/src/evaluator.mjs
+++ b/src/evaluator.mjs
@ -44,6 +44,7 @@ export function evaluateRecord(record, scenario, options = {}) {
  const coldAgentTurn = selectAgentTurn(agentTurns, "cold") ?? agentTurns[0] ?? null;
  const warmAgentTurn = selectAgentTurn(agentTurns, "warm") ?? agentTurns[1] ?? null;
  const providerTurn = collectSlowestProviderTurn(agentTurns);
+  const agentTurnStats = summarizeAgentTurnStats(agentTurns);
  const agentTurnMs = maxNullable(maxDurationWhere(allResults, isAgentMessageCommand), maxTurnDuration(agentTurns));
  const agentResponseOk = agentTurns.length === 0 ? null : agentTurns.every((turn) => turn.responseOk === true);
  const agentProviderSimulation = evaluateProviderSimulation({ turns: agentTurns, scenario, record, thresholds });
@ -302,6 +303,7 @@ export function evaluateRecord(record, scenario, options = {}) {
  }
  checkAgentTurnCorrectness(violations, agentTurns, scenario.agent?.expectedText ?? null);
  checkAgentTurnThresholds(violations, agentTurns, { coldAgentTurn, warmAgentTurn, providerTurn, agentLatencyDiagnosis }, thresholds, record);
+  checkAgentTurnAggregateThresholds(violations, agentTurnStats, thresholds);
  checkProviderSimulation(violations, agentProviderSimulation);
  checkAgentFailureContainment(violations, agentFailureContainment);

@ -318,6 +320,16 @@ export function evaluateRecord(record, scenario, options = {}) {
    agentResponseOk,
    agentTurnCount: agentTurns.length,
    agentTurns,
+    agentTurnStats,
+    agentTurnMedianMs: agentTurnStats.totalTurnMs.median,
+    agentTurnP95Ms: agentTurnStats.totalTurnMs.p95,
+    agentTurnMaxMs: agentTurnStats.totalTurnMs.max,
+    agentPreProviderMedianMs: agentTurnStats.preProviderMs.median,
+    agentPreProviderP95Ms: agentTurnStats.preProviderMs.p95,
+    agentPreProviderMaxMs: agentTurnStats.preProviderMs.max,
+    agentProviderFinalMedianMs: agentTurnStats.providerFinalMs.median,
+    agentProviderFinalP95Ms: agentTurnStats.providerFinalMs.p95,
+    agentProviderFinalMaxMs: agentTurnStats.providerFinalMs.max,
    coldAgentTurnMs: coldAgentTurn?.totalTurnMs ?? null,
    warmAgentTurnMs: warmAgentTurn?.totalTurnMs ?? null,
    agentColdWarmDeltaMs: delta(coldAgentTurn?.totalTurnMs, warmAgentTurn?.totalTurnMs),
@ -606,6 +618,61 @@ function maxTurnDuration(turns) {
  return durations.length === 0 ? null : Math.max(...durations);
 }

+function summarizeAgentTurnStats(turns) { // Aggregate per-turn timing fields and counters across all agent turns into one stats object.
+  return {
+    schemaVersion: "kova.agentTurnStats.v1",
+    count: turns.length, // counts every turn, including turns whose timing fields are missing
+    totalTurnMs: summarizeNumericField(turns, "totalTurnMs"),
+    preProviderMs: summarizeNumericField(turns, "preProviderMs"),
+    providerFinalMs: summarizeNumericField(turns, "providerFinalMs"),
+    postProviderMs: summarizeNumericField(turns, "postProviderMs"),
+    firstByteLatencyMs: summarizeNumericField(turns, "firstByteLatencyMs"),
+    processLeakCount: turns.reduce((sum, turn) => sum + (turn.processLeakCount ?? 0), 0), // a turn with no processLeakCount contributes 0 to the total
+    missingProviderRequestCount: turns.filter((turn) => turn.missingProviderRequest === true).length, // strict === true: other truthy values are not counted
+    responseOkCount: turns.filter((turn) => turn.responseOk === true).length // strict === true: null/undefined responseOk is not counted as ok
+  };
+}
+
+function summarizeNumericField(items, field) { // Summarize one numeric field across items as { count, min, median, p95, max }.
+  const values = items
+    .map((item) => item?.[field]) // optional chaining tolerates null/undefined entries in items
+    .filter((value) => typeof value === "number" && Number.isFinite(value)) // drop missing, non-numeric, NaN and infinite samples
+    .toSorted((left, right) => left - right); // ascending numeric order; min/max and percentile() below rely on it
+  if (values.length === 0) { // no usable samples: report nulls instead of misleading zeros
+    return {
+      count: 0,
+      min: null,
+      median: null,
+      p95: null,
+      max: null
+    };
+  }
+  return {
+    count: values.length, // number of usable samples; may be smaller than items.length
+    min: values[0],
+    median: percentile(values, 50),
+    p95: percentile(values, 95),
+    max: values.at(-1)
+  };
+}
+
+function percentile(sortedValues, percentileValue) { // Percentile (percentileValue on a 0-100 scale) by linear interpolation; expects sortedValues already sorted ascending.
+  if (sortedValues.length === 0) {
+    return null;
+  }
+  if (sortedValues.length === 1) {
+    return sortedValues[0];
+  }
+  const position = (percentileValue / 100) * (sortedValues.length - 1); // fractional index into the sorted array
+  const lower = Math.floor(position);
+  const upper = Math.ceil(position);
+  if (lower === upper) { // position is an exact index, so no interpolation is needed
+    return sortedValues[lower];
+  }
+  const weight = position - lower; // share of the upper neighbour in the blend
+  return Math.round((sortedValues[lower] * (1 - weight) + sortedValues[upper] * weight) * 1000) / 1000; // blend the two neighbours, then round to 3 decimal places
+}
+
 function checkAgentTurnCorrectness(violations, turns, expectedText) {
  for (const turn of turns) {
    if (turn.expectedFailure === true) {
@ -860,6 +927,28 @@ function checkAgentTurnThresholds(violations, turns, selected, thresholds, recor
  }
 }

+function checkAgentTurnAggregateThresholds(violations, stats, thresholds) { // Check p95 and max of total, pre-provider and provider-final turn timings against the same-named thresholds.
+  checkAggregateThreshold(violations, stats.totalTurnMs.p95, "agentTurnP95Ms", thresholds.agentTurnP95Ms);
+  checkAggregateThreshold(violations, stats.totalTurnMs.max, "agentTurnMaxMs", thresholds.agentTurnMaxMs);
+  checkAggregateThreshold(violations, stats.preProviderMs.p95, "agentPreProviderP95Ms", thresholds.agentPreProviderP95Ms);
+  checkAggregateThreshold(violations, stats.preProviderMs.max, "agentPreProviderMaxMs", thresholds.agentPreProviderMaxMs);
+  checkAggregateThreshold(violations, stats.providerFinalMs.p95, "agentProviderFinalP95Ms", thresholds.agentProviderFinalP95Ms);
+  checkAggregateThreshold(violations, stats.providerFinalMs.max, "agentProviderFinalMaxMs", thresholds.agentProviderFinalMaxMs);
+}
+
+function checkAggregateThreshold(violations, actual, metric, threshold) { // Append an "agent-latency" violation to violations when actual exceeds threshold.
+  if (typeof threshold !== "number" || typeof actual !== "number" || actual <= threshold) { // skip when either value is not a number, or when actual is within the limit (equal passes)
+    return;
+  }
+  violations.push({
+    kind: "agent-latency",
+    metric,
+    expected: `<= ${threshold}`,
+    actual,
+    message: `${metric} ${actual}ms exceeded threshold ${threshold}ms`
+  });
+}
+
 function checkTurnThreshold(violations, turn, metric, threshold, message) {
  if (!turn || typeof threshold !== "number" || typeof turn[metric] !== "number" || turn[metric] <= threshold) {
    return;
--- a/src/performance/stats.mjs
+++ b/src/performance/stats.mjs
@ -9,9 +9,12 @@ export const PERFORMANCE_METRICS = [
  { id: "openclawEventLoopMaxMs", title: "Event Loop Max", unit: "ms", regressionKey: "eventLoopRegressionPercent" },
  { id: "eventLoopDelayMs", title: "Event Loop Delay", unit: "ms", regressionKey: "eventLoopRegressionPercent" },
  { id: "agentTurnMs", title: "Agent Turn", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
+  { id: "agentTurnP95Ms", title: "Agent Turn p95", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
+  { id: "agentTurnMaxMs", title: "Agent Turn Max", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
  { id: "coldAgentTurnMs", title: "Cold Agent Turn", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
  { id: "warmAgentTurnMs", title: "Warm Agent Turn", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
  { id: "agentColdWarmDeltaMs", title: "Cold/Warm Agent Delta", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
+  { id: "agentPreProviderP95Ms", title: "Pre-Provider p95", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
  { id: "coldPreProviderMs", title: "Cold Pre-Provider", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
  { id: "warmPreProviderMs", title: "Warm Pre-Provider", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
  { id: "healthP95Ms", title: "Health p95", unit: "ms", regressionKey: "startupRegressionPercent" },
--- a/src/report.mjs
+++ b/src/report.mjs
@ -144,6 +144,7 @@ export function renderMarkdownReport(report) {
        lines.push(`- Agent cold/warm: cold ${record.measurements.coldAgentTurnMs ?? "unknown"} ms; warm ${record.measurements.warmAgentTurnMs ?? "unknown"} ms; delta ${record.measurements.agentColdWarmDeltaMs ?? "unknown"} ms`);
        lines.push(`- Agent pre-provider: cold ${record.measurements.coldPreProviderMs ?? "unknown"} ms; warm ${record.measurements.warmPreProviderMs ?? "unknown"} ms; delta ${record.measurements.agentColdWarmPreProviderDeltaMs ?? "unknown"} ms`);
        lines.push(`- Agent provider final: cold ${record.measurements.coldProviderFinalMs ?? "unknown"} ms; warm ${record.measurements.warmProviderFinalMs ?? "unknown"} ms`);
+        lines.push(`- Agent turn stats: count ${record.measurements.agentTurnCount}; p95 ${record.measurements.agentTurnP95Ms ?? "unknown"} ms; max ${record.measurements.agentTurnMaxMs ?? "unknown"} ms; pre-provider p95 ${record.measurements.agentPreProviderP95Ms ?? "unknown"} ms`);
      }
      if (record.measurements.agentProviderAttribution) {
        lines.push(`- Provider evidence: ${record.measurements.agentProviderRequestCount ?? 0} request(s); provider work ${record.measurements.agentProviderFinalMs ?? "unknown"} ms; pre-provider ${record.measurements.agentPreProviderMs ?? "unknown"} ms; post-provider ${record.measurements.agentPostProviderMs ?? "unknown"} ms`);
@ -549,6 +550,13 @@ function summarizeMeasurements(measurements) {
    providerFirstByteLatencyMs: measurements.providerFirstByteLatencyMs ?? null,
    agentTurnCount: measurements.agentTurnCount ?? null,
    agentTurns: measurements.agentTurns ?? null,
+    agentTurnStats: measurements.agentTurnStats ?? null,
+    agentTurnP95Ms: measurements.agentTurnP95Ms ?? null,
+    agentTurnMaxMs: measurements.agentTurnMaxMs ?? null,
+    agentPreProviderP95Ms: measurements.agentPreProviderP95Ms ?? null,
+    agentPreProviderMaxMs: measurements.agentPreProviderMaxMs ?? null,
+    agentProviderFinalP95Ms: measurements.agentProviderFinalP95Ms ?? null,
+    agentProviderFinalMaxMs: measurements.agentProviderFinalMaxMs ?? null,
    coldAgentTurnMs: measurements.coldAgentTurnMs ?? null,
    warmAgentTurnMs: measurements.warmAgentTurnMs ?? null,
    agentColdWarmDeltaMs: measurements.agentColdWarmDeltaMs ?? null,
--- a/src/selfcheck.mjs
+++ b/src/selfcheck.mjs
@ -1226,6 +1226,9 @@ function agentTurnBreakdownCheck() {
      agent: { expectedText: "KOVA_AGENT_OK" },
      thresholds: {}
    }, { surface: { thresholds: {} }, targetPlan: { kind: "local-build" } });
+    assertEqual(record.measurements.agentTurnStats?.count, 1, "agent turn stats count");
+    assertEqual(record.measurements.agentTurnP95Ms, 1000, "agent turn p95");
+    assertEqual(record.measurements.agentPreProviderP95Ms, 200, "agent pre-provider p95");
    const rendered = renderMarkdownReport({
      generatedAt: "2026-05-01T00:00:00.000Z",
      runId: "self-check-agent-turn-breakdown",
@ -1237,6 +1240,7 @@ function agentTurnBreakdownCheck() {
    });
    assertEqual(rendered.includes("breakdown:"), true, "markdown includes agent turn breakdown");
    assertEqual(rendered.includes("models.catalog.* 70ms"), true, "markdown includes source span evidence");
+    assertEqual(rendered.includes("Agent turn stats:"), true, "markdown includes agent turn stats");
    assertEqual(
      summarizeAgentTurnBreakdownForMarkdown(normal.breakdown).includes("unknown 15ms"),
      true,
--- a/surfaces/agent-message.json
+++ b/surfaces/agent-message.json
@ -2,10 +2,10 @@
  "id": "agent-message",
  "title": "Agent Message",
  "ownerArea": "agent-runtime",
-  "description": "Send cold and warm local OpenClaw agent messages and verify response latency, provider routing, gateway health, memory, and logs.",
+  "description": "Send cold, warm, and repeated local OpenClaw agent messages and verify response latency, provider routing, gateway health, memory, and logs.",
  "requiredStates": ["mock-openai-provider"],
  "targetKinds": ["npm", "channel", "runtime", "local-build"],
-  "requiredMetrics": ["agentTurnMs", "coldAgentTurnMs", "warmAgentTurnMs", "agentColdWarmDeltaMs", "coldPreProviderMs", "warmPreProviderMs", "healthP95Ms", "peakRssMb", "providerTimeoutMentions", "pluginLoadFailures"],
+  "requiredMetrics": ["agentTurnMs", "agentTurnP95Ms", "agentTurnMaxMs", "coldAgentTurnMs", "warmAgentTurnMs", "agentColdWarmDeltaMs", "coldPreProviderMs", "warmPreProviderMs", "agentPreProviderP95Ms", "healthP95Ms", "peakRssMb", "providerTimeoutMentions", "pluginLoadFailures"],
  "processRoles": ["gateway", "command-tree", "agent-cli", "agent-process", "mock-provider"],
  "thresholds": { "agentTurnMs": 45000, "coldAgentTurnMs": 45000, "warmAgentTurnMs": 15000, "coldWarmDeltaMs": 30000, "preProviderMs": 10000, "providerFinalMs": 3000, "healthP95Ms": 1000, "peakRssMb": 900 },
  "roleThresholds": {