From d601bde7f8b85d77db565b6846133adea6aae14c Mon Sep 17 00:00:00 2001
From: Shakker <shakkerdroid@gmail.com>
Date: Fri, 1 May 2026 08:26:39 +0100
Subject: [PATCH] feat: add long agent session coverage

---
 profiles/exhaustive.json          |   1 +
 profiles/release.json             |   6 ++
 profiles/soak.json                |   5 ++
 scenarios/agent-long-session.json | 100 ++++++++++++++++++++++++++++++
 src/evaluator.mjs                 |  89 ++++++++++++++++++++++++++
 src/performance/stats.mjs         |   3 +
 src/report.mjs                    |   8 +++
 src/selfcheck.mjs                 |   4 ++
 surfaces/agent-message.json       |   4 +-
 9 files changed, 218 insertions(+), 2 deletions(-)
 create mode 100644 scenarios/agent-long-session.json

diff --git a/profiles/exhaustive.json b/profiles/exhaustive.json
index c73e76f..2865da1 100644
--- a/profiles/exhaustive.json
+++ b/profiles/exhaustive.json
@@ -26,6 +26,7 @@
     { "scenario": "provider-models", "state": "model-auth-configured" },
     { "scenario": "provider-models", "state": "model-auth-missing" },
     { "scenario": "agent-cold-warm-message", "state": "mock-openai-provider", "timeoutMs": 180000 },
+    { "scenario": "agent-long-session", "state": "mock-openai-provider", "timeoutMs": 360000 },
     { "scenario": "agent-provider-slow", "state": "mock-openai-provider", "timeoutMs": 180000 },
     { "scenario": "agent-provider-timeout", "state": "mock-openai-provider", "timeoutMs": 180000 },
     { "scenario": "agent-provider-malformed", "state": "mock-openai-provider", "timeoutMs": 180000 },
diff --git a/profiles/release.json b/profiles/release.json
index a5ab40a..eabc1cc 100644
--- a/profiles/release.json
+++ b/profiles/release.json
@@ -98,6 +98,7 @@
       { "scenario": "agent-provider-malformed", "state": "mock-openai-provider" },
       { "scenario": "agent-provider-streaming-stall", "state": "mock-openai-provider" },
       { "scenario": "agent-provider-recovery", "state": "mock-openai-provider" },
+      { "scenario": "agent-long-session", "state": "mock-openai-provider" },
       { "scenario": "failure-injection", "state": "broken-plugin-deps" },
       { "scenario": "soak", "state": "large-workspace" },
       { "scenario": "cross-platform-smoke", "state": "slow-filesystem" }
@@ -191,6 +192,11 @@
       "state": "mock-openai-provider",
       "timeoutMs": 240000
     },
+    {
+      "scenario": "agent-long-session",
+      "state": "mock-openai-provider",
+      "timeoutMs": 360000
+    },
     {
       "scenario": "dashboard-readiness",
       "state": "fresh"
diff --git a/profiles/soak.json b/profiles/soak.json
index a252ba9..00416a4 100644
--- a/profiles/soak.json
+++ b/profiles/soak.json
@@ -32,6 +32,11 @@
       "scenario": "agent-cold-warm-message",
       "state": "mock-openai-provider",
       "timeoutMs": 180000
+    },
+    {
+      "scenario": "agent-long-session",
+      "state": "mock-openai-provider",
+      "timeoutMs": 360000
     }
   ]
 }
diff --git a/scenarios/agent-long-session.json b/scenarios/agent-long-session.json
new file mode 100644
index 0000000..1046a91
--- /dev/null
+++ b/scenarios/agent-long-session.json
@@ -0,0 +1,100 @@
+{
+  "id": "agent-long-session",
+  "surface": "agent-message",
+  "title": "Agent Long Session",
+  "objective": "Send repeated simple messages through one OpenClaw session to catch latency drift, provider routing drift, resource growth, health degradation, and child-process leaks during normal assistant use.",
+  "tags": ["agent", "message", "latency", "providers", "soak", "long-session"],
+  "timeoutMs": 360000,
+  "agent": {
+    "expectedText": "KOVA_AGENT_OK"
+  },
+  "thresholds": {
+    "gatewayReadyMs": 30000,
+    "agentTurnMs": 45000,
+    "agentTurnP95Ms": 30000,
+    "agentTurnMaxMs": 45000,
+    "preProviderMs": 10000,
+    "agentPreProviderP95Ms": 8000,
+    "agentPreProviderMaxMs": 12000,
+    "providerFinalMs": 3000,
+    "agentProviderFinalP95Ms": 3000,
+    "preProviderDominanceRatio": 0.8,
+    "agentContainmentHealthFailures": 0,
+    "agentProcessLeaks": 0,
+    "statusMs": 10000,
+    "peakRssMb": 900,
+    "missingDependencyErrors": 0,
+    "pluginLoadFailures": 0,
+    "providerTimeoutMentions": 0
+  },
+  "phases": [
+    {
+      "id": "provision",
+      "title": "Provision Long Session Env",
+      "intent": "Start a disposable OpenClaw gateway before wiring the model provider and sending repeated messages.",
+      "commands": ["ocm start {env} {startSelector} --json"],
+      "evidence": ["gateway port", "runtime binding", "startup readiness"]
+    },
+    {
+      "id": "cold-session-turn",
+      "title": "Cold Session Turn",
+      "intent": "Send the first simple message through OpenClaw's real local embedded agent CLI command in a fresh session.",
+      "commands": [
+        "ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
+      ],
+      "evidence": ["cold command duration", "assistant text", "provider request timing", "role resource samples"]
+    },
+    {
+      "id": "warm-session-turn",
+      "title": "Warm Session Turn",
+      "intent": "Send a warm follow-up in the same session to establish cache behavior after cold discovery work.",
+      "commands": [
+        "ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
+      ],
+      "evidence": ["warm command duration", "assistant text", "provider request timing", "cold/warm delta"]
+    },
+    {
+      "id": "session-turn-3",
+      "title": "Session Turn 3",
+      "intent": "Continue the same OpenClaw session to catch repeated-turn latency or resource drift.",
+      "commands": [
+        "ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
+      ],
+      "evidence": ["turn duration", "assistant text", "provider request timing", "role resource samples"]
+    },
+    {
+      "id": "session-turn-4",
+      "title": "Session Turn 4",
+      "intent": "Continue the same OpenClaw session and verify the assistant path remains stable.",
+      "commands": [
+        "ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
+      ],
+      "evidence": ["turn duration", "assistant text", "provider request timing", "role resource samples"]
+    },
+    {
+      "id": "session-turn-5",
+      "title": "Session Turn 5",
+      "intent": "Continue repeated assistant use and catch process or memory growth between turns.",
+      "commands": [
+        "ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
+      ],
+      "evidence": ["turn duration", "assistant text", "provider request timing", "role resource samples"]
+    },
+    {
+      "id": "session-turn-6",
+      "title": "Session Turn 6",
+      "intent": "Send a final repeated assistant message before checking gateway health and process cleanup.",
+      "commands": [
+        "ocm @{env} -- agent --local --agent main --session-id kova-agent-long-session --message 'Reply with exact ASCII text KOVA_AGENT_OK only.' --thinking off --timeout 120 --json"
+      ],
+      "evidence": ["turn duration", "assistant text", "provider request timing", "role resource samples"]
+    },
+    {
+      "id": "post-session-health",
+      "title": "Post-Session Gateway Health",
+      "intent": "Verify the gateway remains responsive after repeated agent turns and capture provider/plugin diagnostics.",
+      "commands": ["ocm @{env} -- status", "ocm logs {env} --tail 500 --raw"],
+      "evidence": ["gateway status", "provider logs", "plugin errors", "memory after repeated turns", "process leak summary"]
+    }
+  ]
+}
diff --git a/src/evaluator.mjs b/src/evaluator.mjs
index 09211b9..0d59fbe 100644
--- a/src/evaluator.mjs
+++ b/src/evaluator.mjs
@@ -44,6 +44,7 @@ export function evaluateRecord(record, scenario, options = {}) {
   const coldAgentTurn = selectAgentTurn(agentTurns, "cold") ?? agentTurns[0] ?? null;
   const warmAgentTurn = selectAgentTurn(agentTurns, "warm") ?? agentTurns[1] ?? null;
   const providerTurn = collectSlowestProviderTurn(agentTurns);
+  const agentTurnStats = summarizeAgentTurnStats(agentTurns);
   const agentTurnMs = maxNullable(maxDurationWhere(allResults, isAgentMessageCommand), maxTurnDuration(agentTurns));
   const agentResponseOk = agentTurns.length === 0 ? null : agentTurns.every((turn) => turn.responseOk === true);
   const agentProviderSimulation = evaluateProviderSimulation({ turns: agentTurns, scenario, record, thresholds });
@@ -302,6 +303,7 @@ export function evaluateRecord(record, scenario, options = {}) {
   }
   checkAgentTurnCorrectness(violations, agentTurns, scenario.agent?.expectedText ?? null);
   checkAgentTurnThresholds(violations, agentTurns, { coldAgentTurn, warmAgentTurn, providerTurn, agentLatencyDiagnosis }, thresholds, record);
+  checkAgentTurnAggregateThresholds(violations, agentTurnStats, thresholds);
   checkProviderSimulation(violations, agentProviderSimulation);
   checkAgentFailureContainment(violations, agentFailureContainment);
 
@@ -318,6 +320,16 @@ export function evaluateRecord(record, scenario, options = {}) {
     agentResponseOk,
     agentTurnCount: agentTurns.length,
     agentTurns,
+    agentTurnStats,
+    agentTurnMedianMs: agentTurnStats.totalTurnMs.median,
+    agentTurnP95Ms: agentTurnStats.totalTurnMs.p95,
+    agentTurnMaxMs: agentTurnStats.totalTurnMs.max,
+    agentPreProviderMedianMs: agentTurnStats.preProviderMs.median,
+    agentPreProviderP95Ms: agentTurnStats.preProviderMs.p95,
+    agentPreProviderMaxMs: agentTurnStats.preProviderMs.max,
+    agentProviderFinalMedianMs: agentTurnStats.providerFinalMs.median,
+    agentProviderFinalP95Ms: agentTurnStats.providerFinalMs.p95,
+    agentProviderFinalMaxMs: agentTurnStats.providerFinalMs.max,
     coldAgentTurnMs: coldAgentTurn?.totalTurnMs ?? null,
     warmAgentTurnMs: warmAgentTurn?.totalTurnMs ?? null,
     agentColdWarmDeltaMs: delta(coldAgentTurn?.totalTurnMs, warmAgentTurn?.totalTurnMs),
@@ -606,6 +618,61 @@ function maxTurnDuration(turns) {
   return durations.length === 0 ? null : Math.max(...durations);
 }
 
+function summarizeAgentTurnStats(turns) { // Builds one aggregate stats object for an array of agent turn records.
+  return {
+    schemaVersion: "kova.agentTurnStats.v1", // fixed version tag for this stats shape
+    count: turns.length, // counts every turn, including turns that lack the numeric fields below
+    totalTurnMs: summarizeNumericField(turns, "totalTurnMs"), // each summary is {count, min, median, p95, max}
+    preProviderMs: summarizeNumericField(turns, "preProviderMs"),
+    providerFinalMs: summarizeNumericField(turns, "providerFinalMs"),
+    postProviderMs: summarizeNumericField(turns, "postProviderMs"),
+    firstByteLatencyMs: summarizeNumericField(turns, "firstByteLatencyMs"),
+    processLeakCount: turns.reduce((sum, turn) => sum + (turn.processLeakCount ?? 0), 0), // sum over all turns; null/undefined contributes 0
+    missingProviderRequestCount: turns.filter((turn) => turn.missingProviderRequest === true).length, // only a strict `true` is counted
+    responseOkCount: turns.filter((turn) => turn.responseOk === true).length // only a strict `true` is counted
+  };
+}
+
+function summarizeNumericField(items, field) { // Summarizes the finite numeric values of `field` across `items` as {count, min, median, p95, max}.
+  const values = items
+    .map((item) => item?.[field]) // optional chaining: a null/undefined item yields undefined instead of throwing
+    .filter((value) => typeof value === "number" && Number.isFinite(value)) // drops non-numbers, NaN and +/-Infinity
+    .toSorted((left, right) => left - right); // ascending numeric order; percentile() indexes into this sorted copy
+  if (values.length === 0) { // no usable samples: report count 0 and null for every statistic
+    return {
+      count: 0,
+      min: null,
+      median: null,
+      p95: null,
+      max: null
+    };
+  }
+  return {
+    count: values.length, // number of finite samples, which may be fewer than items.length
+    min: values[0], // first element of the ascending array
+    median: percentile(values, 50),
+    p95: percentile(values, 95),
+    max: values.at(-1) // last element of the ascending array
+  };
+}
+
+function percentile(sortedValues, percentileValue) { // Returns the percentile of an ascending array, linearly interpolating between neighbouring elements; percentileValue is on a 0-100 scale.
+  if (sortedValues.length === 0) {
+    return null; // no samples, no percentile
+  }
+  if (sortedValues.length === 1) {
+    return sortedValues[0]; // a single sample is returned for any percentile
+  }
+  const position = (percentileValue / 100) * (sortedValues.length - 1); // fractional index into the array
+  const lower = Math.floor(position);
+  const upper = Math.ceil(position);
+  if (lower === upper) {
+    return sortedValues[lower]; // position is an exact index: return that sample unrounded
+  }
+  const weight = position - lower; // share given to the upper neighbour in the blend
+  return Math.round((sortedValues[lower] * (1 - weight) + sortedValues[upper] * weight) * 1000) / 1000; // interpolated value rounded to 3 decimal places
+}
+
 function checkAgentTurnCorrectness(violations, turns, expectedText) {
   for (const turn of turns) {
     if (turn.expectedFailure === true) {
@@ -860,6 +927,28 @@ function checkAgentTurnThresholds(violations, turns, selected, thresholds, recor
   }
 }
 
+function checkAgentTurnAggregateThresholds(violations, stats, thresholds) { // Compares the p95 and max of total, pre-provider and provider-final turn times with their thresholds.
+  checkAggregateThreshold(violations, stats.totalTurnMs.p95, "agentTurnP95Ms", thresholds.agentTurnP95Ms); // each call is a no-op when its threshold or value is not a number
+  checkAggregateThreshold(violations, stats.totalTurnMs.max, "agentTurnMaxMs", thresholds.agentTurnMaxMs);
+  checkAggregateThreshold(violations, stats.preProviderMs.p95, "agentPreProviderP95Ms", thresholds.agentPreProviderP95Ms);
+  checkAggregateThreshold(violations, stats.preProviderMs.max, "agentPreProviderMaxMs", thresholds.agentPreProviderMaxMs);
+  checkAggregateThreshold(violations, stats.providerFinalMs.p95, "agentProviderFinalP95Ms", thresholds.agentProviderFinalP95Ms);
+  checkAggregateThreshold(violations, stats.providerFinalMs.max, "agentProviderFinalMaxMs", thresholds.agentProviderFinalMaxMs);
+}
+
+function checkAggregateThreshold(violations, actual, metric, threshold) { // Appends an "agent-latency" violation to `violations` when `actual` is strictly greater than `threshold`.
+  if (typeof threshold !== "number" || typeof actual !== "number" || actual <= threshold) { // skip: threshold not a number, value not a number, or value within the limit
+    return;
+  }
+  violations.push({ // appends to the array passed in by the caller
+    kind: "agent-latency",
+    metric, // the metric name string supplied by the caller
+    expected: `<= ${threshold}`,
+    actual,
+    message: `${metric} ${actual}ms exceeded threshold ${threshold}ms`
+  });
+}
+
 function checkTurnThreshold(violations, turn, metric, threshold, message) {
   if (!turn || typeof threshold !== "number" || typeof turn[metric] !== "number" || turn[metric] <= threshold) {
     return;
diff --git a/src/performance/stats.mjs b/src/performance/stats.mjs
index bc0402e..361439b 100644
--- a/src/performance/stats.mjs
+++ b/src/performance/stats.mjs
@@ -9,9 +9,12 @@ export const PERFORMANCE_METRICS = [
   { id: "openclawEventLoopMaxMs", title: "Event Loop Max", unit: "ms", regressionKey: "eventLoopRegressionPercent" },
   { id: "eventLoopDelayMs", title: "Event Loop Delay", unit: "ms", regressionKey: "eventLoopRegressionPercent" },
   { id: "agentTurnMs", title: "Agent Turn", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
+  { id: "agentTurnP95Ms", title: "Agent Turn p95", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
+  { id: "agentTurnMaxMs", title: "Agent Turn Max", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
   { id: "coldAgentTurnMs", title: "Cold Agent Turn", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
   { id: "warmAgentTurnMs", title: "Warm Agent Turn", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
   { id: "agentColdWarmDeltaMs", title: "Cold/Warm Agent Delta", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
+  { id: "agentPreProviderP95Ms", title: "Pre-Provider p95", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
   { id: "coldPreProviderMs", title: "Cold Pre-Provider", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
   { id: "warmPreProviderMs", title: "Warm Pre-Provider", unit: "ms", regressionKey: "agentLatencyRegressionPercent" },
   { id: "healthP95Ms", title: "Health p95", unit: "ms", regressionKey: "startupRegressionPercent" },
diff --git a/src/report.mjs b/src/report.mjs
index 86f97ef..43f784c 100644
--- a/src/report.mjs
+++ b/src/report.mjs
@@ -144,6 +144,7 @@ export function renderMarkdownReport(report) {
         lines.push(`- Agent cold/warm: cold ${record.measurements.coldAgentTurnMs ?? "unknown"} ms; warm ${record.measurements.warmAgentTurnMs ?? "unknown"} ms; delta ${record.measurements.agentColdWarmDeltaMs ?? "unknown"} ms`);
         lines.push(`- Agent pre-provider: cold ${record.measurements.coldPreProviderMs ?? "unknown"} ms; warm ${record.measurements.warmPreProviderMs ?? "unknown"} ms; delta ${record.measurements.agentColdWarmPreProviderDeltaMs ?? "unknown"} ms`);
         lines.push(`- Agent provider final: cold ${record.measurements.coldProviderFinalMs ?? "unknown"} ms; warm ${record.measurements.warmProviderFinalMs ?? "unknown"} ms`);
+        lines.push(`- Agent turn stats: count ${record.measurements.agentTurnCount}; p95 ${record.measurements.agentTurnP95Ms ?? "unknown"} ms; max ${record.measurements.agentTurnMaxMs ?? "unknown"} ms; pre-provider p95 ${record.measurements.agentPreProviderP95Ms ?? "unknown"} ms`);
       }
       if (record.measurements.agentProviderAttribution) {
         lines.push(`- Provider evidence: ${record.measurements.agentProviderRequestCount ?? 0} request(s); provider work ${record.measurements.agentProviderFinalMs ?? "unknown"} ms; pre-provider ${record.measurements.agentPreProviderMs ?? "unknown"} ms; post-provider ${record.measurements.agentPostProviderMs ?? "unknown"} ms`);
@@ -549,6 +550,13 @@ function summarizeMeasurements(measurements) {
     providerFirstByteLatencyMs: measurements.providerFirstByteLatencyMs ?? null,
     agentTurnCount: measurements.agentTurnCount ?? null,
     agentTurns: measurements.agentTurns ?? null,
+    agentTurnStats: measurements.agentTurnStats ?? null,
+    agentTurnP95Ms: measurements.agentTurnP95Ms ?? null,
+    agentTurnMaxMs: measurements.agentTurnMaxMs ?? null,
+    agentPreProviderP95Ms: measurements.agentPreProviderP95Ms ?? null,
+    agentPreProviderMaxMs: measurements.agentPreProviderMaxMs ?? null,
+    agentProviderFinalP95Ms: measurements.agentProviderFinalP95Ms ?? null,
+    agentProviderFinalMaxMs: measurements.agentProviderFinalMaxMs ?? null,
     coldAgentTurnMs: measurements.coldAgentTurnMs ?? null,
     warmAgentTurnMs: measurements.warmAgentTurnMs ?? null,
     agentColdWarmDeltaMs: measurements.agentColdWarmDeltaMs ?? null,
diff --git a/src/selfcheck.mjs b/src/selfcheck.mjs
index 1dbbd15..1c2917a 100644
--- a/src/selfcheck.mjs
+++ b/src/selfcheck.mjs
@@ -1226,6 +1226,9 @@ function agentTurnBreakdownCheck() {
       agent: { expectedText: "KOVA_AGENT_OK" },
       thresholds: {}
     }, { surface: { thresholds: {} }, targetPlan: { kind: "local-build" } });
+    assertEqual(record.measurements.agentTurnStats?.count, 1, "agent turn stats count");
+    assertEqual(record.measurements.agentTurnP95Ms, 1000, "agent turn p95");
+    assertEqual(record.measurements.agentPreProviderP95Ms, 200, "agent pre-provider p95");
     const rendered = renderMarkdownReport({
       generatedAt: "2026-05-01T00:00:00.000Z",
       runId: "self-check-agent-turn-breakdown",
@@ -1237,6 +1240,7 @@ function agentTurnBreakdownCheck() {
     });
     assertEqual(rendered.includes("breakdown:"), true, "markdown includes agent turn breakdown");
     assertEqual(rendered.includes("models.catalog.* 70ms"), true, "markdown includes source span evidence");
+    assertEqual(rendered.includes("Agent turn stats:"), true, "markdown includes agent turn stats");
     assertEqual(
       summarizeAgentTurnBreakdownForMarkdown(normal.breakdown).includes("unknown 15ms"),
       true,
diff --git a/surfaces/agent-message.json b/surfaces/agent-message.json
index 6cca384..03931e2 100644
--- a/surfaces/agent-message.json
+++ b/surfaces/agent-message.json
@@ -2,10 +2,10 @@
   "id": "agent-message",
   "title": "Agent Message",
   "ownerArea": "agent-runtime",
-  "description": "Send cold and warm local OpenClaw agent messages and verify response latency, provider routing, gateway health, memory, and logs.",
+  "description": "Send cold, warm, and repeated local OpenClaw agent messages and verify response latency, provider routing, gateway health, memory, and logs.",
   "requiredStates": ["mock-openai-provider"],
   "targetKinds": ["npm", "channel", "runtime", "local-build"],
-  "requiredMetrics": ["agentTurnMs", "coldAgentTurnMs", "warmAgentTurnMs", "agentColdWarmDeltaMs", "coldPreProviderMs", "warmPreProviderMs", "healthP95Ms", "peakRssMb", "providerTimeoutMentions", "pluginLoadFailures"],
+  "requiredMetrics": ["agentTurnMs", "agentTurnP95Ms", "agentTurnMaxMs", "coldAgentTurnMs", "warmAgentTurnMs", "agentColdWarmDeltaMs", "coldPreProviderMs", "warmPreProviderMs", "agentPreProviderP95Ms", "healthP95Ms", "peakRssMb", "providerTimeoutMentions", "pluginLoadFailures"],
   "processRoles": ["gateway", "command-tree", "agent-cli", "agent-process", "mock-provider"],
   "thresholds": { "agentTurnMs": 45000, "coldAgentTurnMs": 45000, "warmAgentTurnMs": 15000, "coldWarmDeltaMs": 30000, "preProviderMs": 10000, "providerFinalMs": 3000, "healthP95Ms": 1000, "peakRssMb": 900 },
   "roleThresholds": {