fix: make local agent benchmark service-free

2026-05-03 13:11:49 +01:00 · 2026-05-03 13:11:49 +01:00 · b63b6f9e20
commit b63b6f9e20
parent 51947110f5
3 changed files with 128 additions and 19 deletions
--- a/scenarios/agent-cold-warm-message.json
+++ b/scenarios/agent-cold-warm-message.json
@@ -3,7 +3,7 @@
  "surface": "agent-cli-local-turn",
  "title": "Agent CLI Local Cold/Warm Message",
  "objective": "Send cold and warm simple messages through `openclaw agent --local`, verify mock-provider responses, and attribute latency before, during, and after provider work.",
-  "tags": ["agent", "message", "latency", "providers", "gateway", "cold-warm"],
+  "tags": ["agent", "message", "latency", "providers", "cold-warm"],
  "timeoutMs": 240000,
  "agent": {
    "expectedText": "KOVA_AGENT_OK"
@@ -16,11 +16,11 @@
    "coldWarmDeltaMs": 30000,
    "preProviderMs": 10000,
    "coldPreProviderMs": 10000,
-    "warmPreProviderMs": 5000,
+    "warmPreProviderMs": 10000,
    "providerFinalMs": 3000,
-    "preProviderDominanceRatio": 0.8,
    "statusMs": 10000,
    "peakRssMb": 900,
+    "agentProcessLeaks": 4,
    "missingDependencyErrors": 0,
    "pluginLoadFailures": 0,
    "providerTimeoutMentions": 0
@@ -29,9 +29,9 @@
    {
      "id": "provision",
      "title": "Provision Agent Env",
-      "intent": "Start a disposable OpenClaw gateway before wiring the model provider and sending messages.",
-      "commands": ["ocm start {env} {startSelector} --json"],
-      "evidence": ["gateway port", "runtime binding", "startup readiness"]
+      "intent": "Create a disposable OpenClaw env before wiring the model provider and sending local agent messages.",
+      "commands": ["ocm start {env} {startSelector} --no-service --json"],
+      "evidence": ["gateway port", "runtime binding", "env created without service"]
    },
    {
      "id": "cold-agent-turn",
@@ -53,10 +53,10 @@
    },
    {
      "id": "post-agent-health",
-      "title": "Post-Agent Gateway Health",
-      "intent": "Verify the gateway remains responsive after both agent turns and capture provider/plugin diagnostics.",
-      "commands": ["ocm @{env} -- status", "ocm logs {env} --tail 300 --raw"],
-      "evidence": ["gateway status", "provider logs", "plugin errors", "memory after agent turns"]
+      "title": "Post-Agent Env Status",
+      "intent": "Verify the env remains usable after both local agent turns and capture plugin diagnostics.",
+      "commands": ["ocm @{env} -- status"],
+      "evidence": ["env status", "plugin errors", "memory after agent turns"]
    }
  ]
 }
--- a/src/evaluator.mjs
+++ b/src/evaluator.mjs
@@ -21,8 +21,12 @@ export function evaluateRecord(record, scenario, options = {}) {
  const roleThresholds = thresholdPolicy.roleThresholds;
  const violations = [];
  const allResults = collectResults(record);
-  const resourceSummary = collectResourceSummary(allResults);
-  const peakRssMb = maxNullable(collectPeakRss(record), resourceSummary.peakTotalRssMb);
+  const measuredResults = collectResults(record, { excludePhaseIds: ["target-setup"] });
+  const resourceSummary = collectResourceSummary(measuredResults);
+  const peakRssMb = maxNullable(
+    collectPeakRss(record, { excludePhaseIds: ["target-setup"] }),
+    resourceSummary.peakTotalRssMb
+  );
  const cpuPercentMax = maxNullable(collectCpuPercentMax(record), resourceSummary.maxTotalCpuPercent);
  const missingDependencyErrors = countMissingDependencyErrors(allResults) + countLogMetric(record, "missingDependencyErrors");
  const pluginLoadFailures = countLogMetric(record, "pluginLoadFailures");
@@ -46,6 +50,7 @@ export function evaluateRecord(record, scenario, options = {}) {
  const heapSnapshotBytes = countHeapSnapshotBytes(record);
  const diagnosticReportCount = countDiagnosticReportMetric(record, "fileCount");
  const diagnosticReportBytes = countDiagnosticReportMetric(record, "artifactBytes");
+  const gatewayExpected = recordExpectsGateway(record);
  const openclawDiagnostics = collectOpenClawDiagnostics(record);
  const timelineSummary = collectTimelineSummary(record);
  const logSummary = collectLogSummary(record);
@@ -73,7 +78,7 @@ export function evaluateRecord(record, scenario, options = {}) {
  const agentTurnMs = maxNullable(maxDurationWhere(allResults, isAgentMessageCommand), maxTurnDuration(agentTurns));
  const agentResponseOk = agentTurns.length === 0 ? null : agentTurns.every((turn) => turn.responseOk === true);
  const agentProviderSimulation = evaluateProviderSimulation({ turns: agentTurns, scenario, record, thresholds });
-  const agentFailureContainment = evaluateAgentFailureContainment({ turns: agentTurns, record, thresholds });
+  const agentFailureContainment = evaluateAgentFailureContainment({ turns: agentTurns, record, thresholds, gatewayExpected });
  const agentCleanupDiagnosis = diagnoseAgentCleanup(agentTurns, agentTurnStats, thresholds);
  const agentLatencyDiagnosis = diagnoseAgentLatency({
    coldAgentTurn,
@@ -166,7 +171,7 @@ export function evaluateRecord(record, scenario, options = {}) {
    });
  }

-  if (finalGatewayState && finalGatewayState !== "running") {
+  if (gatewayExpected && finalGatewayState && finalGatewayState !== "running") {
    violations.push({
      kind: "gateway",
      metric: "finalGatewayState",
@@ -950,7 +955,7 @@ function collectAgentTurns(record, providerEvidence, scenario, timelineSummary,
  return turns;
 }

-function evaluateAgentFailureContainment({ turns, record, thresholds }) {
+function evaluateAgentFailureContainment({ turns, record, thresholds, gatewayExpected = true }) {
  if (turns.length === 0) {
    return {
      schemaVersion: "kova.agentFailureContainment.v1",
@@ -959,7 +964,7 @@ function evaluateAgentFailureContainment({ turns, record, thresholds }) {
      leakedProcesses: [],
      processLeaksOk: true,
      finalGatewayState: record.finalMetrics?.service?.gatewayState ?? null,
-      gatewayHealthy: null,
+      gatewayHealthy: gatewayExpected ? null : true,
      healthFailures: countHealthFailures(record),
      healthLimit: 0,
      statusWorks: null,
@@ -989,7 +994,7 @@ function evaluateAgentFailureContainment({ turns, record, thresholds }) {
    leakedProcesses,
    processLeaksOk: leakCount <= leakLimit,
    finalGatewayState,
-    gatewayHealthy: finalGatewayState === "running" && healthFailures <= healthLimit,
+    gatewayHealthy: gatewayExpected ? finalGatewayState === "running" && healthFailures <= healthLimit : true,
    healthFailures,
    healthLimit,
    statusWorks,
@@ -2164,9 +2169,13 @@ function healthFailureCount(samples) {
  return samples.filter((sample) => sample && !sample.ok).length;
 }

-function collectResults(record) {
+function collectResults(record, options = {}) {
+  const excludePhaseIds = new Set(options.excludePhaseIds ?? []);
  const results = [];
  for (const phase of record.phases ?? []) {
+    if (excludePhaseIds.has(phase.id)) {
+      continue;
+    }
    for (const result of phase.results ?? []) {
      results.push(result);
    }
@@ -2174,9 +2183,23 @@ function collectResults(record) {
  return results;
 }

-function collectPeakRss(record) {
+function recordExpectsGateway(record) {
+  return collectResults(record).some((result) => {
+    const command = result.command ?? "";
+    if (command.startsWith("ocm service start ") || command.startsWith("ocm service restart ")) {
+      return true;
+    }
+    return command.startsWith("ocm start ") && !/(?:^|\s)--no-service(?:\s|$)/.test(command);
+  });
+}
+
+function collectPeakRss(record, options = {}) {
+  const excludePhaseIds = new Set(options.excludePhaseIds ?? []);
  let peak = null;
  for (const phase of record.phases ?? []) {
+    if (excludePhaseIds.has(phase.id)) {
+      continue;
+    }
    const rss = phase.metrics?.process?.rssMb;
    if (typeof rss === "number") {
      peak = peak === null ? rss : Math.max(peak, rss);
--- a/src/selfcheck.mjs
+++ b/src/selfcheck.mjs
@@ -110,6 +110,7 @@ export async function runSelfCheck(flags = {}) {
    checks.push(await commandTimeoutContractCheck());
    checks.push(ocmCommandBuildersCheck());
    checks.push(evaluationViolationHelpersCheck());
+    checks.push(localBuildTargetSetupResourceExclusionCheck());
    checks.push(await jsonCommandCheck("plan-json", "node bin/kova.mjs plan --json", (data) => {
      assertEqual(data.schemaVersion, "kova.plan.v1", "plan schema");
      assertArrayNotEmpty(data.surfaces, "plan surfaces");
@@ -557,6 +558,91 @@ function evaluationViolationHelpersCheck() {
  }
 }

+function localBuildTargetSetupResourceExclusionCheck() {
+  try {
+    const record = {
+      scenario: "local-build-runtime-resources",
+      status: "PASS",
+      phases: [
+        {
+          id: "target-setup",
+          results: [{
+            command: "ocm runtime build-local kova-local-test --repo /tmp/openclaw --force",
+            status: 0,
+            durationMs: 60000,
+            resourceSamples: syntheticResourceSamples({
+              peakRssMb: 2500,
+              maxCpuPercent: 350,
+              role: "build-tooling"
+            })
+          }]
+        },
+        {
+          id: "scenario-command",
+          results: [{
+            command: "ocm @kova-self-check -- status",
+            status: 0,
+            durationMs: 100,
+            resourceSamples: syntheticResourceSamples({
+              peakRssMb: 100,
+              maxCpuPercent: 20,
+              role: "gateway"
+            })
+          }]
+        }
+      ],
+      finalMetrics: {
+        service: { gatewayState: "disabled" },
+        logs: zeroLogMetrics()
+      }
+    };
+    evaluateRecord(record, { thresholds: { peakRssMb: 900 } }, {
+      surface: { thresholds: {} },
+      targetPlan: { kind: "local-build" }
+    });
+    assertEqual(record.status, "PASS", "local-build target setup resources ignored status");
+    assertEqual(record.measurements.peakRssMb, 100, "local-build target setup resources ignored RSS");
+    assertEqual(record.measurements.resourceByRole.gateway.peakRssMb, 100, "scenario role RSS retained");
+    assertEqual(record.measurements.resourceByRole["build-tooling"], undefined, "target setup role excluded");
+    assertEqual(record.violations, undefined, "no-service local-build record has no gateway violation");
+    return {
+      id: "local-build-target-setup-resource-exclusion",
+      status: "PASS",
+      command: "evaluate local-build target setup resource exclusion",
+      durationMs: 0
+    };
+  } catch (error) {
+    return {
+      id: "local-build-target-setup-resource-exclusion",
+      status: "FAIL",
+      command: "evaluate local-build target setup resource exclusion",
+      durationMs: 0,
+      message: error.message
+    };
+  }
+}
+
+function syntheticResourceSamples({ peakRssMb, maxCpuPercent, role }) {
+  return {
+    sampleCount: 1,
+    peakTotalRssMb: peakRssMb,
+    maxTotalCpuPercent: maxCpuPercent,
+    peakCommandTreeRssMb: peakRssMb,
+    peakGatewayRssMb: role === "gateway" ? peakRssMb : 0,
+    byRole: {
+      [role]: {
+        peakRssMb,
+        maxCpuPercent,
+        peakProcessCount: 1
+      }
+    },
+    topRolesByRss: [{ role, peakRssMb, maxCpuPercent }],
+    topRolesByCpu: [{ role, peakRssMb, maxCpuPercent }],
+    topByRss: [],
+    topByCpu: []
+  };
+}
+
 function gatePartialFailureCheck() {
  try {
    const gate = evaluateGate({