fix: separate gateway measurement scope

2026-05-07 10:32:11 +01:00 · 2026-05-07 10:32:11 +01:00 · a995487433
commit a995487433
parent 9d9ff32d32
8 changed files with 389 additions and 20 deletions
--- a/src/evaluator.mjs
+++ b/src/evaluator.mjs
@ -11,6 +11,7 @@ import { computeProviderTurnAttribution } from "./collectors/provider.mjs";
 import { summarizeRuntimeDepsLogs } from "./collectors/logs.mjs";
 import { buildHealthMeasurement, healthReadinessClassification } from "./health.mjs";
 import { resolveThresholdPolicy } from "./evaluation/thresholds.mjs";
+import { measuredProductPhase, measurementScopeForPhase, normalizeMeasurementScope } from "./measurement-contract.mjs";
 import {
  checkAggregateThreshold,
  checkDuration,
@ -30,13 +31,18 @@ export function evaluateRecord(record, scenario, options = {}) {
  const roleThresholds = thresholdPolicy.roleThresholds;
  const violations = [];
  const allResults = collectResults(record);
-  const measuredResults = collectResults(record, { excludePhaseIds: ["target-setup"] });
+  const measurementScopeSummary = summarizeMeasurementScopes(record);
+  const measuredResults = collectResults(record, { productOnly: true });
  const resourceSummary = collectResourceSummary(measuredResults);
-  const peakRssMb = maxNullable(
-    collectPeakRss(record, { excludePhaseIds: ["target-setup"] }),
+  const primaryResourceRole = options.surface?.resourcePrimaryRole ?? null;
+  const primaryRoleResources = primaryResourceRole ? resourceSummary.byRole[primaryResourceRole] : null;
+  const peakTrackedRssMb = maxNullable(
+    collectPeakRss(record, { productOnly: true }),
    resourceSummary.peakTotalRssMb
  );
-  const cpuPercentMax = maxNullable(collectCpuPercentMax(record), resourceSummary.maxTotalCpuPercent);
+  const cpuPercentMaxTracked = maxNullable(collectCpuPercentMax(record, { productOnly: true }), resourceSummary.maxTotalCpuPercent);
+  const peakRssMb = typeof primaryRoleResources?.peakRssMb === "number" ? primaryRoleResources.peakRssMb : peakTrackedRssMb;
+  const cpuPercentMax = typeof primaryRoleResources?.maxCpuPercent === "number" ? primaryRoleResources.maxCpuPercent : cpuPercentMaxTracked;
  const missingDependencyErrors = countMissingDependencyErrors(allResults) + countLogMetric(record, "missingDependencyErrors");
  const pluginLoadFailures = countLogMetric(record, "pluginLoadFailures");
  const metadataScanMentions = countLogMetric(record, "metadataScanMentions");
@ -67,6 +73,7 @@ export function evaluateRecord(record, scenario, options = {}) {
  const timelineRequirement = timelineRequirementFor(options);
  const requiredOpenSpans = requiredTimelineSpans(options);
  const openRequiredSpans = timelineSummary.openSpans.filter((span) => requiredOpenSpans.has(span.name));
+  const missingRequiredSpans = missingTimelineSpans(timelineSummary, requiredOpenSpans);
  const runtimeDepsStagingMs = maxNullable(
    openclawDiagnostics.runtimeDepsStagingMs,
    timelineSummary.runtimeDepsStageMaxMs,
@ -719,6 +726,18 @@ export function evaluateRecord(record, scenario, options = {}) {
    });
  }

+  if (timelineSummary.available && missingRequiredSpans.length > 0) {
+    violations.push({
+      kind: "diagnostics",
+      metric: "openclawMissingRequiredSpanCount",
+      expected: "0",
+      actual: missingRequiredSpans.length,
+      message: `${missingRequiredSpans.length} required OpenClaw diagnostics span(s) were not observed: ${missingRequiredSpans.slice(0, 5).join(", ")}`
+    });
+  }
+
+  checkGatewaySessionTransport(violations, agentTurns, scenario);
+
  if (agentResponseOk === false) {
    violations.push({
      kind: "agent",
@ -737,6 +756,11 @@ export function evaluateRecord(record, scenario, options = {}) {
  record.measurements = {
    peakRssMb,
    cpuPercentMax,
+    measurementScopeSummary,
+    resourceMeasurementScope: "product",
+    resourcePrimaryRole: primaryResourceRole,
+    resourcePeakTrackedRssMb: peakTrackedRssMb,
+    resourceCpuPercentMaxTracked: cpuPercentMaxTracked,
    coldReadyMs,
    warmReadyMs,
    upgradeMs,
@ -917,6 +941,8 @@ export function evaluateRecord(record, scenario, options = {}) {
    openclawRepeatedSpanCount: timelineSummary.repeatedSpanCount,
    openclawOpenSpanCount: timelineSummary.openSpanCount,
    openclawOpenRequiredSpanCount: openRequiredSpans.length,
+    openclawMissingRequiredSpanCount: missingRequiredSpans.length,
+    openclawMissingRequiredSpans: missingRequiredSpans,
    openclawOpenSpans: timelineSummary.openSpans,
    openclawKeySpans: timelineSummary.keySpans,
    openclawEventLoopMaxMs: timelineSummary.eventLoopMaxMs,
@ -1089,6 +1115,29 @@ function preferredPreProviderAttributionSummary(...summaries) {
  return summaries.find((summary) => summary?.count > 0) ?? summaries[0];
 }

+/**
+ * Appends a "harness" violation for every dashboard-session turn that did not
+ * report the direct Gateway RPC transport.
+ *
+ * Only applies when `scenario.id` is "dashboard-session-send-turn". Turns
+ * without a `gatewaySession` payload are ignored, and a missing transport kind
+ * is reported as "unknown".
+ *
+ * @param {object[]} violations - Violation list; mutated in place.
+ * @param {Iterable<object>} agentTurns - Evaluated agent turns.
+ * @param {object} scenario - Scenario definition; only `id` is read.
+ * @returns {void}
+ */
+function checkGatewaySessionTransport(violations, agentTurns, scenario) {
+  const requiredTransport = "direct-gateway-rpc";
+  if (scenario.id === "dashboard-session-send-turn") {
+    for (const turn of agentTurns) {
+      const session = turn.gatewaySession;
+      if (!session || session.gatewayTransportKind === requiredTransport) {
+        continue;
+      }
+      const observed = session.gatewayTransportKind ?? "unknown";
+      const reason = session.gatewayTransportFallbackReason;
+      // The fallback reason is appended in parentheses only when one was recorded.
+      const reasonSuffix = reason ? ` (${reason})` : "";
+      violations.push({
+        kind: "harness",
+        metric: "gatewayTransport.kind",
+        expected: requiredTransport,
+        actual: observed,
+        phaseId: turn.phaseId,
+        message: `dashboard session benchmark used ${observed} transport; direct Gateway RPC is required for Gateway product measurement${reasonSuffix}`
+      });
+    }
+  }
+}
+
 function extractGatewaySessionTurn(result) {
  if (!result?.command?.includes("run-dashboard-session-send-turn.mjs")) {
    return null;
@ -1115,6 +1164,9 @@ function extractGatewaySessionTurn(result) {
    minAssistantCount: numberOrNull(payload.minAssistantCount),
    sessionKey: payload.sessionKey ?? null,
    runId: payload.runId ?? null,
+    gatewayTransportKind: payload.gatewayTransport?.kind ?? null,
+    gatewayTransportFallbackReason: payload.gatewayTransport?.fallbackReason ?? null,
+    gatewayTransportFallbackUsed: typeof payload.gatewayTransport?.kind === "string" && payload.gatewayTransport.kind !== "direct-gateway-rpc",
    activeStartedAtEpochMs,
    activeFinishedAtEpochMs,
    activeTurnMs,
@ -2017,6 +2069,26 @@ function requiredTimelineSpans(options) {
  ]);
 }

+/**
+ * Lists the required span names that `timelineSpanObserved` does not find in
+ * the timeline summary, in the iteration order of `requiredSpans`.
+ *
+ * @param {object} timelineSummary - Aggregated timeline summary.
+ * @param {Iterable<string>} requiredSpans - Span names that must be observed.
+ * @returns {string[]} Names of the required spans that were not observed.
+ */
+function missingTimelineSpans(timelineSummary, requiredSpans) {
+  const missing = [];
+  for (const name of [...requiredSpans]) {
+    if (!timelineSpanObserved(timelineSummary, name)) {
+      missing.push(name);
+    }
+  }
+  return missing;
+}
+
+/**
+ * Reports whether a span name was observed in the timeline summary.
+ *
+ * A span counts as observed when any of the following holds:
+ *  - `keySpans[name]` or `spanTotals[name]` has a positive `count` or `openCount`;
+ *  - `openSpans` currently lists a span with that exact name;
+ *  - the name is one of the family names below and `spanTotals` holds a dotted
+ *    child (`<name>.<suffix>`) with a positive `count`.
+ *
+ * Both summaries are consulted for the exact name, so a zero-count entry in
+ * `keySpans` cannot hide a positive entry in `spanTotals`. An exact-name entry
+ * with a zero count is never treated as observed.
+ *
+ * @param {object} timelineSummary - Summary with optional `keySpans`, `spanTotals` and `openSpans`.
+ * @param {string} name - Span name to look for.
+ * @returns {boolean} True when the span (or a counted child of a family name) was observed.
+ */
+function timelineSpanObserved(timelineSummary, name) {
+  const familyNames = ["gateway.chat_send", "auto_reply", "reply", "models.catalog"];
+  const wasCounted = (summary) => (summary?.count ?? 0) > 0 || (summary?.openCount ?? 0) > 0;
+  if (wasCounted(timelineSummary.keySpans?.[name]) || wasCounted(timelineSummary.spanTotals?.[name])) {
+    return true;
+  }
+  if ((timelineSummary.openSpans ?? []).some((span) => span.name === name)) {
+    return true;
+  }
+  if (!familyNames.includes(name)) {
+    return false;
+  }
+  // The exact name was already checked above, so only dotted children remain.
+  const childPrefix = `${name}.`;
+  return Object.entries(timelineSummary.spanTotals ?? {}).some(([spanName, summary]) =>
+    spanName.startsWith(childPrefix) && (summary?.count ?? 0) > 0
+  );
+}
+
 function maxDurationWhere(results, predicate) {
  const durations = results
    .filter((result) => predicate(result.command))
@ -2515,6 +2587,28 @@ function healthFailureCount(samples) {
  return samples.filter((sample) => sample && !sample.ok).length;
 }

+/**
+ * Counts the record's phases and command results per measurement scope.
+ *
+ * Each phase is classified with `measurementScopeForPhase`. A command result
+ * carrying its own truthy `measurementScope` is normalized against the phase
+ * id; otherwise it inherits the scope of its phase.
+ *
+ * @param {object} record - Run record; `phases` and each phase's `results` are read.
+ * @returns {object} Summary with per-scope phase and command counts.
+ */
+function summarizeMeasurementScopes(record) {
+  const phaseCounts = { product: 0, harness: 0, cleanup: 0 };
+  const commandCounts = { product: 0, harness: 0, cleanup: 0 };
+  for (const phase of record.phases ?? []) {
+    const phaseScope = measurementScopeForPhase(phase);
+    phaseCounts[phaseScope] += 1;
+    for (const result of phase.results ?? []) {
+      let commandScope = phaseScope;
+      if (result.measurementScope) {
+        commandScope = normalizeMeasurementScope(result.measurementScope, phase.id);
+      }
+      commandCounts[commandScope] += 1;
+    }
+  }
+  const { product: productPhaseCount, harness: harnessPhaseCount, cleanup: cleanupPhaseCount } = phaseCounts;
+  const { product: productCommandCount, harness: harnessCommandCount, cleanup: cleanupCommandCount } = commandCounts;
+  return {
+    schemaVersion: "kova.measurementScopeSummary.v1",
+    productPhaseCount,
+    harnessPhaseCount,
+    cleanupPhaseCount,
+    productCommandCount,
+    harnessCommandCount,
+    cleanupCommandCount
+  };
+}
+
 function collectResults(record, options = {}) {
  const excludePhaseIds = new Set(options.excludePhaseIds ?? []);
  const results = [];
@ -2522,6 +2616,9 @@ function collectResults(record, options = {}) {
    if (excludePhaseIds.has(phase.id)) {
      continue;
    }
+    if (options.productOnly === true && !measuredProductPhase(phase)) {
+      continue;
+    }
    for (const result of phase.results ?? []) {
      results.push(result);
    }
@ -2546,6 +2643,9 @@ function collectPeakRss(record, options = {}) {
    if (excludePhaseIds.has(phase.id)) {
      continue;
    }
+    if (options.productOnly === true && !measuredProductPhase(phase)) {
+      continue;
+    }
    const rss = phase.metrics?.process?.rssMb;
    if (typeof rss === "number") {
      peak = peak === null ? rss : Math.max(peak, rss);
@ -2720,8 +2820,8 @@ function collectTimelineSummary(record) {
  let repeatedSpanCount = 0;
  let runtimeDepsStageMaxMs = null;
  let slowestRuntimeDepsPlugin = null;
-  let openSpanCount = 0;
-  let openSpans = [];
+  let latestOpenSpanCount = 0;
+  let latestOpenSpans = [];
  let latestEventCount = -1;
  let events = [];
  let turnAttributionEvents = [];
@ -2734,6 +2834,10 @@ function collectTimelineSummary(record) {
      latestEventCount = timeline.eventCount ?? 0;
      events = timeline.events;
      turnAttributionEvents = Array.isArray(timeline.turnAttributionEvents) ? timeline.turnAttributionEvents : [];
+      latestOpenSpanCount = timeline.openSpanCount ?? timeline.openSpans?.length ?? 0;
+      latestOpenSpans = [...(timeline.openSpans ?? [])]
+        .toSorted((left, right) => (right.ageMs ?? -1) - (left.ageMs ?? -1))
+        .slice(0, 25);
    }
    for (const artifact of timeline.artifacts ?? []) {
      artifacts.add(artifact);
@ -2742,8 +2846,6 @@ function collectTimelineSummary(record) {
    parseErrorCount = Math.max(parseErrorCount, timeline.parseErrorCount ?? 0);
    childProcessFailedCount = Math.max(childProcessFailedCount, timeline.childProcesses?.failedCount ?? 0);
    repeatedSpanCount = Math.max(repeatedSpanCount, timeline.repeatedSpans?.length ?? 0);
-    openSpanCount = Math.max(openSpanCount, timeline.openSpanCount ?? timeline.openSpans?.length ?? 0);
-    openSpans = mergeOpenSpans(openSpans, timeline.openSpans ?? []);
    mergeKeySpans(keySpans, timeline.keySpans ?? {});
    mergeSpanTotals(spanTotals, timeline.spanTotals ?? {});
    eventLoopMaxMs = maxNullable(eventLoopMaxMs, timeline.eventLoop?.maxMs);
@ -2775,8 +2877,8 @@ function collectTimelineSummary(record) {
    slowestSpanName: slowestSpan?.name ?? null,
    slowestSpanMs: slowestSpan?.durationMs ?? null,
    repeatedSpanCount,
-    openSpanCount,
-    openSpans,
+    openSpanCount: latestOpenSpanCount,
+    openSpans: latestOpenSpans,
    artifacts: [...artifacts],
    timelineArtifacts: [...artifacts],
    events,
@ -2847,9 +2949,12 @@ function mergeKeySpans(target, source) {
  }
 }

-function collectCpuPercentMax(record) {
+function collectCpuPercentMax(record, options = {}) {
  const values = [];
  for (const phase of record.phases ?? []) {
+    if (options.productOnly === true && !measuredProductPhase(phase)) {
+      continue;
+    }
    const cpu = phase.metrics?.process?.cpuPercent;
    if (typeof cpu === "number") {
      values.push(cpu);
--- a/src/measurement-contract.mjs
+++ b/src/measurement-contract.mjs
@ -0,0 +1,68 @@
+/** Measurement scopes that a phase or command result may declare. */
+export const MEASUREMENT_SCOPES = new Set(["product", "harness", "cleanup"]);
+
+/**
+ * Resolves a measurement scope from a declared value, falling back to the
+ * phase id when the value is not one of `MEASUREMENT_SCOPES`.
+ *
+ * Fallback by phase id: the setup/prepare ids and any id starting with
+ * "state-" map to "harness"; the cleanup ids map to "cleanup"; everything
+ * else, including a missing id, maps to "product".
+ *
+ * @param {*} value - Declared scope; returned unchanged when it is a known scope.
+ * @param {string|null} [phaseId=null] - Phase id used for the fallback.
+ * @returns {string} One of "product", "harness" or "cleanup".
+ */
+export function normalizeMeasurementScope(value, phaseId = null) {
+  if (MEASUREMENT_SCOPES.has(value)) {
+    return value;
+  }
+  const harnessPhaseIds = ["target-setup", "auth-prepare", "auth-setup", "prepare"];
+  const cleanupPhaseIds = ["cleanup", "auth-cleanup", "env-cleanup"];
+  if (harnessPhaseIds.includes(phaseId) || phaseId?.startsWith("state-")) {
+    return "harness";
+  }
+  return cleanupPhaseIds.includes(phaseId) ? "cleanup" : "product";
+}
+
+/**
+ * Tells whether a phase is classified under the "product" measurement scope.
+ *
+ * @param {object} phase - Phase to classify via `measurementScopeForPhase`.
+ * @returns {boolean} True when the resolved scope is "product".
+ */
+export function measuredProductPhase(phase) {
+  const scope = measurementScopeForPhase(phase);
+  return scope === "product";
+}
+
+/**
+ * Resolves the measurement scope of a phase.
+ *
+ * Precedence: a declared `measurementScope` that is a known scope wins; next,
+ * a "provision" phase with any command carrying a standalone `--no-service`
+ * flag is "harness"; otherwise the scope is derived from the phase id by
+ * `normalizeMeasurementScope`.
+ *
+ * @param {object|null|undefined} phase - Phase with optional `id`, `measurementScope` and `commands`.
+ * @returns {string} One of "product", "harness" or "cleanup".
+ */
+export function measurementScopeForPhase(phase) {
+  const declaredScope = phase?.measurementScope;
+  if (MEASUREMENT_SCOPES.has(declaredScope)) {
+    return declaredScope;
+  }
+  if (phase?.id === "provision") {
+    const provisionCommands = phase.commands ?? [];
+    const runsWithoutService = provisionCommands.some((command) => /(?:^|\s)--no-service(?:\s|$)/.test(command));
+    if (runsWithoutService) {
+      return "harness";
+    }
+  }
+  return normalizeMeasurementScope(declaredScope, phase?.id);
+}
+
+/**
+ * Classifies a command line by the driver that executes it.
+ *
+ * Checks run in a fixed order and the first match wins: known helper script
+ * names, then `ocm @<env> -- agent` (local or gateway depending on `--local`),
+ * then `ocm @<env> -- gateway call`, then any other `ocm` invocation, then any
+ * `node` invocation. Anything else is "unknown".
+ *
+ * @param {*} command - Command text; null/undefined is treated as an empty string.
+ * @returns {string} Driver kind label.
+ */
+export function driverKindForCommand(command) {
+  const text = String(command ?? "");
+  const helperScriptKinds = [
+    ["run-dashboard-session-send-turn.mjs", "gateway-rpc"],
+    ["run-openai-compatible-turn.mjs", "gateway-http"],
+    ["run-tui-message-turn.mjs", "gateway-rpc"]
+  ];
+  const helperMatch = helperScriptKinds.find(([scriptName]) => text.includes(scriptName));
+  if (helperMatch) {
+    return helperMatch[1];
+  }
+  if (/\bocm\s+@[^ ]+\s+--\s+agent\b/.test(text)) {
+    if (text.includes("--local")) {
+      return "openclaw-cli-local";
+    }
+    return "openclaw-cli-gateway";
+  }
+  // Remaining patterns go from most to least specific.
+  const patternKinds = [
+    [/\bocm\s+@[^ ]+\s+--\s+gateway\s+call\b/, "gateway-rpc-via-cli"],
+    [/\bocm\b/, "ocm"],
+    [/\bnode\b/, "kova-helper"]
+  ];
+  const patternMatch = patternKinds.find(([pattern]) => pattern.test(text));
+  return patternMatch ? patternMatch[1] : "unknown";
+}
+
+/**
+ * Determines the driver kind of a phase.
+ *
+ * A truthy `phase.driverKind` is returned as-is. Otherwise every command is
+ * classified with `driverKindForCommand`: no commands gives "none", a single
+ * distinct kind gives that kind, and several distinct kinds give "mixed".
+ *
+ * @param {object|null|undefined} phase - Phase with optional `driverKind` and `commands`.
+ * @param {string[]} [commands] - Commands to classify; defaults to `phase.commands` or an empty list.
+ * @returns {string} Driver kind label, "none" or "mixed".
+ */
+export function phaseDriverKind(phase, commands = phase?.commands ?? []) {
+  if (phase?.driverKind) {
+    return phase.driverKind;
+  }
+  const distinctKinds = [...new Set(commands.map(driverKindForCommand))];
+  switch (distinctKinds.length) {
+    case 0:
+      return "none";
+    case 1:
+      return distinctKinds[0];
+    default:
+      return "mixed";
+  }
+}
--- a/src/reporting/compare.mjs
+++ b/src/reporting/compare.mjs
@ -44,6 +44,8 @@ const defaultThresholds = {
  heapSnapshotBytes: 50 * 1024 * 1024,
  resourcePeakCommandTreeRssMb: 100,
  resourcePeakGatewayRssMb: 100,
+  resourcePeakTrackedRssMb: 100,
+  resourceCpuPercentMaxTracked: 25,
  openclawTimelineParseErrors: 0,
  openclawSlowestSpanMs: 5000,
  openclawEventLoopMaxMs: 250,
@ -457,6 +459,8 @@ function metricDeltas(baseline, current) {
    "nodeProfileTopFunctionMs",
    "heapSnapshotBytes",
    "resourceSampleCount",
+    "resourcePeakTrackedRssMb",
+    "resourceCpuPercentMaxTracked",
    "resourcePeakCommandTreeRssMb",
    "resourcePeakGatewayRssMb",
    "openclawTimelineEventCount",
--- a/src/reporting/report.mjs
+++ b/src/reporting/report.mjs
@ -237,7 +237,9 @@ export function renderMarkdownReport(report) {
          const expectedFailure = turn.expectedFailure ? "; expected failure observed " + turn.expectedFailureObserved : "";
          lines.push(`  - ${turn.label}: total ${turn.totalTurnMs ?? "unknown"} ms; pre-provider ${turn.preProviderMs ?? "unknown"} ms; provider ${turn.providerFinalMs ?? "unknown"} ms; post-provider ${turn.postProviderMs ?? "unknown"} ms; route ${route}; status ${status}; issue ${issue}; response ${turn.responseOk}; leaks ${turn.processLeakCount ?? "unknown"}${providerTiming}${expectedFailure}`);
          if (turn.gatewaySession) {
-            lines.push(`    - gateway session: create ${turn.gatewaySession.createSession}; session create ${turn.gatewaySession.sessionCreateDurationMs ?? "n/a"} ms; send ${turn.gatewaySession.sendDurationMs ?? "unknown"} ms; first assistant ${turn.gatewaySession.timeToFirstAssistantMs ?? "unknown"} ms; matched assistant ${turn.gatewaySession.timeToMatchedAssistantMs ?? "unknown"} ms; polls ${turn.gatewaySession.historyPollCount ?? "unknown"} (${turn.gatewaySession.historyErrorCount ?? "unknown"} errors)`);
+            const transport = turn.gatewaySession.gatewayTransportKind ?? "unknown";
+            const fallback = turn.gatewaySession.gatewayTransportFallbackReason ? `; fallback ${turn.gatewaySession.gatewayTransportFallbackReason}` : "";
+            lines.push(`    - gateway session: transport ${transport}${fallback}; create ${turn.gatewaySession.createSession}; session create ${turn.gatewaySession.sessionCreateDurationMs ?? "n/a"} ms; send ${turn.gatewaySession.sendDurationMs ?? "unknown"} ms; first assistant ${turn.gatewaySession.timeToFirstAssistantMs ?? "unknown"} ms; matched assistant ${turn.gatewaySession.timeToMatchedAssistantMs ?? "unknown"} ms; polls ${turn.gatewaySession.historyPollCount ?? "unknown"} (${turn.gatewaySession.historyErrorCount ?? "unknown"} errors)`);
          }
          if (turn.turnDiagnostics) {
            lines.push(`    - active window: metadata scans ${turn.metadataScanCount ?? "unknown"} (${turn.metadataScanTotalMs ?? "unknown"} ms total, max ${turn.metadataScanMaxMs ?? "unknown"} ms); event-loop samples ${turn.turnDiagnostics.eventLoop?.sampleCount ?? "unknown"} max ${turn.eventLoopMaxMs ?? "unknown"} ms`);
@ -703,6 +705,11 @@ function summarizeMeasurements(measurements) {
  return {
    peakRssMb: measurements.peakRssMb ?? null,
    cpuPercentMax: measurements.cpuPercentMax ?? null,
+    measurementScopeSummary: measurements.measurementScopeSummary ?? null,
+    resourceMeasurementScope: measurements.resourceMeasurementScope ?? null,
+    resourcePrimaryRole: measurements.resourcePrimaryRole ?? null,
+    resourcePeakTrackedRssMb: measurements.resourcePeakTrackedRssMb ?? null,
+    resourceCpuPercentMaxTracked: measurements.resourceCpuPercentMaxTracked ?? null,
    health: measurements.health ?? null,
    missingDependencyErrors: measurements.missingDependencyErrors ?? null,
    pluginLoadFailures: measurements.pluginLoadFailures ?? null,
@ -719,6 +726,8 @@ function summarizeMeasurements(measurements) {
    openclawSlowestSpanMs: measurements.openclawSlowestSpanMs ?? null,
    openclawOpenSpanCount: measurements.openclawOpenSpanCount ?? null,
    openclawOpenRequiredSpanCount: measurements.openclawOpenRequiredSpanCount ?? null,
+    openclawMissingRequiredSpanCount: measurements.openclawMissingRequiredSpanCount ?? null,
+    openclawMissingRequiredSpans: measurements.openclawMissingRequiredSpans ?? null,
    openclawOpenSpans: measurements.openclawOpenSpans ?? null,
    openclawKeySpans: measurements.openclawKeySpans ?? null,
    providerRequestCount: measurements.providerRequestCount ?? null,
--- a/src/runner.mjs
+++ b/src/runner.mjs
@ -14,6 +14,7 @@ import { collectEnvMetrics, collectNodeProfileMetrics } from "./metrics.mjs";
 import { collectorArtifactDirs, prepareCollectorArtifactDirs } from "./collectors/artifacts.mjs";
 import { collectProviderEvidence } from "./collectors/provider.mjs";
 import { evaluateRecord } from "./evaluator.mjs";
+import { driverKindForCommand, measurementScopeForPhase, normalizeMeasurementScope, phaseDriverKind } from "./measurement-contract.mjs";
 import { artifactsDir } from "./paths.mjs";
 import { repoRoot } from "./paths.mjs";
 import { assertKovaEnvName, assertSafeScenarioCommand } from "./safety.mjs";
@ -82,6 +83,8 @@ export async function executeScenario(scenario, context) {
        id: "target-setup",
        title: "Target Runtime Setup",
        intent: "Prepare the target OpenClaw runtime selector for the scenario.",
+        measurementScope: "harness",
+        driverKind: "ocm",
        commands: setupResults.map((result) => result.command),
        evidence: [],
        results: setupResults
@ -143,6 +146,8 @@ export async function executeScenario(scenario, context) {
          title: phase.title,
          intent: phase.intent,
          healthScope: phase.healthScope,
+          measurementScope: phaseMeasurementScope(phase),
+          driverKind: phaseDriverKind(phase, commands),
          expectedAgentFailure: phase.expectedAgentFailure === true,
          commands,
          evidence: phase.evidence ?? [],
@ -332,7 +337,7 @@ function buildPlannedPhases(scenario, context, envName, artifactDir, authPolicy)

  const authPreparePhase = buildAuthPreparePhase(authPolicy, artifactDir);
  if (authPreparePhase) {
-    phases.push(authPreparePhase);
+    phases.push(withPhaseContract(authPreparePhase, "harness"));
  }

  const preparePhase = buildStateLifecyclePhase(context, envName, scenario, "prepare", context.state?.prepare ?? [], artifactDir);
@ -344,20 +349,23 @@ function buildPlannedPhases(scenario, context, envName, artifactDir, authPolicy)
    if (phase.id === "cleanup") {
      continue;
    }
+    const commands = materializeScenarioPhaseCommands(phase, context, envName, artifactDir);
    phases.push({
      id: phase.id,
      title: phase.title,
      intent: phase.intent,
      healthScope: phase.healthScope,
+      measurementScope: phaseMeasurementScope(phase),
+      driverKind: phaseDriverKind(phase, commands),
      expectedAgentFailure: phase.expectedAgentFailure === true,
-      commands: materializeScenarioPhaseCommands(phase, context, envName, artifactDir),
+      commands,
      evidence: phase.evidence ?? []
    });

    if (phaseSupportsAuthSetup(phase, authPolicy) && !phases.some((planned) => planned.id === "auth-setup")) {
      const authSetupPhase = buildAuthSetupPhase(authPolicy, envName, artifactDir);
      if (authSetupPhase) {
-        phases.push(authSetupPhase);
+        phases.push(withPhaseContract(authSetupPhase, "harness"));
      }
    }

@ -378,7 +386,7 @@ function buildPlannedPhases(scenario, context, envName, artifactDir, authPolicy)
  if (!context.keepEnv) {
    const authCleanupPhase = buildAuthCleanupPhase(authPolicy, artifactDir);
    if (authCleanupPhase) {
-      phases.push(authCleanupPhase);
+      phases.push(withPhaseContract(authCleanupPhase, "cleanup"));
    }
    const cleanupPhase = buildStateLifecyclePhase(context, envName, scenario, "cleanup", context.state?.cleanup ?? [], artifactDir);
    if (cleanupPhase) {
@ -388,6 +396,8 @@ function buildPlannedPhases(scenario, context, envName, artifactDir, authPolicy)
      id: "env-cleanup",
      title: "Environment Cleanup",
      intent: "Destroy the disposable Kova env after the scenario finishes.",
+      measurementScope: "cleanup",
+      driverKind: "ocm",
      commands: [ocmEnvDestroy(envName)],
      evidence: ["temporary env destroyed"]
    });
@ -405,6 +415,8 @@ function buildTargetSetupPhase(context, envName) {
    id: "target-setup",
    title: "Target Runtime Setup",
    intent: "Prepare the target OpenClaw runtime selector for the scenario.",
+    measurementScope: "harness",
+    driverKind: "ocm",
    commands: [targetSetupCommand(context.targetPlan)],
    evidence: [`local-build runtime ${context.targetPlan.runtimeName}`, `kova env ${envName}`]
  };
@ -426,6 +438,8 @@ function buildStateLifecyclePhase(context, envName, scenario, kind, steps, artif
    id: kind,
    title: stateLifecycleTitle(context.state?.id, kind, phaseId),
    intent: stateLifecycleIntent(context.state?.id, kind, phaseId),
+    measurementScope: normalizeMeasurementScope(null, kind),
+    driverKind: phaseDriverKind(null, commands),
    commands,
    evidence,
    scenario: scenario.id
@ -459,6 +473,8 @@ async function executeStateLifecycleSteps(context, envName, scenario, kind, step
    id: kind,
    title: stateLifecycleTitle(context.state?.id, kind, phaseId),
    intent: stateLifecycleIntent(context.state?.id, kind, phaseId),
+    measurementScope: normalizeMeasurementScope(null, kind),
+    driverKind: phaseDriverKind(null, commands),
    commands,
    evidence,
    results,
@ -476,6 +492,8 @@ async function executeAuthPhase(phase, context, envName, artifactDir, authPolicy
  }
  return {
    ...phase,
+    measurementScope: normalizeMeasurementScope(phase.measurementScope, phase.id),
+    driverKind: phaseDriverKind(phase),
    results,
    metrics: await collectEnvMetrics(envName, metricOptions(context, null, { id: phase.id }, artifactDir))
  };
@ -569,7 +587,7 @@ async function executeTargetSetup(context, envName, artifactDir) {
  }

  const results = [
-    await runCommand(targetSetupCommand(context.targetPlan), {
+    tagCommandResult(await runCommand(targetSetupCommand(context.targetPlan), {
      timeoutMs: context.timeoutMs,
      env: { KOVA_ENV_NAME: envName },
      resourceSample: context.resourceSampling === false ? null : {
@ -578,7 +596,7 @@ async function executeTargetSetup(context, envName, artifactDir) {
        processRoles: context.processRoles ?? [],
        artifactPath: join(collectorArtifactDirs(artifactDir).resourceSamples, "target-setup-1.jsonl")
      }
-    })
+    }), "target-setup")
  ];
  if (results.every((result) => result.status === 0) && context.targetSetup) {
    context.targetSetup.completed = true;
@ -617,6 +635,7 @@ async function runScenarioCommand(command, context, envName, artifactDir, phaseI
      artifactPath: join(collectorArtifactDirs(artifactDir).resourceSamples, `${safeSegment(phaseId)}-${commandIndex + 1}.jsonl`)
    }
  });
+  tagCommandResult(result, phaseId);
  if (agentCommand) {
    await sleep(1000);
    const afterSnapshot = captureProcessSnapshot(snapshotOptions);
@ -638,6 +657,28 @@ async function runScenarioCommand(command, context, envName, artifactDir, phaseI
  return result;
 }

+/**
+ * Runner-local alias for `measurementScopeForPhase`.
+ *
+ * @param {object} phase - Scenario phase definition.
+ * @returns {string} Resolved measurement scope for the phase.
+ */
+function phaseMeasurementScope(phase) {
+  const scope = measurementScopeForPhase(phase);
+  return scope;
+}
+
+/**
+ * Returns a shallow copy of a phase with `measurementScope` and `driverKind`
+ * filled in.
+ *
+ * The scope comes from `scope` when given, otherwise from the phase's own
+ * `measurementScope`; either way it is normalized against the phase id.
+ *
+ * @param {object} phase - Planned phase; not mutated.
+ * @param {string|null} [scope=null] - Scope to apply in place of the phase's own.
+ * @returns {object} Copy of the phase carrying the contract fields.
+ */
+function withPhaseContract(phase, scope = null) {
+  const requestedScope = scope ?? phase.measurementScope;
+  const measurementScope = normalizeMeasurementScope(requestedScope, phase.id);
+  const driverKind = phaseDriverKind(phase);
+  return { ...phase, measurementScope, driverKind };
+}
+
+/**
+ * Stamps a command result with its measurement scope and driver kind.
+ *
+ * The scope is resolved as if the result were a one-command phase with the
+ * given id, so a known scope already on the result is kept. The result object
+ * is mutated and returned.
+ *
+ * @param {object} result - Command result; `command` and `measurementScope` are read.
+ * @param {string} phaseId - Id of the phase that ran the command.
+ * @returns {object} The same `result` object, now tagged.
+ */
+function tagCommandResult(result, phaseId) {
+  const singleCommandPhase = {
+    id: phaseId,
+    measurementScope: result.measurementScope,
+    commands: [result.command]
+  };
+  result.measurementScope = measurementScopeForPhase(singleCommandPhase);
+  result.driverKind = driverKindForCommand(result.command);
+  return result;
+}
+
 function isAgentMessageCommand(command) {
  return (command.includes(" -- agent ") && command.includes("--message")) ||
    command.includes("run-concurrent-agent-turns.mjs") ||
--- a/src/selfcheck.mjs
+++ b/src/selfcheck.mjs
@ -608,6 +608,7 @@ function localBuildTargetSetupResourceExclusionCheck() {
      phases: [
        {
          id: "target-setup",
+          measurementScope: "harness",
          results: [{
            command: "ocm runtime build-local kova-local-test --repo /tmp/openclaw --force",
            status: 0,
@ -619,8 +620,23 @@ function localBuildTargetSetupResourceExclusionCheck() {
            })
          }]
        },
+        {
+          id: "auth-prepare",
+          measurementScope: "harness",
+          results: [{
+            command: "node support/mock-openai-server.mjs",
+            status: 0,
+            durationMs: 500,
+            resourceSamples: syntheticResourceSamples({
+              peakRssMb: 1900,
+              maxCpuPercent: 320,
+              role: "mock-provider"
+            })
+          }]
+        },
        {
          id: "scenario-command",
+          measurementScope: "product",
          results: [{
            command: "ocm @kova-self-check -- status",
            status: 0,
@ -630,6 +646,29 @@ function localBuildTargetSetupResourceExclusionCheck() {
              maxCpuPercent: 20,
              role: "gateway"
            })
+          }, {
+            command: "node support/kova-helper.mjs",
+            status: 0,
+            durationMs: 100,
+            resourceSamples: syntheticResourceSamples({
+              peakRssMb: 600,
+              maxCpuPercent: 30,
+              role: "command-tree"
+            })
+          }]
+        },
+        {
+          id: "auth-cleanup",
+          measurementScope: "cleanup",
+          results: [{
+            command: "kill $(cat mock/pid)",
+            status: 0,
+            durationMs: 50,
+            resourceSamples: syntheticResourceSamples({
+              peakRssMb: 1800,
+              maxCpuPercent: 300,
+              role: "mock-provider"
+            })
          }]
        }
      ],
@ -638,14 +677,19 @@ function localBuildTargetSetupResourceExclusionCheck() {
        logs: zeroLogMetrics()
      }
    };
-    evaluateRecord(record, { thresholds: { peakRssMb: 900 } }, {
-      surface: { thresholds: {} },
+    evaluateRecord(record, { thresholds: { peakRssMb: 200 } }, {
+      surface: { thresholds: {}, resourcePrimaryRole: "gateway" },
      targetPlan: { kind: "local-build" }
    });
    assertEqual(record.status, "PASS", "local-build target setup resources ignored status");
    assertEqual(record.measurements.peakRssMb, 100, "local-build target setup resources ignored RSS");
+    assertEqual(record.measurements.resourcePeakTrackedRssMb, 600, "tracked product helper RSS retained separately");
+    assertEqual(record.measurements.resourcePrimaryRole, "gateway", "primary resource role retained");
    assertEqual(record.measurements.resourceByRole.gateway.peakRssMb, 100, "scenario role RSS retained");
    assertEqual(record.measurements.resourceByRole["build-tooling"], undefined, "target setup role excluded");
+    assertEqual(record.measurements.resourceByRole["mock-provider"], undefined, "harness auth resources excluded");
+    assertEqual(record.measurements.measurementScopeSummary.harnessCommandCount, 2, "harness command count");
+    assertEqual(record.measurements.measurementScopeSummary.cleanupCommandCount, 1, "cleanup command count");
    assertEqual(record.violations, undefined, "no-service local-build record has no gateway violation");
    return {
      id: "local-build-target-setup-resource-exclusion",
@ -2125,6 +2169,7 @@ function gatewaySessionTurnEvaluationCheck() {
      minAssistantCount: 1,
      sessionKey: "kova-dashboard-session-send",
      runId: "cold-run",
+      gatewayTransport: { kind: "direct-gateway-rpc", fallbackReason: null },
      activeStartedAtEpochMs: base + 1000,
      activeFinishedAtEpochMs: base + 2500,
      activeTurnMs: 1500,
@ -2150,6 +2195,7 @@ function gatewaySessionTurnEvaluationCheck() {
      minAssistantCount: 2,
      sessionKey: "kova-dashboard-session-send",
      runId: "warm-run",
+      gatewayTransport: { kind: "direct-gateway-rpc", fallbackReason: null },
      activeStartedAtEpochMs: base + 11000,
      activeFinishedAtEpochMs: base + 11800,
      activeTurnMs: 800,
@ -2284,6 +2330,7 @@ function gatewaySessionTurnEvaluationCheck() {
    assertEqual(record.measurements.agentEventLoopMaxMs, 9, "active-window event-loop max");
    assertEqual(record.measurements.agentSessionPollCount, 5, "session polling total");
    assertEqual(record.measurements.agentTurns[1].gatewaySession.createSession, false, "warm turn reuses session");
+    assertEqual(record.measurements.agentTurns[0].gatewaySession.gatewayTransportKind, "direct-gateway-rpc", "dashboard turn direct Gateway transport");

    const rendered = renderMarkdownReport({
      generatedAt: "2026-05-01T00:00:00.000Z",
@ -2295,8 +2342,57 @@ function gatewaySessionTurnEvaluationCheck() {
      summary: { statuses: { PASS: 1 } }
    });
    assertEqual(rendered.includes("gateway session:"), true, "markdown includes gateway session detail");
+    assertEqual(rendered.includes("transport direct-gateway-rpc"), true, "markdown includes direct Gateway transport");
    assertEqual(rendered.includes("active window:"), true, "markdown includes active turn diagnostics");

+    const fallbackPayload = {
+      ...coldPayload,
+      gatewayTransport: { kind: "shell", fallbackReason: "gateway-token-unavailable" }
+    };
+    const fallbackRecord = {
+      scenario: "dashboard-session-send-turn",
+      surface: "dashboard-session-send-turn",
+      title: "Gateway session shell fallback",
+      status: "PASS",
+      phases: [{
+        id: "cold-dashboard-session-turn",
+        title: "Cold Gateway Session Turn",
+        intent: "Synthetic shell fallback",
+        commands: ["node support/run-dashboard-session-send-turn.mjs --create-session true"],
+        evidence: [],
+        results: [{
+          command: "node support/run-dashboard-session-send-turn.mjs --create-session true",
+          status: 0,
+          timedOut: false,
+          startedAt: new Date(base).toISOString(),
+          startedAtEpochMs: base,
+          finishedAt: new Date(base + 5000).toISOString(),
+          finishedAtEpochMs: base + 5000,
+          durationMs: 5000,
+          stdout: JSON.stringify(fallbackPayload),
+          stderr: ""
+        }],
+        metrics: { logs: zeroLogMetrics(), health: { ok: true } }
+      }],
+      providerEvidence: {
+        available: true,
+        requestCount: 1,
+        requests: [record.providerEvidence.requests[0]]
+      },
+      finalMetrics: { service: { gatewayState: "running" }, logs: zeroLogMetrics() }
+    };
+    evaluateRecord(fallbackRecord, {
+      id: "dashboard-session-send-turn",
+      agent: { expectedText: "KOVA_AGENT_OK" },
+      thresholds: {}
+    }, { surface: { thresholds: {} }, targetPlan: { kind: "runtime" } });
+    assertEqual(fallbackRecord.status, "FAIL", "dashboard session shell fallback rejected");
+    assertEqual(
+      fallbackRecord.violations.some((violation) => violation.metric === "gatewayTransport.kind"),
+      true,
+      "dashboard session shell fallback violation"
+    );
+
    return {
      id: "gateway-session-turn-evaluation",
      status: "PASS",
@ -4144,6 +4240,47 @@ function diagnosticsTimelineEvaluationCheck() {
      "missing diagnostic timeline violation"
    );

+    const missingSpanRecord = {
+      scenario: "diagnostic-missing-span",
+      status: "PASS",
+      phases: [],
+      finalMetrics: {
+        service: { gatewayState: "running" },
+        logs: zeroLogMetrics(),
+        timeline: {
+          available: true,
+          eventCount: 1,
+          parseErrorCount: 0,
+          openSpanCount: 0,
+          openSpans: [],
+          keySpans: {},
+          spanTotals: {
+            "gateway.startup": { count: 1, totalDurationMs: 100, maxDurationMs: 100 }
+          },
+          runtimeDeps: {},
+          eventLoop: {},
+          providers: {},
+          childProcesses: {}
+        }
+      }
+    };
+    evaluateRecord(missingSpanRecord, { thresholds: {} }, {
+      targetPlan: { kind: "local-build" },
+      profile: { id: "diagnostic", diagnostics: { timelineRequired: true } },
+      surface: {
+        id: "bundled-runtime-deps",
+        diagnostics: { expectedSpans: ["runtimeDeps.stage"] },
+        thresholds: {}
+      }
+    });
+    assertEqual(missingSpanRecord.status, "FAIL", "missing required span status");
+    assertEqual(missingSpanRecord.measurements.openclawMissingRequiredSpanCount, 1, "missing required span measurement");
+    assertEqual(
+      missingSpanRecord.violations.some((violation) => violation.metric === "openclawMissingRequiredSpanCount"),
+      true,
+      "missing required span violation"
+    );
+
    const openSpanRecord = {
      scenario: "diagnostic-open-span",
      status: "PASS",
--- a/support/run-dashboard-session-send-turn.mjs
+++ b/support/run-dashboard-session-send-turn.mjs
@ -24,7 +24,11 @@ try {
  const sessionKey = args["session-key"] ?? `kova-dashboard-${randomUUID()}`;
  const createSession = readBoolean(args["create-session"], true);
  const minAssistantCount = readPositiveInteger(args["min-assistant-count"], 1);
+  const allowShellFallback = readBoolean(args["allow-shell-fallback"], false);
  const gatewayTransport = await openDirectGatewayRpcClient(runtimeContext);
+  if (!gatewayTransport.client && !allowShellFallback) {
+    throw new Error(`direct Gateway RPC is required for dashboard-session-send-turn; fallback=${gatewayTransport.transport}; reason=${gatewayTransport.fallbackReason ?? "unknown"}`);
+  }

  try {
    let created = null;
--- a/surfaces/dashboard-session-send-turn.json
+++ b/surfaces/dashboard-session-send-turn.json
@ -10,6 +10,7 @@
    "agent-process",
    "mock-provider"
  ],
+  "resourcePrimaryRole": "gateway",
  "thresholds": {
    "agentTurnMs": 45000,
    "coldAgentTurnMs": 45000,