From a995487433583edea6ea97fd9e08a3c8e1163c5a Mon Sep 17 00:00:00 2001 From: Shakker Date: Thu, 7 May 2026 10:32:11 +0100 Subject: [PATCH] fix: separate gateway measurement scope --- src/evaluator.mjs | 127 ++++++++++++++++-- src/measurement-contract.mjs | 68 ++++++++++ src/reporting/compare.mjs | 4 + src/reporting/report.mjs | 11 +- src/runner.mjs | 53 +++++++- src/selfcheck.mjs | 141 +++++++++++++++++++- support/run-dashboard-session-send-turn.mjs | 4 + surfaces/dashboard-session-send-turn.json | 1 + 8 files changed, 389 insertions(+), 20 deletions(-) create mode 100644 src/measurement-contract.mjs diff --git a/src/evaluator.mjs b/src/evaluator.mjs index 794171e..8013333 100644 --- a/src/evaluator.mjs +++ b/src/evaluator.mjs @@ -11,6 +11,7 @@ import { computeProviderTurnAttribution } from "./collectors/provider.mjs"; import { summarizeRuntimeDepsLogs } from "./collectors/logs.mjs"; import { buildHealthMeasurement, healthReadinessClassification } from "./health.mjs"; import { resolveThresholdPolicy } from "./evaluation/thresholds.mjs"; +import { measuredProductPhase, measurementScopeForPhase, normalizeMeasurementScope } from "./measurement-contract.mjs"; import { checkAggregateThreshold, checkDuration, @@ -30,13 +31,18 @@ export function evaluateRecord(record, scenario, options = {}) { const roleThresholds = thresholdPolicy.roleThresholds; const violations = []; const allResults = collectResults(record); - const measuredResults = collectResults(record, { excludePhaseIds: ["target-setup"] }); + const measurementScopeSummary = summarizeMeasurementScopes(record); + const measuredResults = collectResults(record, { productOnly: true }); const resourceSummary = collectResourceSummary(measuredResults); - const peakRssMb = maxNullable( - collectPeakRss(record, { excludePhaseIds: ["target-setup"] }), + const primaryResourceRole = options.surface?.resourcePrimaryRole ?? null; + const primaryRoleResources = primaryResourceRole ? 
resourceSummary.byRole[primaryResourceRole] : null; + const peakTrackedRssMb = maxNullable( + collectPeakRss(record, { productOnly: true }), resourceSummary.peakTotalRssMb ); - const cpuPercentMax = maxNullable(collectCpuPercentMax(record), resourceSummary.maxTotalCpuPercent); + const cpuPercentMaxTracked = maxNullable(collectCpuPercentMax(record, { productOnly: true }), resourceSummary.maxTotalCpuPercent); + const peakRssMb = typeof primaryRoleResources?.peakRssMb === "number" ? primaryRoleResources.peakRssMb : peakTrackedRssMb; + const cpuPercentMax = typeof primaryRoleResources?.maxCpuPercent === "number" ? primaryRoleResources.maxCpuPercent : cpuPercentMaxTracked; const missingDependencyErrors = countMissingDependencyErrors(allResults) + countLogMetric(record, "missingDependencyErrors"); const pluginLoadFailures = countLogMetric(record, "pluginLoadFailures"); const metadataScanMentions = countLogMetric(record, "metadataScanMentions"); @@ -67,6 +73,7 @@ export function evaluateRecord(record, scenario, options = {}) { const timelineRequirement = timelineRequirementFor(options); const requiredOpenSpans = requiredTimelineSpans(options); const openRequiredSpans = timelineSummary.openSpans.filter((span) => requiredOpenSpans.has(span.name)); + const missingRequiredSpans = missingTimelineSpans(timelineSummary, requiredOpenSpans); const runtimeDepsStagingMs = maxNullable( openclawDiagnostics.runtimeDepsStagingMs, timelineSummary.runtimeDepsStageMaxMs, @@ -719,6 +726,18 @@ export function evaluateRecord(record, scenario, options = {}) { }); } + if (timelineSummary.available && missingRequiredSpans.length > 0) { + violations.push({ + kind: "diagnostics", + metric: "openclawMissingRequiredSpanCount", + expected: "0", + actual: missingRequiredSpans.length, + message: `${missingRequiredSpans.length} required OpenClaw diagnostics span(s) were not observed: ${missingRequiredSpans.slice(0, 5).join(", ")}` + }); + } + + checkGatewaySessionTransport(violations, agentTurns, 
scenario); + if (agentResponseOk === false) { violations.push({ kind: "agent", @@ -737,6 +756,11 @@ export function evaluateRecord(record, scenario, options = {}) { record.measurements = { peakRssMb, cpuPercentMax, + measurementScopeSummary, + resourceMeasurementScope: "product", + resourcePrimaryRole: primaryResourceRole, + resourcePeakTrackedRssMb: peakTrackedRssMb, + resourceCpuPercentMaxTracked: cpuPercentMaxTracked, coldReadyMs, warmReadyMs, upgradeMs, @@ -917,6 +941,8 @@ export function evaluateRecord(record, scenario, options = {}) { openclawRepeatedSpanCount: timelineSummary.repeatedSpanCount, openclawOpenSpanCount: timelineSummary.openSpanCount, openclawOpenRequiredSpanCount: openRequiredSpans.length, + openclawMissingRequiredSpanCount: missingRequiredSpans.length, + openclawMissingRequiredSpans: missingRequiredSpans, openclawOpenSpans: timelineSummary.openSpans, openclawKeySpans: timelineSummary.keySpans, openclawEventLoopMaxMs: timelineSummary.eventLoopMaxMs, @@ -1089,6 +1115,29 @@ function preferredPreProviderAttributionSummary(...summaries) { return summaries.find((summary) => summary?.count > 0) ?? summaries[0]; } +function checkGatewaySessionTransport(violations, agentTurns, scenario) { + if (scenario.id !== "dashboard-session-send-turn") { + return; + } + for (const turn of agentTurns) { + if (!turn.gatewaySession) { + continue; + } + const transport = turn.gatewaySession.gatewayTransportKind; + if (transport === "direct-gateway-rpc") { + continue; + } + violations.push({ + kind: "harness", + metric: "gatewayTransport.kind", + expected: "direct-gateway-rpc", + actual: transport ?? "unknown", + phaseId: turn.phaseId, + message: `dashboard session benchmark used ${transport ?? "unknown"} transport; direct Gateway RPC is required for Gateway product measurement${turn.gatewaySession.gatewayTransportFallbackReason ? 
// Span families whose activity may be recorded under dotted child names
// (for example "reply.<something>") instead of the bare family name.
const PREFIX_MATCHED_TIMELINE_SPANS = new Set(["gateway.chat_send", "auto_reply", "reply", "models.catalog"]);

/**
 * Lists the required span names that were never observed in the timeline.
 *
 * @param {object} timelineSummary - Aggregated timeline summary (keySpans, spanTotals, openSpans).
 * @param {Iterable<string>} requiredSpans - Span names that must have been observed.
 * @returns {string[]} Required span names with no observation, in iteration order.
 */
function missingTimelineSpans(timelineSummary, requiredSpans) {
  return [...requiredSpans].filter((name) => !timelineSpanObserved(timelineSummary, name));
}

/**
 * Reports whether a span name shows any activity in the timeline summary.
 *
 * A span counts as observed when its entry in `keySpans` or `spanTotals` has a
 * positive `count` or `openCount`, when it appears in `openSpans`, or (for the
 * prefix-matched families only) when a dotted child span has a positive count.
 *
 * @param {object} timelineSummary - Aggregated timeline summary.
 * @param {string} name - Span name to look for.
 * @returns {boolean} True when the span was observed.
 */
function timelineSpanObserved(timelineSummary, name) {
  const hasActivity = (summary) => (summary?.count ?? 0) > 0 || (summary?.openCount ?? 0) > 0;

  // Check both sources independently: a zero-count keySpans entry must not
  // hide a populated spanTotals entry for the same span name.
  if (hasActivity(timelineSummary.keySpans?.[name]) || hasActivity(timelineSummary.spanTotals?.[name])) {
    return true;
  }
  if ((timelineSummary.openSpans ?? []).some((span) => span.name === name)) {
    return true;
  }
  if (!PREFIX_MATCHED_TIMELINE_SPANS.has(name)) {
    return false;
  }

  // The exact-name entry was already rejected above, so only dotted children
  // with a positive count can still prove the family was observed.
  const childPrefix = `${name}.`;
  return Object.entries(timelineSummary.spanTotals ?? {}).some(
    ([spanName, summary]) => spanName.startsWith(childPrefix) && (summary?.count ?? 0) > 0
  );
}
rss : Math.max(peak, rss); @@ -2720,8 +2820,8 @@ function collectTimelineSummary(record) { let repeatedSpanCount = 0; let runtimeDepsStageMaxMs = null; let slowestRuntimeDepsPlugin = null; - let openSpanCount = 0; - let openSpans = []; + let latestOpenSpanCount = 0; + let latestOpenSpans = []; let latestEventCount = -1; let events = []; let turnAttributionEvents = []; @@ -2734,6 +2834,10 @@ function collectTimelineSummary(record) { latestEventCount = timeline.eventCount ?? 0; events = timeline.events; turnAttributionEvents = Array.isArray(timeline.turnAttributionEvents) ? timeline.turnAttributionEvents : []; + latestOpenSpanCount = timeline.openSpanCount ?? timeline.openSpans?.length ?? 0; + latestOpenSpans = [...(timeline.openSpans ?? [])] + .toSorted((left, right) => (right.ageMs ?? -1) - (left.ageMs ?? -1)) + .slice(0, 25); } for (const artifact of timeline.artifacts ?? []) { artifacts.add(artifact); @@ -2742,8 +2846,6 @@ function collectTimelineSummary(record) { parseErrorCount = Math.max(parseErrorCount, timeline.parseErrorCount ?? 0); childProcessFailedCount = Math.max(childProcessFailedCount, timeline.childProcesses?.failedCount ?? 0); repeatedSpanCount = Math.max(repeatedSpanCount, timeline.repeatedSpans?.length ?? 0); - openSpanCount = Math.max(openSpanCount, timeline.openSpanCount ?? timeline.openSpans?.length ?? 0); - openSpans = mergeOpenSpans(openSpans, timeline.openSpans ?? []); mergeKeySpans(keySpans, timeline.keySpans ?? {}); mergeSpanTotals(spanTotals, timeline.spanTotals ?? {}); eventLoopMaxMs = maxNullable(eventLoopMaxMs, timeline.eventLoop?.maxMs); @@ -2775,8 +2877,8 @@ function collectTimelineSummary(record) { slowestSpanName: slowestSpan?.name ?? null, slowestSpanMs: slowestSpan?.durationMs ?? 
/** The closed set of measurement scopes a phase or command result may carry. */
export const MEASUREMENT_SCOPES = new Set(["product", "harness", "cleanup"]);

// Phase ids that default to the "harness" scope when no valid scope is declared.
const HARNESS_PHASE_IDS = new Set(["target-setup", "auth-prepare", "auth-setup", "prepare"]);
// Phase ids that default to the "cleanup" scope when no valid scope is declared.
const CLEANUP_PHASE_IDS = new Set(["cleanup", "auth-cleanup", "env-cleanup"]);
// Matches a standalone `--no-service` flag (not a longer flag sharing the prefix).
const NO_SERVICE_FLAG = /(?:^|\s)--no-service(?:\s|$)/;

// Ordered command classifiers; the first matching rule decides the driver kind.
const DRIVER_KIND_RULES = [
  [(text) => text.includes("run-dashboard-session-send-turn.mjs"), () => "gateway-rpc"],
  [(text) => text.includes("run-openai-compatible-turn.mjs"), () => "gateway-http"],
  [(text) => text.includes("run-tui-message-turn.mjs"), () => "gateway-rpc"],
  [
    (text) => /\bocm\s+@[^ ]+\s+--\s+agent\b/.test(text),
    (text) => (text.includes("--local") ? "openclaw-cli-local" : "openclaw-cli-gateway")
  ],
  [(text) => /\bocm\s+@[^ ]+\s+--\s+gateway\s+call\b/.test(text), () => "gateway-rpc-via-cli"],
  [(text) => /\bocm\b/.test(text), () => "ocm"],
  [(text) => /\bnode\b/.test(text), () => "kova-helper"]
];

/**
 * Resolves a measurement scope, falling back to a phase-id based default.
 *
 * @param {*} value - Declared scope; returned unchanged when it is a known scope.
 * @param {string|null} [phaseId] - Phase id used to pick a default scope.
 * @returns {string} One of "product", "harness" or "cleanup".
 */
export function normalizeMeasurementScope(value, phaseId = null) {
  if (MEASUREMENT_SCOPES.has(value)) {
    return value;
  }
  const isHarnessPhase = HARNESS_PHASE_IDS.has(phaseId) || phaseId?.startsWith("state-");
  if (isHarnessPhase) {
    return "harness";
  }
  return CLEANUP_PHASE_IDS.has(phaseId) ? "cleanup" : "product";
}

/**
 * Tells whether a phase resolves to the "product" measurement scope.
 *
 * @param {object|null|undefined} phase - Phase descriptor.
 * @returns {boolean} True when the phase scope is "product".
 */
export function measuredProductPhase(phase) {
  const scope = measurementScopeForPhase(phase);
  return scope === "product";
}

/**
 * Determines the measurement scope of a phase.
 *
 * A valid declared scope wins. Otherwise a "provision" phase whose commands
 * carry a standalone `--no-service` flag is "harness"; everything else falls
 * back to the phase-id defaults of `normalizeMeasurementScope`.
 *
 * @param {object|null|undefined} phase - Phase descriptor (id, measurementScope, commands).
 * @returns {string} One of "product", "harness" or "cleanup".
 */
export function measurementScopeForPhase(phase) {
  const declaredScope = phase?.measurementScope;
  if (MEASUREMENT_SCOPES.has(declaredScope)) {
    return declaredScope;
  }
  const provisionsWithoutService =
    phase?.id === "provision" && (phase.commands ?? []).some((command) => NO_SERVICE_FLAG.test(command));
  return provisionsWithoutService ? "harness" : normalizeMeasurementScope(declaredScope, phase?.id);
}

/**
 * Classifies a command line by the driver it uses.
 *
 * @param {*} command - Command line; null/undefined is treated as an empty string.
 * @returns {string} Driver kind, or "unknown" when no rule matches.
 */
export function driverKindForCommand(command) {
  const text = String(command ?? "");
  for (const [matches, resolveKind] of DRIVER_KIND_RULES) {
    if (matches(text)) {
      return resolveKind(text);
    }
  }
  return "unknown";
}

/**
 * Derives the driver kind of a phase from its declared kind or its commands.
 *
 * @param {object|null|undefined} phase - Phase descriptor; a truthy `driverKind` wins.
 * @param {string[]} [commands] - Commands to classify; defaults to `phase.commands`.
 * @returns {string} The single shared kind, "none" for no commands, or "mixed".
 */
export function phaseDriverKind(phase, commands = phase?.commands ?? []) {
  if (phase?.driverKind) {
    return phase.driverKind;
  }
  const distinctKinds = [...new Set(commands.map(driverKindForCommand))];
  switch (distinctKinds.length) {
    case 0:
      return "none";
    case 1:
      return distinctKinds[0];
    default:
      return "mixed";
  }
}
"; expected failure observed " + turn.expectedFailureObserved : ""; lines.push(` - ${turn.label}: total ${turn.totalTurnMs ?? "unknown"} ms; pre-provider ${turn.preProviderMs ?? "unknown"} ms; provider ${turn.providerFinalMs ?? "unknown"} ms; post-provider ${turn.postProviderMs ?? "unknown"} ms; route ${route}; status ${status}; issue ${issue}; response ${turn.responseOk}; leaks ${turn.processLeakCount ?? "unknown"}${providerTiming}${expectedFailure}`); if (turn.gatewaySession) { - lines.push(` - gateway session: create ${turn.gatewaySession.createSession}; session create ${turn.gatewaySession.sessionCreateDurationMs ?? "n/a"} ms; send ${turn.gatewaySession.sendDurationMs ?? "unknown"} ms; first assistant ${turn.gatewaySession.timeToFirstAssistantMs ?? "unknown"} ms; matched assistant ${turn.gatewaySession.timeToMatchedAssistantMs ?? "unknown"} ms; polls ${turn.gatewaySession.historyPollCount ?? "unknown"} (${turn.gatewaySession.historyErrorCount ?? "unknown"} errors)`); + const transport = turn.gatewaySession.gatewayTransportKind ?? "unknown"; + const fallback = turn.gatewaySession.gatewayTransportFallbackReason ? `; fallback ${turn.gatewaySession.gatewayTransportFallbackReason}` : ""; + lines.push(` - gateway session: transport ${transport}${fallback}; create ${turn.gatewaySession.createSession}; session create ${turn.gatewaySession.sessionCreateDurationMs ?? "n/a"} ms; send ${turn.gatewaySession.sendDurationMs ?? "unknown"} ms; first assistant ${turn.gatewaySession.timeToFirstAssistantMs ?? "unknown"} ms; matched assistant ${turn.gatewaySession.timeToMatchedAssistantMs ?? "unknown"} ms; polls ${turn.gatewaySession.historyPollCount ?? "unknown"} (${turn.gatewaySession.historyErrorCount ?? "unknown"} errors)`); } if (turn.turnDiagnostics) { lines.push(` - active window: metadata scans ${turn.metadataScanCount ?? "unknown"} (${turn.metadataScanTotalMs ?? "unknown"} ms total, max ${turn.metadataScanMaxMs ?? 
"unknown"} ms); event-loop samples ${turn.turnDiagnostics.eventLoop?.sampleCount ?? "unknown"} max ${turn.eventLoopMaxMs ?? "unknown"} ms`); @@ -703,6 +705,11 @@ function summarizeMeasurements(measurements) { return { peakRssMb: measurements.peakRssMb ?? null, cpuPercentMax: measurements.cpuPercentMax ?? null, + measurementScopeSummary: measurements.measurementScopeSummary ?? null, + resourceMeasurementScope: measurements.resourceMeasurementScope ?? null, + resourcePrimaryRole: measurements.resourcePrimaryRole ?? null, + resourcePeakTrackedRssMb: measurements.resourcePeakTrackedRssMb ?? null, + resourceCpuPercentMaxTracked: measurements.resourceCpuPercentMaxTracked ?? null, health: measurements.health ?? null, missingDependencyErrors: measurements.missingDependencyErrors ?? null, pluginLoadFailures: measurements.pluginLoadFailures ?? null, @@ -719,6 +726,8 @@ function summarizeMeasurements(measurements) { openclawSlowestSpanMs: measurements.openclawSlowestSpanMs ?? null, openclawOpenSpanCount: measurements.openclawOpenSpanCount ?? null, openclawOpenRequiredSpanCount: measurements.openclawOpenRequiredSpanCount ?? null, + openclawMissingRequiredSpanCount: measurements.openclawMissingRequiredSpanCount ?? null, + openclawMissingRequiredSpans: measurements.openclawMissingRequiredSpans ?? null, openclawOpenSpans: measurements.openclawOpenSpans ?? null, openclawKeySpans: measurements.openclawKeySpans ?? null, providerRequestCount: measurements.providerRequestCount ?? 
null, diff --git a/src/runner.mjs b/src/runner.mjs index d6bc1e8..5f48659 100644 --- a/src/runner.mjs +++ b/src/runner.mjs @@ -14,6 +14,7 @@ import { collectEnvMetrics, collectNodeProfileMetrics } from "./metrics.mjs"; import { collectorArtifactDirs, prepareCollectorArtifactDirs } from "./collectors/artifacts.mjs"; import { collectProviderEvidence } from "./collectors/provider.mjs"; import { evaluateRecord } from "./evaluator.mjs"; +import { driverKindForCommand, measurementScopeForPhase, normalizeMeasurementScope, phaseDriverKind } from "./measurement-contract.mjs"; import { artifactsDir } from "./paths.mjs"; import { repoRoot } from "./paths.mjs"; import { assertKovaEnvName, assertSafeScenarioCommand } from "./safety.mjs"; @@ -82,6 +83,8 @@ export async function executeScenario(scenario, context) { id: "target-setup", title: "Target Runtime Setup", intent: "Prepare the target OpenClaw runtime selector for the scenario.", + measurementScope: "harness", + driverKind: "ocm", commands: setupResults.map((result) => result.command), evidence: [], results: setupResults @@ -143,6 +146,8 @@ export async function executeScenario(scenario, context) { title: phase.title, intent: phase.intent, healthScope: phase.healthScope, + measurementScope: phaseMeasurementScope(phase), + driverKind: phaseDriverKind(phase, commands), expectedAgentFailure: phase.expectedAgentFailure === true, commands, evidence: phase.evidence ?? [], @@ -332,7 +337,7 @@ function buildPlannedPhases(scenario, context, envName, artifactDir, authPolicy) const authPreparePhase = buildAuthPreparePhase(authPolicy, artifactDir); if (authPreparePhase) { - phases.push(authPreparePhase); + phases.push(withPhaseContract(authPreparePhase, "harness")); } const preparePhase = buildStateLifecyclePhase(context, envName, scenario, "prepare", context.state?.prepare ?? 
[], artifactDir); @@ -344,20 +349,23 @@ function buildPlannedPhases(scenario, context, envName, artifactDir, authPolicy) if (phase.id === "cleanup") { continue; } + const commands = materializeScenarioPhaseCommands(phase, context, envName, artifactDir); phases.push({ id: phase.id, title: phase.title, intent: phase.intent, healthScope: phase.healthScope, + measurementScope: phaseMeasurementScope(phase), + driverKind: phaseDriverKind(phase, commands), expectedAgentFailure: phase.expectedAgentFailure === true, - commands: materializeScenarioPhaseCommands(phase, context, envName, artifactDir), + commands, evidence: phase.evidence ?? [] }); if (phaseSupportsAuthSetup(phase, authPolicy) && !phases.some((planned) => planned.id === "auth-setup")) { const authSetupPhase = buildAuthSetupPhase(authPolicy, envName, artifactDir); if (authSetupPhase) { - phases.push(authSetupPhase); + phases.push(withPhaseContract(authSetupPhase, "harness")); } } @@ -378,7 +386,7 @@ function buildPlannedPhases(scenario, context, envName, artifactDir, authPolicy) if (!context.keepEnv) { const authCleanupPhase = buildAuthCleanupPhase(authPolicy, artifactDir); if (authCleanupPhase) { - phases.push(authCleanupPhase); + phases.push(withPhaseContract(authCleanupPhase, "cleanup")); } const cleanupPhase = buildStateLifecyclePhase(context, envName, scenario, "cleanup", context.state?.cleanup ?? 
[], artifactDir); if (cleanupPhase) { @@ -388,6 +396,8 @@ function buildPlannedPhases(scenario, context, envName, artifactDir, authPolicy) id: "env-cleanup", title: "Environment Cleanup", intent: "Destroy the disposable Kova env after the scenario finishes.", + measurementScope: "cleanup", + driverKind: "ocm", commands: [ocmEnvDestroy(envName)], evidence: ["temporary env destroyed"] }); @@ -405,6 +415,8 @@ function buildTargetSetupPhase(context, envName) { id: "target-setup", title: "Target Runtime Setup", intent: "Prepare the target OpenClaw runtime selector for the scenario.", + measurementScope: "harness", + driverKind: "ocm", commands: [targetSetupCommand(context.targetPlan)], evidence: [`local-build runtime ${context.targetPlan.runtimeName}`, `kova env ${envName}`] }; @@ -426,6 +438,8 @@ function buildStateLifecyclePhase(context, envName, scenario, kind, steps, artif id: kind, title: stateLifecycleTitle(context.state?.id, kind, phaseId), intent: stateLifecycleIntent(context.state?.id, kind, phaseId), + measurementScope: normalizeMeasurementScope(null, kind), + driverKind: phaseDriverKind(null, commands), commands, evidence, scenario: scenario.id @@ -459,6 +473,8 @@ async function executeStateLifecycleSteps(context, envName, scenario, kind, step id: kind, title: stateLifecycleTitle(context.state?.id, kind, phaseId), intent: stateLifecycleIntent(context.state?.id, kind, phaseId), + measurementScope: normalizeMeasurementScope(null, kind), + driverKind: phaseDriverKind(null, commands), commands, evidence, results, @@ -476,6 +492,8 @@ async function executeAuthPhase(phase, context, envName, artifactDir, authPolicy } return { ...phase, + measurementScope: normalizeMeasurementScope(phase.measurementScope, phase.id), + driverKind: phaseDriverKind(phase), results, metrics: await collectEnvMetrics(envName, metricOptions(context, null, { id: phase.id }, artifactDir)) }; @@ -569,7 +587,7 @@ async function executeTargetSetup(context, envName, artifactDir) { } const results 
/**
 * Resolves the measurement scope for a scenario phase.
 * Thin wrapper around `measurementScopeForPhase`.
 */
function phaseMeasurementScope(phase) {
  const scope = measurementScopeForPhase(phase);
  return scope;
}

/**
 * Returns a copy of `phase` carrying a normalized `measurementScope` and a
 * resolved `driverKind`. An explicit `scope` takes precedence over the scope
 * already declared on the phase; the input phase is not mutated.
 */
function withPhaseContract(phase, scope = null) {
  const requestedScope = scope ?? phase.measurementScope;
  const measurementScope = normalizeMeasurementScope(requestedScope, phase.id);
  const driverKind = phaseDriverKind(phase);
  return { ...phase, measurementScope, driverKind };
}

/**
 * Stamps a command result, in place, with its measurement scope and driver
 * kind, then returns the same object. The scope is resolved from a synthetic
 * single-command phase built from `phaseId` and the result's own command.
 */
function tagCommandResult(result, phaseId) {
  const syntheticPhase = {
    id: phaseId,
    measurementScope: result.measurementScope,
    commands: [result.command]
  };
  const scope = measurementScopeForPhase(syntheticPhase);
  result.measurementScope = scope;
  result.driverKind = driverKindForCommand(result.command);
  return result;
}
function localBuildTargetSetupResourceExclusionCheck() { logs: zeroLogMetrics() } }; - evaluateRecord(record, { thresholds: { peakRssMb: 900 } }, { - surface: { thresholds: {} }, + evaluateRecord(record, { thresholds: { peakRssMb: 200 } }, { + surface: { thresholds: {}, resourcePrimaryRole: "gateway" }, targetPlan: { kind: "local-build" } }); assertEqual(record.status, "PASS", "local-build target setup resources ignored status"); assertEqual(record.measurements.peakRssMb, 100, "local-build target setup resources ignored RSS"); + assertEqual(record.measurements.resourcePeakTrackedRssMb, 600, "tracked product helper RSS retained separately"); + assertEqual(record.measurements.resourcePrimaryRole, "gateway", "primary resource role retained"); assertEqual(record.measurements.resourceByRole.gateway.peakRssMb, 100, "scenario role RSS retained"); assertEqual(record.measurements.resourceByRole["build-tooling"], undefined, "target setup role excluded"); + assertEqual(record.measurements.resourceByRole["mock-provider"], undefined, "harness auth resources excluded"); + assertEqual(record.measurements.measurementScopeSummary.harnessCommandCount, 2, "harness command count"); + assertEqual(record.measurements.measurementScopeSummary.cleanupCommandCount, 1, "cleanup command count"); assertEqual(record.violations, undefined, "no-service local-build record has no gateway violation"); return { id: "local-build-target-setup-resource-exclusion", @@ -2125,6 +2169,7 @@ function gatewaySessionTurnEvaluationCheck() { minAssistantCount: 1, sessionKey: "kova-dashboard-session-send", runId: "cold-run", + gatewayTransport: { kind: "direct-gateway-rpc", fallbackReason: null }, activeStartedAtEpochMs: base + 1000, activeFinishedAtEpochMs: base + 2500, activeTurnMs: 1500, @@ -2150,6 +2195,7 @@ function gatewaySessionTurnEvaluationCheck() { minAssistantCount: 2, sessionKey: "kova-dashboard-session-send", runId: "warm-run", + gatewayTransport: { kind: "direct-gateway-rpc", fallbackReason: null }, 
activeStartedAtEpochMs: base + 11000, activeFinishedAtEpochMs: base + 11800, activeTurnMs: 800, @@ -2284,6 +2330,7 @@ function gatewaySessionTurnEvaluationCheck() { assertEqual(record.measurements.agentEventLoopMaxMs, 9, "active-window event-loop max"); assertEqual(record.measurements.agentSessionPollCount, 5, "session polling total"); assertEqual(record.measurements.agentTurns[1].gatewaySession.createSession, false, "warm turn reuses session"); + assertEqual(record.measurements.agentTurns[0].gatewaySession.gatewayTransportKind, "direct-gateway-rpc", "dashboard turn direct Gateway transport"); const rendered = renderMarkdownReport({ generatedAt: "2026-05-01T00:00:00.000Z", @@ -2295,8 +2342,57 @@ function gatewaySessionTurnEvaluationCheck() { summary: { statuses: { PASS: 1 } } }); assertEqual(rendered.includes("gateway session:"), true, "markdown includes gateway session detail"); + assertEqual(rendered.includes("transport direct-gateway-rpc"), true, "markdown includes direct Gateway transport"); assertEqual(rendered.includes("active window:"), true, "markdown includes active turn diagnostics"); + const fallbackPayload = { + ...coldPayload, + gatewayTransport: { kind: "shell", fallbackReason: "gateway-token-unavailable" } + }; + const fallbackRecord = { + scenario: "dashboard-session-send-turn", + surface: "dashboard-session-send-turn", + title: "Gateway session shell fallback", + status: "PASS", + phases: [{ + id: "cold-dashboard-session-turn", + title: "Cold Gateway Session Turn", + intent: "Synthetic shell fallback", + commands: ["node support/run-dashboard-session-send-turn.mjs --create-session true"], + evidence: [], + results: [{ + command: "node support/run-dashboard-session-send-turn.mjs --create-session true", + status: 0, + timedOut: false, + startedAt: new Date(base).toISOString(), + startedAtEpochMs: base, + finishedAt: new Date(base + 5000).toISOString(), + finishedAtEpochMs: base + 5000, + durationMs: 5000, + stdout: JSON.stringify(fallbackPayload), + 
stderr: "" + }], + metrics: { logs: zeroLogMetrics(), health: { ok: true } } + }], + providerEvidence: { + available: true, + requestCount: 1, + requests: [record.providerEvidence.requests[0]] + }, + finalMetrics: { service: { gatewayState: "running" }, logs: zeroLogMetrics() } + }; + evaluateRecord(fallbackRecord, { + id: "dashboard-session-send-turn", + agent: { expectedText: "KOVA_AGENT_OK" }, + thresholds: {} + }, { surface: { thresholds: {} }, targetPlan: { kind: "runtime" } }); + assertEqual(fallbackRecord.status, "FAIL", "dashboard session shell fallback rejected"); + assertEqual( + fallbackRecord.violations.some((violation) => violation.metric === "gatewayTransport.kind"), + true, + "dashboard session shell fallback violation" + ); + return { id: "gateway-session-turn-evaluation", status: "PASS", @@ -4144,6 +4240,47 @@ function diagnosticsTimelineEvaluationCheck() { "missing diagnostic timeline violation" ); + const missingSpanRecord = { + scenario: "diagnostic-missing-span", + status: "PASS", + phases: [], + finalMetrics: { + service: { gatewayState: "running" }, + logs: zeroLogMetrics(), + timeline: { + available: true, + eventCount: 1, + parseErrorCount: 0, + openSpanCount: 0, + openSpans: [], + keySpans: {}, + spanTotals: { + "gateway.startup": { count: 1, totalDurationMs: 100, maxDurationMs: 100 } + }, + runtimeDeps: {}, + eventLoop: {}, + providers: {}, + childProcesses: {} + } + } + }; + evaluateRecord(missingSpanRecord, { thresholds: {} }, { + targetPlan: { kind: "local-build" }, + profile: { id: "diagnostic", diagnostics: { timelineRequired: true } }, + surface: { + id: "bundled-runtime-deps", + diagnostics: { expectedSpans: ["runtimeDeps.stage"] }, + thresholds: {} + } + }); + assertEqual(missingSpanRecord.status, "FAIL", "missing required span status"); + assertEqual(missingSpanRecord.measurements.openclawMissingRequiredSpanCount, 1, "missing required span measurement"); + assertEqual( + missingSpanRecord.violations.some((violation) => 
violation.metric === "openclawMissingRequiredSpanCount"), + true, + "missing required span violation" + ); + const openSpanRecord = { scenario: "diagnostic-open-span", status: "PASS", diff --git a/support/run-dashboard-session-send-turn.mjs b/support/run-dashboard-session-send-turn.mjs index 82776ab..79c6e73 100755 --- a/support/run-dashboard-session-send-turn.mjs +++ b/support/run-dashboard-session-send-turn.mjs @@ -24,7 +24,11 @@ try { const sessionKey = args["session-key"] ?? `kova-dashboard-${randomUUID()}`; const createSession = readBoolean(args["create-session"], true); const minAssistantCount = readPositiveInteger(args["min-assistant-count"], 1); + const allowShellFallback = readBoolean(args["allow-shell-fallback"], false); const gatewayTransport = await openDirectGatewayRpcClient(runtimeContext); + if (!gatewayTransport.client && !allowShellFallback) { + throw new Error(`direct Gateway RPC is required for dashboard-session-send-turn; fallback=${gatewayTransport.transport}; reason=${gatewayTransport.fallbackReason ?? "unknown"}`); + } try { let created = null; diff --git a/surfaces/dashboard-session-send-turn.json b/surfaces/dashboard-session-send-turn.json index c12b4c0..d14ac53 100644 --- a/surfaces/dashboard-session-send-turn.json +++ b/surfaces/dashboard-session-send-turn.json @@ -10,6 +10,7 @@ "agent-process", "mock-provider" ], + "resourcePrimaryRole": "gateway", "thresholds": { "agentTurnMs": 45000, "coldAgentTurnMs": 45000,