fix: separate gateway measurement scope
This commit is contained in:
parent
9d9ff32d32
commit
a995487433
@ -11,6 +11,7 @@ import { computeProviderTurnAttribution } from "./collectors/provider.mjs";
|
||||
import { summarizeRuntimeDepsLogs } from "./collectors/logs.mjs";
|
||||
import { buildHealthMeasurement, healthReadinessClassification } from "./health.mjs";
|
||||
import { resolveThresholdPolicy } from "./evaluation/thresholds.mjs";
|
||||
import { measuredProductPhase, measurementScopeForPhase, normalizeMeasurementScope } from "./measurement-contract.mjs";
|
||||
import {
|
||||
checkAggregateThreshold,
|
||||
checkDuration,
|
||||
@ -30,13 +31,18 @@ export function evaluateRecord(record, scenario, options = {}) {
|
||||
const roleThresholds = thresholdPolicy.roleThresholds;
|
||||
const violations = [];
|
||||
const allResults = collectResults(record);
|
||||
const measuredResults = collectResults(record, { excludePhaseIds: ["target-setup"] });
|
||||
const measurementScopeSummary = summarizeMeasurementScopes(record);
|
||||
const measuredResults = collectResults(record, { productOnly: true });
|
||||
const resourceSummary = collectResourceSummary(measuredResults);
|
||||
const peakRssMb = maxNullable(
|
||||
collectPeakRss(record, { excludePhaseIds: ["target-setup"] }),
|
||||
const primaryResourceRole = options.surface?.resourcePrimaryRole ?? null;
|
||||
const primaryRoleResources = primaryResourceRole ? resourceSummary.byRole[primaryResourceRole] : null;
|
||||
const peakTrackedRssMb = maxNullable(
|
||||
collectPeakRss(record, { productOnly: true }),
|
||||
resourceSummary.peakTotalRssMb
|
||||
);
|
||||
const cpuPercentMax = maxNullable(collectCpuPercentMax(record), resourceSummary.maxTotalCpuPercent);
|
||||
const cpuPercentMaxTracked = maxNullable(collectCpuPercentMax(record, { productOnly: true }), resourceSummary.maxTotalCpuPercent);
|
||||
const peakRssMb = typeof primaryRoleResources?.peakRssMb === "number" ? primaryRoleResources.peakRssMb : peakTrackedRssMb;
|
||||
const cpuPercentMax = typeof primaryRoleResources?.maxCpuPercent === "number" ? primaryRoleResources.maxCpuPercent : cpuPercentMaxTracked;
|
||||
const missingDependencyErrors = countMissingDependencyErrors(allResults) + countLogMetric(record, "missingDependencyErrors");
|
||||
const pluginLoadFailures = countLogMetric(record, "pluginLoadFailures");
|
||||
const metadataScanMentions = countLogMetric(record, "metadataScanMentions");
|
||||
@ -67,6 +73,7 @@ export function evaluateRecord(record, scenario, options = {}) {
|
||||
const timelineRequirement = timelineRequirementFor(options);
|
||||
const requiredOpenSpans = requiredTimelineSpans(options);
|
||||
const openRequiredSpans = timelineSummary.openSpans.filter((span) => requiredOpenSpans.has(span.name));
|
||||
const missingRequiredSpans = missingTimelineSpans(timelineSummary, requiredOpenSpans);
|
||||
const runtimeDepsStagingMs = maxNullable(
|
||||
openclawDiagnostics.runtimeDepsStagingMs,
|
||||
timelineSummary.runtimeDepsStageMaxMs,
|
||||
@ -719,6 +726,18 @@ export function evaluateRecord(record, scenario, options = {}) {
|
||||
});
|
||||
}
|
||||
|
||||
if (timelineSummary.available && missingRequiredSpans.length > 0) {
|
||||
violations.push({
|
||||
kind: "diagnostics",
|
||||
metric: "openclawMissingRequiredSpanCount",
|
||||
expected: "0",
|
||||
actual: missingRequiredSpans.length,
|
||||
message: `${missingRequiredSpans.length} required OpenClaw diagnostics span(s) were not observed: ${missingRequiredSpans.slice(0, 5).join(", ")}`
|
||||
});
|
||||
}
|
||||
|
||||
checkGatewaySessionTransport(violations, agentTurns, scenario);
|
||||
|
||||
if (agentResponseOk === false) {
|
||||
violations.push({
|
||||
kind: "agent",
|
||||
@ -737,6 +756,11 @@ export function evaluateRecord(record, scenario, options = {}) {
|
||||
record.measurements = {
|
||||
peakRssMb,
|
||||
cpuPercentMax,
|
||||
measurementScopeSummary,
|
||||
resourceMeasurementScope: "product",
|
||||
resourcePrimaryRole: primaryResourceRole,
|
||||
resourcePeakTrackedRssMb: peakTrackedRssMb,
|
||||
resourceCpuPercentMaxTracked: cpuPercentMaxTracked,
|
||||
coldReadyMs,
|
||||
warmReadyMs,
|
||||
upgradeMs,
|
||||
@ -917,6 +941,8 @@ export function evaluateRecord(record, scenario, options = {}) {
|
||||
openclawRepeatedSpanCount: timelineSummary.repeatedSpanCount,
|
||||
openclawOpenSpanCount: timelineSummary.openSpanCount,
|
||||
openclawOpenRequiredSpanCount: openRequiredSpans.length,
|
||||
openclawMissingRequiredSpanCount: missingRequiredSpans.length,
|
||||
openclawMissingRequiredSpans: missingRequiredSpans,
|
||||
openclawOpenSpans: timelineSummary.openSpans,
|
||||
openclawKeySpans: timelineSummary.keySpans,
|
||||
openclawEventLoopMaxMs: timelineSummary.eventLoopMaxMs,
|
||||
@ -1089,6 +1115,29 @@ function preferredPreProviderAttributionSummary(...summaries) {
|
||||
return summaries.find((summary) => summary?.count > 0) ?? summaries[0];
|
||||
}
|
||||
|
||||
/**
 * Records a harness violation for every dashboard-session turn that did not
 * run over the direct Gateway RPC transport.
 *
 * Only the "dashboard-session-send-turn" scenario is checked. Turns without a
 * `gatewaySession` payload are skipped; a missing transport kind is reported
 * as "unknown".
 *
 * @param {object[]} violations - Violation list; entries are appended in place.
 * @param {object[]|null|undefined} agentTurns - Agent turns to inspect; nullish is treated as empty.
 * @param {object|null|undefined} scenario - Scenario definition; only `id` is read.
 * @returns {void}
 */
function checkGatewaySessionTransport(violations, agentTurns, scenario) {
  if (scenario?.id !== "dashboard-session-send-turn") {
    return;
  }
  for (const turn of agentTurns ?? []) {
    const session = turn?.gatewaySession;
    if (!session) {
      continue;
    }
    const transport = session.gatewayTransportKind;
    if (transport === "direct-gateway-rpc") {
      continue;
    }
    // Resolve the display value once so `actual` and the message cannot drift apart.
    const observed = transport ?? "unknown";
    const fallbackReason = session.gatewayTransportFallbackReason;
    const reasonSuffix = fallbackReason ? ` (${fallbackReason})` : "";
    violations.push({
      kind: "harness",
      metric: "gatewayTransport.kind",
      expected: "direct-gateway-rpc",
      actual: observed,
      phaseId: turn.phaseId,
      message: `dashboard session benchmark used ${observed} transport; direct Gateway RPC is required for Gateway product measurement${reasonSuffix}`
    });
  }
}
|
||||
|
||||
function extractGatewaySessionTurn(result) {
|
||||
if (!result?.command?.includes("run-dashboard-session-send-turn.mjs")) {
|
||||
return null;
|
||||
@ -1115,6 +1164,9 @@ function extractGatewaySessionTurn(result) {
|
||||
minAssistantCount: numberOrNull(payload.minAssistantCount),
|
||||
sessionKey: payload.sessionKey ?? null,
|
||||
runId: payload.runId ?? null,
|
||||
gatewayTransportKind: payload.gatewayTransport?.kind ?? null,
|
||||
gatewayTransportFallbackReason: payload.gatewayTransport?.fallbackReason ?? null,
|
||||
gatewayTransportFallbackUsed: typeof payload.gatewayTransport?.kind === "string" && payload.gatewayTransport.kind !== "direct-gateway-rpc",
|
||||
activeStartedAtEpochMs,
|
||||
activeFinishedAtEpochMs,
|
||||
activeTurnMs,
|
||||
@ -2017,6 +2069,26 @@ function requiredTimelineSpans(options) {
|
||||
]);
|
||||
}
|
||||
|
||||
/**
 * Lists the required span names for which `timelineSpanObserved` reports no
 * observation in the given timeline summary.
 *
 * @param {object} timelineSummary - Timeline summary passed through to `timelineSpanObserved`.
 * @param {Iterable<string>} requiredSpans - Span names that are expected to appear.
 * @returns {string[]} Names that were not observed, in iteration order.
 */
function missingTimelineSpans(timelineSummary, requiredSpans) {
  const missing = [];
  for (const spanName of requiredSpans) {
    if (!timelineSpanObserved(timelineSummary, spanName)) {
      missing.push(spanName);
    }
  }
  return missing;
}
|
||||
|
||||
/**
 * Reports whether a named span shows up anywhere in a timeline summary.
 *
 * A span counts as observed when any of these hold:
 *  - its `keySpans` or `spanTotals` entry has a positive `count` or `openCount`
 *    (both maps are consulted, so an empty entry in one cannot hide activity
 *    recorded in the other);
 *  - it is listed in `openSpans`;
 *  - for the span families listed below, any `spanTotals` entry named
 *    `<name>.<suffix>` has a positive `count` or `openCount`.
 *
 * @param {object} timelineSummary - Summary with optional `keySpans`, `spanTotals` and `openSpans`.
 * @param {string} name - Span name to look for.
 * @returns {boolean} True when the span was observed.
 */
function timelineSpanObserved(timelineSummary, name) {
  const hasActivity = (summary) => (summary?.count ?? 0) > 0 || (summary?.openCount ?? 0) > 0;
  const spanTotals = timelineSummary.spanTotals ?? {};

  if (hasActivity(timelineSummary.keySpans?.[name]) || hasActivity(spanTotals[name])) {
    return true;
  }
  if ((timelineSummary.openSpans ?? []).some((span) => span.name === name)) {
    return true;
  }

  // Only these span names are satisfied by dotted child spans.
  const familySpanNames = ["gateway.chat_send", "auto_reply", "reply", "models.catalog"];
  if (!familySpanNames.includes(name)) {
    return false;
  }
  const childPrefix = `${name}.`;
  // The exact-name entry was already checked above, so only children remain.
  return Object.entries(spanTotals).some(
    ([spanName, summary]) => spanName.startsWith(childPrefix) && hasActivity(summary)
  );
}
|
||||
|
||||
function maxDurationWhere(results, predicate) {
|
||||
const durations = results
|
||||
.filter((result) => predicate(result.command))
|
||||
@ -2515,6 +2587,28 @@ function healthFailureCount(samples) {
|
||||
return samples.filter((sample) => sample && !sample.ok).length;
|
||||
}
|
||||
|
||||
/**
 * Counts how many phases and command results of a record fall into each
 * measurement scope (product, harness, cleanup).
 *
 * A result that carries its own truthy `measurementScope` is normalized with
 * `normalizeMeasurementScope`; otherwise it inherits the scope resolved for
 * its phase by `measurementScopeForPhase`.
 *
 * @param {object} record - Run record; `phases` and each phase's `results` are optional.
 * @returns {object} Summary tagged with schema "kova.measurementScopeSummary.v1".
 */
function summarizeMeasurementScopes(record) {
  const phaseTally = { product: 0, harness: 0, cleanup: 0 };
  const commandTally = { product: 0, harness: 0, cleanup: 0 };

  (record.phases ?? []).forEach((phase) => {
    const phaseScope = measurementScopeForPhase(phase);
    phaseTally[phaseScope] += 1;

    (phase.results ?? []).forEach((result) => {
      let scope = phaseScope;
      if (result.measurementScope) {
        scope = normalizeMeasurementScope(result.measurementScope, phase.id);
      }
      commandTally[scope] += 1;
    });
  });

  return {
    schemaVersion: "kova.measurementScopeSummary.v1",
    productPhaseCount: phaseTally.product,
    harnessPhaseCount: phaseTally.harness,
    cleanupPhaseCount: phaseTally.cleanup,
    productCommandCount: commandTally.product,
    harnessCommandCount: commandTally.harness,
    cleanupCommandCount: commandTally.cleanup
  };
}
|
||||
|
||||
function collectResults(record, options = {}) {
|
||||
const excludePhaseIds = new Set(options.excludePhaseIds ?? []);
|
||||
const results = [];
|
||||
@ -2522,6 +2616,9 @@ function collectResults(record, options = {}) {
|
||||
if (excludePhaseIds.has(phase.id)) {
|
||||
continue;
|
||||
}
|
||||
if (options.productOnly === true && !measuredProductPhase(phase)) {
|
||||
continue;
|
||||
}
|
||||
for (const result of phase.results ?? []) {
|
||||
results.push(result);
|
||||
}
|
||||
@ -2546,6 +2643,9 @@ function collectPeakRss(record, options = {}) {
|
||||
if (excludePhaseIds.has(phase.id)) {
|
||||
continue;
|
||||
}
|
||||
if (options.productOnly === true && !measuredProductPhase(phase)) {
|
||||
continue;
|
||||
}
|
||||
const rss = phase.metrics?.process?.rssMb;
|
||||
if (typeof rss === "number") {
|
||||
peak = peak === null ? rss : Math.max(peak, rss);
|
||||
@ -2720,8 +2820,8 @@ function collectTimelineSummary(record) {
|
||||
let repeatedSpanCount = 0;
|
||||
let runtimeDepsStageMaxMs = null;
|
||||
let slowestRuntimeDepsPlugin = null;
|
||||
let openSpanCount = 0;
|
||||
let openSpans = [];
|
||||
let latestOpenSpanCount = 0;
|
||||
let latestOpenSpans = [];
|
||||
let latestEventCount = -1;
|
||||
let events = [];
|
||||
let turnAttributionEvents = [];
|
||||
@ -2734,6 +2834,10 @@ function collectTimelineSummary(record) {
|
||||
latestEventCount = timeline.eventCount ?? 0;
|
||||
events = timeline.events;
|
||||
turnAttributionEvents = Array.isArray(timeline.turnAttributionEvents) ? timeline.turnAttributionEvents : [];
|
||||
latestOpenSpanCount = timeline.openSpanCount ?? timeline.openSpans?.length ?? 0;
|
||||
latestOpenSpans = [...(timeline.openSpans ?? [])]
|
||||
.toSorted((left, right) => (right.ageMs ?? -1) - (left.ageMs ?? -1))
|
||||
.slice(0, 25);
|
||||
}
|
||||
for (const artifact of timeline.artifacts ?? []) {
|
||||
artifacts.add(artifact);
|
||||
@ -2742,8 +2846,6 @@ function collectTimelineSummary(record) {
|
||||
parseErrorCount = Math.max(parseErrorCount, timeline.parseErrorCount ?? 0);
|
||||
childProcessFailedCount = Math.max(childProcessFailedCount, timeline.childProcesses?.failedCount ?? 0);
|
||||
repeatedSpanCount = Math.max(repeatedSpanCount, timeline.repeatedSpans?.length ?? 0);
|
||||
openSpanCount = Math.max(openSpanCount, timeline.openSpanCount ?? timeline.openSpans?.length ?? 0);
|
||||
openSpans = mergeOpenSpans(openSpans, timeline.openSpans ?? []);
|
||||
mergeKeySpans(keySpans, timeline.keySpans ?? {});
|
||||
mergeSpanTotals(spanTotals, timeline.spanTotals ?? {});
|
||||
eventLoopMaxMs = maxNullable(eventLoopMaxMs, timeline.eventLoop?.maxMs);
|
||||
@ -2775,8 +2877,8 @@ function collectTimelineSummary(record) {
|
||||
slowestSpanName: slowestSpan?.name ?? null,
|
||||
slowestSpanMs: slowestSpan?.durationMs ?? null,
|
||||
repeatedSpanCount,
|
||||
openSpanCount,
|
||||
openSpans,
|
||||
openSpanCount: latestOpenSpanCount,
|
||||
openSpans: latestOpenSpans,
|
||||
artifacts: [...artifacts],
|
||||
timelineArtifacts: [...artifacts],
|
||||
events,
|
||||
@ -2847,9 +2949,12 @@ function mergeKeySpans(target, source) {
|
||||
}
|
||||
}
|
||||
|
||||
function collectCpuPercentMax(record) {
|
||||
function collectCpuPercentMax(record, options = {}) {
|
||||
const values = [];
|
||||
for (const phase of record.phases ?? []) {
|
||||
if (options.productOnly === true && !measuredProductPhase(phase)) {
|
||||
continue;
|
||||
}
|
||||
const cpu = phase.metrics?.process?.cpuPercent;
|
||||
if (typeof cpu === "number") {
|
||||
values.push(cpu);
|
||||
|
||||
68
src/measurement-contract.mjs
Normal file
68
src/measurement-contract.mjs
Normal file
@ -0,0 +1,68 @@
|
||||
/** The measurement scopes a phase or command result can be attributed to. */
export const MEASUREMENT_SCOPES = new Set(["product", "harness", "cleanup"]);

/**
 * Resolves a measurement scope, falling back to phase-id conventions when the
 * given value is not one of `MEASUREMENT_SCOPES`.
 *
 * Fallback rules: setup-style phase ids ("target-setup", "auth-prepare",
 * "auth-setup", "prepare", or any id starting with "state-") map to "harness";
 * "cleanup", "auth-cleanup" and "env-cleanup" map to "cleanup"; everything
 * else, including a missing or non-string id, maps to "product".
 *
 * @param {*} value - Candidate scope; returned unchanged when it is a known scope.
 * @param {string|null} [phaseId=null] - Phase id used for the fallback.
 * @returns {"product"|"harness"|"cleanup"} The resolved scope.
 */
export function normalizeMeasurementScope(value, phaseId = null) {
  if (MEASUREMENT_SCOPES.has(value)) {
    return value;
  }
  // Non-string ids cannot match any convention; treat them like a missing id
  // instead of throwing on `.startsWith`.
  const id = typeof phaseId === "string" ? phaseId : "";
  switch (id) {
    case "target-setup":
    case "auth-prepare":
    case "auth-setup":
    case "prepare":
      return "harness";
    case "cleanup":
    case "auth-cleanup":
    case "env-cleanup":
      return "cleanup";
    default:
      return id.startsWith("state-") ? "harness" : "product";
  }
}
|
||||
|
||||
/**
 * Tells whether a phase resolves to the "product" measurement scope.
 *
 * @param {object} phase - Phase descriptor handed to `measurementScopeForPhase`.
 * @returns {boolean} True when the resolved scope is "product".
 */
export function measuredProductPhase(phase) {
  const scope = measurementScopeForPhase(phase);
  return scope === "product";
}
|
||||
|
||||
/**
 * Resolves the measurement scope of a phase.
 *
 * Precedence: an explicit, valid `measurementScope` on the phase wins; a
 * "provision" phase with any command carrying the `--no-service` flag is
 * "harness"; otherwise the scope is derived from the phase id by
 * `normalizeMeasurementScope`.
 *
 * @param {object|null|undefined} phase - Phase with optional `measurementScope`, `id` and `commands`.
 * @returns {string} The resolved scope.
 */
export function measurementScopeForPhase(phase) {
  const declared = phase?.measurementScope;
  if (MEASUREMENT_SCOPES.has(declared)) {
    return declared;
  }

  if (phase?.id === "provision") {
    const commands = phase.commands ?? [];
    const runsWithoutService = commands.some((command) => /(?:^|\s)--no-service(?:\s|$)/.test(command));
    if (runsWithoutService) {
      return "harness";
    }
  }

  return normalizeMeasurementScope(declared, phase?.id);
}
|
||||
|
||||
/**
 * Classifies a command line by the driver that exercises the product.
 *
 * Checks run in order and the first match wins: known helper scripts, then
 * `ocm @<env> -- agent` (local when the `--local` flag is present, gateway
 * otherwise), `ocm @<env> -- gateway call`, any other `ocm` invocation, any
 * `node` invocation, and finally "unknown".
 *
 * @param {*} command - Command line; nullish values are treated as an empty string.
 * @returns {string} The driver kind label.
 */
export function driverKindForCommand(command) {
  const text = String(command ?? "");
  if (text.includes("run-dashboard-session-send-turn.mjs")) {
    return "gateway-rpc";
  }
  if (text.includes("run-openai-compatible-turn.mjs")) {
    return "gateway-http";
  }
  if (text.includes("run-tui-message-turn.mjs")) {
    return "gateway-rpc";
  }
  if (/\bocm\s+@[^ ]+\s+--\s+agent\b/.test(text)) {
    // Match `--local` as a whole flag (optionally `--local=<value>`) so that
    // longer flags such as `--locale` are not mistaken for it.
    const usesLocalFlag = /(?:^|\s)--local(?:\s|=|$)/.test(text);
    return usesLocalFlag ? "openclaw-cli-local" : "openclaw-cli-gateway";
  }
  if (/\bocm\s+@[^ ]+\s+--\s+gateway\s+call\b/.test(text)) {
    return "gateway-rpc-via-cli";
  }
  if (/\bocm\b/.test(text)) {
    return "ocm";
  }
  if (/\bnode\b/.test(text)) {
    return "kova-helper";
  }
  return "unknown";
}
|
||||
|
||||
/**
 * Determines the driver kind of a phase.
 *
 * An explicit truthy `driverKind` on the phase is returned as-is. Otherwise
 * each command is classified with `driverKindForCommand`: no commands yields
 * "none", a single distinct kind yields that kind, and several distinct kinds
 * yield "mixed".
 *
 * @param {object|null|undefined} phase - Phase with optional `driverKind` and `commands`.
 * @param {string[]} [commands] - Commands to classify; defaults to the phase's commands or an empty list.
 * @returns {string} The driver kind label.
 */
export function phaseDriverKind(phase, commands = phase?.commands ?? []) {
  if (phase?.driverKind) {
    return phase.driverKind;
  }
  const distinctKinds = new Set(commands.map((command) => driverKindForCommand(command)));
  switch (distinctKinds.size) {
    case 0:
      return "none";
    case 1: {
      const [onlyKind] = distinctKinds;
      return onlyKind;
    }
    default:
      return "mixed";
  }
}
|
||||
@ -44,6 +44,8 @@ const defaultThresholds = {
|
||||
heapSnapshotBytes: 50 * 1024 * 1024,
|
||||
resourcePeakCommandTreeRssMb: 100,
|
||||
resourcePeakGatewayRssMb: 100,
|
||||
resourcePeakTrackedRssMb: 100,
|
||||
resourceCpuPercentMaxTracked: 25,
|
||||
openclawTimelineParseErrors: 0,
|
||||
openclawSlowestSpanMs: 5000,
|
||||
openclawEventLoopMaxMs: 250,
|
||||
@ -457,6 +459,8 @@ function metricDeltas(baseline, current) {
|
||||
"nodeProfileTopFunctionMs",
|
||||
"heapSnapshotBytes",
|
||||
"resourceSampleCount",
|
||||
"resourcePeakTrackedRssMb",
|
||||
"resourceCpuPercentMaxTracked",
|
||||
"resourcePeakCommandTreeRssMb",
|
||||
"resourcePeakGatewayRssMb",
|
||||
"openclawTimelineEventCount",
|
||||
|
||||
@ -237,7 +237,9 @@ export function renderMarkdownReport(report) {
|
||||
const expectedFailure = turn.expectedFailure ? "; expected failure observed " + turn.expectedFailureObserved : "";
|
||||
lines.push(` - ${turn.label}: total ${turn.totalTurnMs ?? "unknown"} ms; pre-provider ${turn.preProviderMs ?? "unknown"} ms; provider ${turn.providerFinalMs ?? "unknown"} ms; post-provider ${turn.postProviderMs ?? "unknown"} ms; route ${route}; status ${status}; issue ${issue}; response ${turn.responseOk}; leaks ${turn.processLeakCount ?? "unknown"}${providerTiming}${expectedFailure}`);
|
||||
if (turn.gatewaySession) {
|
||||
lines.push(` - gateway session: create ${turn.gatewaySession.createSession}; session create ${turn.gatewaySession.sessionCreateDurationMs ?? "n/a"} ms; send ${turn.gatewaySession.sendDurationMs ?? "unknown"} ms; first assistant ${turn.gatewaySession.timeToFirstAssistantMs ?? "unknown"} ms; matched assistant ${turn.gatewaySession.timeToMatchedAssistantMs ?? "unknown"} ms; polls ${turn.gatewaySession.historyPollCount ?? "unknown"} (${turn.gatewaySession.historyErrorCount ?? "unknown"} errors)`);
|
||||
const transport = turn.gatewaySession.gatewayTransportKind ?? "unknown";
|
||||
const fallback = turn.gatewaySession.gatewayTransportFallbackReason ? `; fallback ${turn.gatewaySession.gatewayTransportFallbackReason}` : "";
|
||||
lines.push(` - gateway session: transport ${transport}${fallback}; create ${turn.gatewaySession.createSession}; session create ${turn.gatewaySession.sessionCreateDurationMs ?? "n/a"} ms; send ${turn.gatewaySession.sendDurationMs ?? "unknown"} ms; first assistant ${turn.gatewaySession.timeToFirstAssistantMs ?? "unknown"} ms; matched assistant ${turn.gatewaySession.timeToMatchedAssistantMs ?? "unknown"} ms; polls ${turn.gatewaySession.historyPollCount ?? "unknown"} (${turn.gatewaySession.historyErrorCount ?? "unknown"} errors)`);
|
||||
}
|
||||
if (turn.turnDiagnostics) {
|
||||
lines.push(` - active window: metadata scans ${turn.metadataScanCount ?? "unknown"} (${turn.metadataScanTotalMs ?? "unknown"} ms total, max ${turn.metadataScanMaxMs ?? "unknown"} ms); event-loop samples ${turn.turnDiagnostics.eventLoop?.sampleCount ?? "unknown"} max ${turn.eventLoopMaxMs ?? "unknown"} ms`);
|
||||
@ -703,6 +705,11 @@ function summarizeMeasurements(measurements) {
|
||||
return {
|
||||
peakRssMb: measurements.peakRssMb ?? null,
|
||||
cpuPercentMax: measurements.cpuPercentMax ?? null,
|
||||
measurementScopeSummary: measurements.measurementScopeSummary ?? null,
|
||||
resourceMeasurementScope: measurements.resourceMeasurementScope ?? null,
|
||||
resourcePrimaryRole: measurements.resourcePrimaryRole ?? null,
|
||||
resourcePeakTrackedRssMb: measurements.resourcePeakTrackedRssMb ?? null,
|
||||
resourceCpuPercentMaxTracked: measurements.resourceCpuPercentMaxTracked ?? null,
|
||||
health: measurements.health ?? null,
|
||||
missingDependencyErrors: measurements.missingDependencyErrors ?? null,
|
||||
pluginLoadFailures: measurements.pluginLoadFailures ?? null,
|
||||
@ -719,6 +726,8 @@ function summarizeMeasurements(measurements) {
|
||||
openclawSlowestSpanMs: measurements.openclawSlowestSpanMs ?? null,
|
||||
openclawOpenSpanCount: measurements.openclawOpenSpanCount ?? null,
|
||||
openclawOpenRequiredSpanCount: measurements.openclawOpenRequiredSpanCount ?? null,
|
||||
openclawMissingRequiredSpanCount: measurements.openclawMissingRequiredSpanCount ?? null,
|
||||
openclawMissingRequiredSpans: measurements.openclawMissingRequiredSpans ?? null,
|
||||
openclawOpenSpans: measurements.openclawOpenSpans ?? null,
|
||||
openclawKeySpans: measurements.openclawKeySpans ?? null,
|
||||
providerRequestCount: measurements.providerRequestCount ?? null,
|
||||
|
||||
@ -14,6 +14,7 @@ import { collectEnvMetrics, collectNodeProfileMetrics } from "./metrics.mjs";
|
||||
import { collectorArtifactDirs, prepareCollectorArtifactDirs } from "./collectors/artifacts.mjs";
|
||||
import { collectProviderEvidence } from "./collectors/provider.mjs";
|
||||
import { evaluateRecord } from "./evaluator.mjs";
|
||||
import { driverKindForCommand, measurementScopeForPhase, normalizeMeasurementScope, phaseDriverKind } from "./measurement-contract.mjs";
|
||||
import { artifactsDir } from "./paths.mjs";
|
||||
import { repoRoot } from "./paths.mjs";
|
||||
import { assertKovaEnvName, assertSafeScenarioCommand } from "./safety.mjs";
|
||||
@ -82,6 +83,8 @@ export async function executeScenario(scenario, context) {
|
||||
id: "target-setup",
|
||||
title: "Target Runtime Setup",
|
||||
intent: "Prepare the target OpenClaw runtime selector for the scenario.",
|
||||
measurementScope: "harness",
|
||||
driverKind: "ocm",
|
||||
commands: setupResults.map((result) => result.command),
|
||||
evidence: [],
|
||||
results: setupResults
|
||||
@ -143,6 +146,8 @@ export async function executeScenario(scenario, context) {
|
||||
title: phase.title,
|
||||
intent: phase.intent,
|
||||
healthScope: phase.healthScope,
|
||||
measurementScope: phaseMeasurementScope(phase),
|
||||
driverKind: phaseDriverKind(phase, commands),
|
||||
expectedAgentFailure: phase.expectedAgentFailure === true,
|
||||
commands,
|
||||
evidence: phase.evidence ?? [],
|
||||
@ -332,7 +337,7 @@ function buildPlannedPhases(scenario, context, envName, artifactDir, authPolicy)
|
||||
|
||||
const authPreparePhase = buildAuthPreparePhase(authPolicy, artifactDir);
|
||||
if (authPreparePhase) {
|
||||
phases.push(authPreparePhase);
|
||||
phases.push(withPhaseContract(authPreparePhase, "harness"));
|
||||
}
|
||||
|
||||
const preparePhase = buildStateLifecyclePhase(context, envName, scenario, "prepare", context.state?.prepare ?? [], artifactDir);
|
||||
@ -344,20 +349,23 @@ function buildPlannedPhases(scenario, context, envName, artifactDir, authPolicy)
|
||||
if (phase.id === "cleanup") {
|
||||
continue;
|
||||
}
|
||||
const commands = materializeScenarioPhaseCommands(phase, context, envName, artifactDir);
|
||||
phases.push({
|
||||
id: phase.id,
|
||||
title: phase.title,
|
||||
intent: phase.intent,
|
||||
healthScope: phase.healthScope,
|
||||
measurementScope: phaseMeasurementScope(phase),
|
||||
driverKind: phaseDriverKind(phase, commands),
|
||||
expectedAgentFailure: phase.expectedAgentFailure === true,
|
||||
commands: materializeScenarioPhaseCommands(phase, context, envName, artifactDir),
|
||||
commands,
|
||||
evidence: phase.evidence ?? []
|
||||
});
|
||||
|
||||
if (phaseSupportsAuthSetup(phase, authPolicy) && !phases.some((planned) => planned.id === "auth-setup")) {
|
||||
const authSetupPhase = buildAuthSetupPhase(authPolicy, envName, artifactDir);
|
||||
if (authSetupPhase) {
|
||||
phases.push(authSetupPhase);
|
||||
phases.push(withPhaseContract(authSetupPhase, "harness"));
|
||||
}
|
||||
}
|
||||
|
||||
@ -378,7 +386,7 @@ function buildPlannedPhases(scenario, context, envName, artifactDir, authPolicy)
|
||||
if (!context.keepEnv) {
|
||||
const authCleanupPhase = buildAuthCleanupPhase(authPolicy, artifactDir);
|
||||
if (authCleanupPhase) {
|
||||
phases.push(authCleanupPhase);
|
||||
phases.push(withPhaseContract(authCleanupPhase, "cleanup"));
|
||||
}
|
||||
const cleanupPhase = buildStateLifecyclePhase(context, envName, scenario, "cleanup", context.state?.cleanup ?? [], artifactDir);
|
||||
if (cleanupPhase) {
|
||||
@ -388,6 +396,8 @@ function buildPlannedPhases(scenario, context, envName, artifactDir, authPolicy)
|
||||
id: "env-cleanup",
|
||||
title: "Environment Cleanup",
|
||||
intent: "Destroy the disposable Kova env after the scenario finishes.",
|
||||
measurementScope: "cleanup",
|
||||
driverKind: "ocm",
|
||||
commands: [ocmEnvDestroy(envName)],
|
||||
evidence: ["temporary env destroyed"]
|
||||
});
|
||||
@ -405,6 +415,8 @@ function buildTargetSetupPhase(context, envName) {
|
||||
id: "target-setup",
|
||||
title: "Target Runtime Setup",
|
||||
intent: "Prepare the target OpenClaw runtime selector for the scenario.",
|
||||
measurementScope: "harness",
|
||||
driverKind: "ocm",
|
||||
commands: [targetSetupCommand(context.targetPlan)],
|
||||
evidence: [`local-build runtime ${context.targetPlan.runtimeName}`, `kova env ${envName}`]
|
||||
};
|
||||
@ -426,6 +438,8 @@ function buildStateLifecyclePhase(context, envName, scenario, kind, steps, artif
|
||||
id: kind,
|
||||
title: stateLifecycleTitle(context.state?.id, kind, phaseId),
|
||||
intent: stateLifecycleIntent(context.state?.id, kind, phaseId),
|
||||
measurementScope: normalizeMeasurementScope(null, kind),
|
||||
driverKind: phaseDriverKind(null, commands),
|
||||
commands,
|
||||
evidence,
|
||||
scenario: scenario.id
|
||||
@ -459,6 +473,8 @@ async function executeStateLifecycleSteps(context, envName, scenario, kind, step
|
||||
id: kind,
|
||||
title: stateLifecycleTitle(context.state?.id, kind, phaseId),
|
||||
intent: stateLifecycleIntent(context.state?.id, kind, phaseId),
|
||||
measurementScope: normalizeMeasurementScope(null, kind),
|
||||
driverKind: phaseDriverKind(null, commands),
|
||||
commands,
|
||||
evidence,
|
||||
results,
|
||||
@ -476,6 +492,8 @@ async function executeAuthPhase(phase, context, envName, artifactDir, authPolicy
|
||||
}
|
||||
return {
|
||||
...phase,
|
||||
measurementScope: normalizeMeasurementScope(phase.measurementScope, phase.id),
|
||||
driverKind: phaseDriverKind(phase),
|
||||
results,
|
||||
metrics: await collectEnvMetrics(envName, metricOptions(context, null, { id: phase.id }, artifactDir))
|
||||
};
|
||||
@ -569,7 +587,7 @@ async function executeTargetSetup(context, envName, artifactDir) {
|
||||
}
|
||||
|
||||
const results = [
|
||||
await runCommand(targetSetupCommand(context.targetPlan), {
|
||||
tagCommandResult(await runCommand(targetSetupCommand(context.targetPlan), {
|
||||
timeoutMs: context.timeoutMs,
|
||||
env: { KOVA_ENV_NAME: envName },
|
||||
resourceSample: context.resourceSampling === false ? null : {
|
||||
@ -578,7 +596,7 @@ async function executeTargetSetup(context, envName, artifactDir) {
|
||||
processRoles: context.processRoles ?? [],
|
||||
artifactPath: join(collectorArtifactDirs(artifactDir).resourceSamples, "target-setup-1.jsonl")
|
||||
}
|
||||
})
|
||||
}), "target-setup")
|
||||
];
|
||||
if (results.every((result) => result.status === 0) && context.targetSetup) {
|
||||
context.targetSetup.completed = true;
|
||||
@ -617,6 +635,7 @@ async function runScenarioCommand(command, context, envName, artifactDir, phaseI
|
||||
artifactPath: join(collectorArtifactDirs(artifactDir).resourceSamples, `${safeSegment(phaseId)}-${commandIndex + 1}.jsonl`)
|
||||
}
|
||||
});
|
||||
tagCommandResult(result, phaseId);
|
||||
if (agentCommand) {
|
||||
await sleep(1000);
|
||||
const afterSnapshot = captureProcessSnapshot(snapshotOptions);
|
||||
@ -638,6 +657,28 @@ async function runScenarioCommand(command, context, envName, artifactDir, phaseI
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Resolves the measurement scope recorded for a scenario phase by delegating
 * to `measurementScopeForPhase`.
 *
 * @param {object} phase - Scenario phase definition.
 * @returns {string} The scope returned by `measurementScopeForPhase`.
 */
function phaseMeasurementScope(phase) {
  const scope = measurementScopeForPhase(phase);
  return scope;
}
|
||||
|
||||
/**
 * Returns a copy of a phase annotated with its measurement scope and driver
 * kind. The input phase is not modified.
 *
 * @param {object} phase - Phase to annotate.
 * @param {string|null} [scope=null] - Scope to apply; when nullish, the phase's own `measurementScope` is used.
 * @returns {object} New phase object with `measurementScope` and `driverKind` set.
 */
function withPhaseContract(phase, scope = null) {
  const requestedScope = scope ?? phase.measurementScope;
  const measurementScope = normalizeMeasurementScope(requestedScope, phase.id);
  const driverKind = phaseDriverKind(phase);
  return { ...phase, measurementScope, driverKind };
}
|
||||
|
||||
/**
 * Stamps a command result with its measurement scope and driver kind.
 *
 * The scope is resolved by `measurementScopeForPhase` from a synthetic phase
 * built out of the phase id, the result's existing `measurementScope` and its
 * single command. The result object is mutated in place and returned.
 *
 * @param {object} result - Command result; `command` is read, `measurementScope` and `driverKind` are written.
 * @param {string} phaseId - Id of the phase that ran the command.
 * @returns {object} The same `result` object.
 */
function tagCommandResult(result, phaseId) {
  const syntheticPhase = {
    id: phaseId,
    measurementScope: result.measurementScope,
    commands: [result.command]
  };
  result.measurementScope = measurementScopeForPhase(syntheticPhase);
  result.driverKind = driverKindForCommand(result.command);
  return result;
}
|
||||
|
||||
function isAgentMessageCommand(command) {
|
||||
return (command.includes(" -- agent ") && command.includes("--message")) ||
|
||||
command.includes("run-concurrent-agent-turns.mjs") ||
|
||||
|
||||
@ -608,6 +608,7 @@ function localBuildTargetSetupResourceExclusionCheck() {
|
||||
phases: [
|
||||
{
|
||||
id: "target-setup",
|
||||
measurementScope: "harness",
|
||||
results: [{
|
||||
command: "ocm runtime build-local kova-local-test --repo /tmp/openclaw --force",
|
||||
status: 0,
|
||||
@ -619,8 +620,23 @@ function localBuildTargetSetupResourceExclusionCheck() {
|
||||
})
|
||||
}]
|
||||
},
|
||||
{
|
||||
id: "auth-prepare",
|
||||
measurementScope: "harness",
|
||||
results: [{
|
||||
command: "node support/mock-openai-server.mjs",
|
||||
status: 0,
|
||||
durationMs: 500,
|
||||
resourceSamples: syntheticResourceSamples({
|
||||
peakRssMb: 1900,
|
||||
maxCpuPercent: 320,
|
||||
role: "mock-provider"
|
||||
})
|
||||
}]
|
||||
},
|
||||
{
|
||||
id: "scenario-command",
|
||||
measurementScope: "product",
|
||||
results: [{
|
||||
command: "ocm @kova-self-check -- status",
|
||||
status: 0,
|
||||
@ -630,6 +646,29 @@ function localBuildTargetSetupResourceExclusionCheck() {
|
||||
maxCpuPercent: 20,
|
||||
role: "gateway"
|
||||
})
|
||||
}, {
|
||||
command: "node support/kova-helper.mjs",
|
||||
status: 0,
|
||||
durationMs: 100,
|
||||
resourceSamples: syntheticResourceSamples({
|
||||
peakRssMb: 600,
|
||||
maxCpuPercent: 30,
|
||||
role: "command-tree"
|
||||
})
|
||||
}]
|
||||
},
|
||||
{
|
||||
id: "auth-cleanup",
|
||||
measurementScope: "cleanup",
|
||||
results: [{
|
||||
command: "kill $(cat mock/pid)",
|
||||
status: 0,
|
||||
durationMs: 50,
|
||||
resourceSamples: syntheticResourceSamples({
|
||||
peakRssMb: 1800,
|
||||
maxCpuPercent: 300,
|
||||
role: "mock-provider"
|
||||
})
|
||||
}]
|
||||
}
|
||||
],
|
||||
@ -638,14 +677,19 @@ function localBuildTargetSetupResourceExclusionCheck() {
|
||||
logs: zeroLogMetrics()
|
||||
}
|
||||
};
|
||||
evaluateRecord(record, { thresholds: { peakRssMb: 900 } }, {
|
||||
surface: { thresholds: {} },
|
||||
evaluateRecord(record, { thresholds: { peakRssMb: 200 } }, {
|
||||
surface: { thresholds: {}, resourcePrimaryRole: "gateway" },
|
||||
targetPlan: { kind: "local-build" }
|
||||
});
|
||||
assertEqual(record.status, "PASS", "local-build target setup resources ignored status");
|
||||
assertEqual(record.measurements.peakRssMb, 100, "local-build target setup resources ignored RSS");
|
||||
assertEqual(record.measurements.resourcePeakTrackedRssMb, 600, "tracked product helper RSS retained separately");
|
||||
assertEqual(record.measurements.resourcePrimaryRole, "gateway", "primary resource role retained");
|
||||
assertEqual(record.measurements.resourceByRole.gateway.peakRssMb, 100, "scenario role RSS retained");
|
||||
assertEqual(record.measurements.resourceByRole["build-tooling"], undefined, "target setup role excluded");
|
||||
assertEqual(record.measurements.resourceByRole["mock-provider"], undefined, "harness auth resources excluded");
|
||||
assertEqual(record.measurements.measurementScopeSummary.harnessCommandCount, 2, "harness command count");
|
||||
assertEqual(record.measurements.measurementScopeSummary.cleanupCommandCount, 1, "cleanup command count");
|
||||
assertEqual(record.violations, undefined, "no-service local-build record has no gateway violation");
|
||||
return {
|
||||
id: "local-build-target-setup-resource-exclusion",
|
||||
@ -2125,6 +2169,7 @@ function gatewaySessionTurnEvaluationCheck() {
|
||||
minAssistantCount: 1,
|
||||
sessionKey: "kova-dashboard-session-send",
|
||||
runId: "cold-run",
|
||||
gatewayTransport: { kind: "direct-gateway-rpc", fallbackReason: null },
|
||||
activeStartedAtEpochMs: base + 1000,
|
||||
activeFinishedAtEpochMs: base + 2500,
|
||||
activeTurnMs: 1500,
|
||||
@ -2150,6 +2195,7 @@ function gatewaySessionTurnEvaluationCheck() {
|
||||
minAssistantCount: 2,
|
||||
sessionKey: "kova-dashboard-session-send",
|
||||
runId: "warm-run",
|
||||
gatewayTransport: { kind: "direct-gateway-rpc", fallbackReason: null },
|
||||
activeStartedAtEpochMs: base + 11000,
|
||||
activeFinishedAtEpochMs: base + 11800,
|
||||
activeTurnMs: 800,
|
||||
@ -2284,6 +2330,7 @@ function gatewaySessionTurnEvaluationCheck() {
|
||||
assertEqual(record.measurements.agentEventLoopMaxMs, 9, "active-window event-loop max");
|
||||
assertEqual(record.measurements.agentSessionPollCount, 5, "session polling total");
|
||||
assertEqual(record.measurements.agentTurns[1].gatewaySession.createSession, false, "warm turn reuses session");
|
||||
assertEqual(record.measurements.agentTurns[0].gatewaySession.gatewayTransportKind, "direct-gateway-rpc", "dashboard turn direct Gateway transport");
|
||||
|
||||
const rendered = renderMarkdownReport({
|
||||
generatedAt: "2026-05-01T00:00:00.000Z",
|
||||
@ -2295,8 +2342,57 @@ function gatewaySessionTurnEvaluationCheck() {
|
||||
summary: { statuses: { PASS: 1 } }
|
||||
});
|
||||
assertEqual(rendered.includes("gateway session:"), true, "markdown includes gateway session detail");
|
||||
assertEqual(rendered.includes("transport direct-gateway-rpc"), true, "markdown includes direct Gateway transport");
|
||||
assertEqual(rendered.includes("active window:"), true, "markdown includes active turn diagnostics");
|
||||
|
||||
const fallbackPayload = {
|
||||
...coldPayload,
|
||||
gatewayTransport: { kind: "shell", fallbackReason: "gateway-token-unavailable" }
|
||||
};
|
||||
const fallbackRecord = {
|
||||
scenario: "dashboard-session-send-turn",
|
||||
surface: "dashboard-session-send-turn",
|
||||
title: "Gateway session shell fallback",
|
||||
status: "PASS",
|
||||
phases: [{
|
||||
id: "cold-dashboard-session-turn",
|
||||
title: "Cold Gateway Session Turn",
|
||||
intent: "Synthetic shell fallback",
|
||||
commands: ["node support/run-dashboard-session-send-turn.mjs --create-session true"],
|
||||
evidence: [],
|
||||
results: [{
|
||||
command: "node support/run-dashboard-session-send-turn.mjs --create-session true",
|
||||
status: 0,
|
||||
timedOut: false,
|
||||
startedAt: new Date(base).toISOString(),
|
||||
startedAtEpochMs: base,
|
||||
finishedAt: new Date(base + 5000).toISOString(),
|
||||
finishedAtEpochMs: base + 5000,
|
||||
durationMs: 5000,
|
||||
stdout: JSON.stringify(fallbackPayload),
|
||||
stderr: ""
|
||||
}],
|
||||
metrics: { logs: zeroLogMetrics(), health: { ok: true } }
|
||||
}],
|
||||
providerEvidence: {
|
||||
available: true,
|
||||
requestCount: 1,
|
||||
requests: [record.providerEvidence.requests[0]]
|
||||
},
|
||||
finalMetrics: { service: { gatewayState: "running" }, logs: zeroLogMetrics() }
|
||||
};
|
||||
evaluateRecord(fallbackRecord, {
|
||||
id: "dashboard-session-send-turn",
|
||||
agent: { expectedText: "KOVA_AGENT_OK" },
|
||||
thresholds: {}
|
||||
}, { surface: { thresholds: {} }, targetPlan: { kind: "runtime" } });
|
||||
assertEqual(fallbackRecord.status, "FAIL", "dashboard session shell fallback rejected");
|
||||
assertEqual(
|
||||
fallbackRecord.violations.some((violation) => violation.metric === "gatewayTransport.kind"),
|
||||
true,
|
||||
"dashboard session shell fallback violation"
|
||||
);
|
||||
|
||||
return {
|
||||
id: "gateway-session-turn-evaluation",
|
||||
status: "PASS",
|
||||
@ -4144,6 +4240,47 @@ function diagnosticsTimelineEvaluationCheck() {
|
||||
"missing diagnostic timeline violation"
|
||||
);
|
||||
|
||||
const missingSpanRecord = {
|
||||
scenario: "diagnostic-missing-span",
|
||||
status: "PASS",
|
||||
phases: [],
|
||||
finalMetrics: {
|
||||
service: { gatewayState: "running" },
|
||||
logs: zeroLogMetrics(),
|
||||
timeline: {
|
||||
available: true,
|
||||
eventCount: 1,
|
||||
parseErrorCount: 0,
|
||||
openSpanCount: 0,
|
||||
openSpans: [],
|
||||
keySpans: {},
|
||||
spanTotals: {
|
||||
"gateway.startup": { count: 1, totalDurationMs: 100, maxDurationMs: 100 }
|
||||
},
|
||||
runtimeDeps: {},
|
||||
eventLoop: {},
|
||||
providers: {},
|
||||
childProcesses: {}
|
||||
}
|
||||
}
|
||||
};
|
||||
evaluateRecord(missingSpanRecord, { thresholds: {} }, {
|
||||
targetPlan: { kind: "local-build" },
|
||||
profile: { id: "diagnostic", diagnostics: { timelineRequired: true } },
|
||||
surface: {
|
||||
id: "bundled-runtime-deps",
|
||||
diagnostics: { expectedSpans: ["runtimeDeps.stage"] },
|
||||
thresholds: {}
|
||||
}
|
||||
});
|
||||
assertEqual(missingSpanRecord.status, "FAIL", "missing required span status");
|
||||
assertEqual(missingSpanRecord.measurements.openclawMissingRequiredSpanCount, 1, "missing required span measurement");
|
||||
assertEqual(
|
||||
missingSpanRecord.violations.some((violation) => violation.metric === "openclawMissingRequiredSpanCount"),
|
||||
true,
|
||||
"missing required span violation"
|
||||
);
|
||||
|
||||
const openSpanRecord = {
|
||||
scenario: "diagnostic-open-span",
|
||||
status: "PASS",
|
||||
|
||||
@ -24,7 +24,11 @@ try {
|
||||
const sessionKey = args["session-key"] ?? `kova-dashboard-${randomUUID()}`;
|
||||
const createSession = readBoolean(args["create-session"], true);
|
||||
const minAssistantCount = readPositiveInteger(args["min-assistant-count"], 1);
|
||||
const allowShellFallback = readBoolean(args["allow-shell-fallback"], false);
|
||||
const gatewayTransport = await openDirectGatewayRpcClient(runtimeContext);
|
||||
if (!gatewayTransport.client && !allowShellFallback) {
|
||||
throw new Error(`direct Gateway RPC is required for dashboard-session-send-turn; fallback=${gatewayTransport.transport}; reason=${gatewayTransport.fallbackReason ?? "unknown"}`);
|
||||
}
|
||||
|
||||
try {
|
||||
let created = null;
|
||||
|
||||
@ -10,6 +10,7 @@
|
||||
"agent-process",
|
||||
"mock-provider"
|
||||
],
|
||||
"resourcePrimaryRole": "gateway",
|
||||
"thresholds": {
|
||||
"agentTurnMs": 45000,
|
||||
"coldAgentTurnMs": 45000,
|
||||
|
||||
Loading…
Reference in New Issue
Block a user