fix: make local agent benchmark service-free
Some checks failed
CI / Test (${{ matrix.os }}) (macos-latest) (push) Has been cancelled
CI / Test (${{ matrix.os }}) (ubuntu-latest) (push) Has been cancelled

This commit is contained in:
Peter Steinberger 2026-05-03 13:11:49 +01:00
parent 51947110f5
commit b63b6f9e20
No known key found for this signature in database
3 changed files with 128 additions and 19 deletions

View File

@ -3,7 +3,7 @@
"surface": "agent-cli-local-turn",
"title": "Agent CLI Local Cold/Warm Message",
"objective": "Send cold and warm simple messages through `openclaw agent --local`, verify mock-provider responses, and attribute latency before, during, and after provider work.",
"tags": ["agent", "message", "latency", "providers", "gateway", "cold-warm"],
"tags": ["agent", "message", "latency", "providers", "cold-warm"],
"timeoutMs": 240000,
"agent": {
"expectedText": "KOVA_AGENT_OK"
@ -16,11 +16,11 @@
"coldWarmDeltaMs": 30000,
"preProviderMs": 10000,
"coldPreProviderMs": 10000,
"warmPreProviderMs": 5000,
"warmPreProviderMs": 10000,
"providerFinalMs": 3000,
"preProviderDominanceRatio": 0.8,
"statusMs": 10000,
"peakRssMb": 900,
"agentProcessLeaks": 4,
"missingDependencyErrors": 0,
"pluginLoadFailures": 0,
"providerTimeoutMentions": 0
@ -29,9 +29,9 @@
{
"id": "provision",
"title": "Provision Agent Env",
"intent": "Start a disposable OpenClaw gateway before wiring the model provider and sending messages.",
"commands": ["ocm start {env} {startSelector} --json"],
"evidence": ["gateway port", "runtime binding", "startup readiness"]
"intent": "Create a disposable OpenClaw env before wiring the model provider and sending local agent messages.",
"commands": ["ocm start {env} {startSelector} --no-service --json"],
"evidence": ["gateway port", "runtime binding", "env created without service"]
},
{
"id": "cold-agent-turn",
@ -53,10 +53,10 @@
},
{
"id": "post-agent-health",
"title": "Post-Agent Gateway Health",
"intent": "Verify the gateway remains responsive after both agent turns and capture provider/plugin diagnostics.",
"commands": ["ocm @{env} -- status", "ocm logs {env} --tail 300 --raw"],
"evidence": ["gateway status", "provider logs", "plugin errors", "memory after agent turns"]
"title": "Post-Agent Env Status",
"intent": "Verify the env remains usable after both local agent turns and capture plugin diagnostics.",
"commands": ["ocm @{env} -- status"],
"evidence": ["env status", "plugin errors", "memory after agent turns"]
}
]
}

View File

@ -21,8 +21,12 @@ export function evaluateRecord(record, scenario, options = {}) {
const roleThresholds = thresholdPolicy.roleThresholds;
const violations = [];
const allResults = collectResults(record);
const resourceSummary = collectResourceSummary(allResults);
const peakRssMb = maxNullable(collectPeakRss(record), resourceSummary.peakTotalRssMb);
const measuredResults = collectResults(record, { excludePhaseIds: ["target-setup"] });
const resourceSummary = collectResourceSummary(measuredResults);
const peakRssMb = maxNullable(
collectPeakRss(record, { excludePhaseIds: ["target-setup"] }),
resourceSummary.peakTotalRssMb
);
const cpuPercentMax = maxNullable(collectCpuPercentMax(record), resourceSummary.maxTotalCpuPercent);
const missingDependencyErrors = countMissingDependencyErrors(allResults) + countLogMetric(record, "missingDependencyErrors");
const pluginLoadFailures = countLogMetric(record, "pluginLoadFailures");
@ -46,6 +50,7 @@ export function evaluateRecord(record, scenario, options = {}) {
const heapSnapshotBytes = countHeapSnapshotBytes(record);
const diagnosticReportCount = countDiagnosticReportMetric(record, "fileCount");
const diagnosticReportBytes = countDiagnosticReportMetric(record, "artifactBytes");
const gatewayExpected = recordExpectsGateway(record);
const openclawDiagnostics = collectOpenClawDiagnostics(record);
const timelineSummary = collectTimelineSummary(record);
const logSummary = collectLogSummary(record);
@ -73,7 +78,7 @@ export function evaluateRecord(record, scenario, options = {}) {
const agentTurnMs = maxNullable(maxDurationWhere(allResults, isAgentMessageCommand), maxTurnDuration(agentTurns));
const agentResponseOk = agentTurns.length === 0 ? null : agentTurns.every((turn) => turn.responseOk === true);
const agentProviderSimulation = evaluateProviderSimulation({ turns: agentTurns, scenario, record, thresholds });
const agentFailureContainment = evaluateAgentFailureContainment({ turns: agentTurns, record, thresholds });
const agentFailureContainment = evaluateAgentFailureContainment({ turns: agentTurns, record, thresholds, gatewayExpected });
const agentCleanupDiagnosis = diagnoseAgentCleanup(agentTurns, agentTurnStats, thresholds);
const agentLatencyDiagnosis = diagnoseAgentLatency({
coldAgentTurn,
@ -166,7 +171,7 @@ export function evaluateRecord(record, scenario, options = {}) {
});
}
if (finalGatewayState && finalGatewayState !== "running") {
if (gatewayExpected && finalGatewayState && finalGatewayState !== "running") {
violations.push({
kind: "gateway",
metric: "finalGatewayState",
@ -950,7 +955,7 @@ function collectAgentTurns(record, providerEvidence, scenario, timelineSummary,
return turns;
}
function evaluateAgentFailureContainment({ turns, record, thresholds }) {
function evaluateAgentFailureContainment({ turns, record, thresholds, gatewayExpected = true }) {
if (turns.length === 0) {
return {
schemaVersion: "kova.agentFailureContainment.v1",
@ -959,7 +964,7 @@ function evaluateAgentFailureContainment({ turns, record, thresholds }) {
leakedProcesses: [],
processLeaksOk: true,
finalGatewayState: record.finalMetrics?.service?.gatewayState ?? null,
gatewayHealthy: null,
gatewayHealthy: gatewayExpected ? null : true,
healthFailures: countHealthFailures(record),
healthLimit: 0,
statusWorks: null,
@ -989,7 +994,7 @@ function evaluateAgentFailureContainment({ turns, record, thresholds }) {
leakedProcesses,
processLeaksOk: leakCount <= leakLimit,
finalGatewayState,
gatewayHealthy: finalGatewayState === "running" && healthFailures <= healthLimit,
gatewayHealthy: gatewayExpected ? finalGatewayState === "running" && healthFailures <= healthLimit : true,
healthFailures,
healthLimit,
statusWorks,
@ -2164,9 +2169,13 @@ function healthFailureCount(samples) {
return samples.filter((sample) => sample && !sample.ok).length;
}
function collectResults(record) {
function collectResults(record, options = {}) {
const excludePhaseIds = new Set(options.excludePhaseIds ?? []);
const results = [];
for (const phase of record.phases ?? []) {
if (excludePhaseIds.has(phase.id)) {
continue;
}
for (const result of phase.results ?? []) {
results.push(result);
}
@ -2174,9 +2183,23 @@ function collectResults(record) {
return results;
}
function collectPeakRss(record) {
/**
 * Reports whether any command recorded in `record` would bring up a gateway
 * service.
 *
 * A result counts when its command string starts with `ocm service start ` or
 * `ocm service restart `, or when it starts with `ocm start ` and does not
 * carry a standalone `--no-service` flag. Results without a command are
 * treated as an empty command string.
 *
 * @param {object} record - Run record handed to `collectResults`.
 * @returns {boolean} `true` when at least one result matches.
 */
function recordExpectsGateway(record) {
  const servicePrefixes = ["ocm service start ", "ocm service restart "];
  // Match the flag only as a whole whitespace-delimited token, so a longer
  // flag such as `--no-services` does not count.
  const noServiceFlag = /(?:^|\s)--no-service(?:\s|$)/;

  const launchesGateway = (result) => {
    const commandLine = result.command ?? "";
    if (servicePrefixes.some((prefix) => commandLine.startsWith(prefix))) {
      return true;
    }
    if (!commandLine.startsWith("ocm start ")) {
      return false;
    }
    return !noServiceFlag.test(commandLine);
  };

  return collectResults(record).some(launchesGateway);
}
function collectPeakRss(record, options = {}) {
const excludePhaseIds = new Set(options.excludePhaseIds ?? []);
let peak = null;
for (const phase of record.phases ?? []) {
if (excludePhaseIds.has(phase.id)) {
continue;
}
const rss = phase.metrics?.process?.rssMb;
if (typeof rss === "number") {
peak = peak === null ? rss : Math.max(peak, rss);

View File

@ -110,6 +110,7 @@ export async function runSelfCheck(flags = {}) {
checks.push(await commandTimeoutContractCheck());
checks.push(ocmCommandBuildersCheck());
checks.push(evaluationViolationHelpersCheck());
checks.push(localBuildTargetSetupResourceExclusionCheck());
checks.push(await jsonCommandCheck("plan-json", "node bin/kova.mjs plan --json", (data) => {
assertEqual(data.schemaVersion, "kova.plan.v1", "plan schema");
assertArrayNotEmpty(data.surfaces, "plan surfaces");
@ -557,6 +558,91 @@ function evaluationViolationHelpersCheck() {
}
}
/**
 * Self-check for how `evaluateRecord` treats a local-build record whose
 * `target-setup` phase carries much larger resource samples than the
 * scenario phase.
 *
 * The synthetic record pairs a 2500 MB build sample in `target-setup` with a
 * 100 MB gateway-role sample in `scenario-command`, reports a final gateway
 * state of "disabled", and is evaluated against a 900 MB `peakRssMb`
 * threshold. The check passes when the record afterwards still reads PASS,
 * exposes only the scenario phase's RSS figures, and has no `violations`.
 *
 * @returns {{id: string, status: string, command: string, durationMs: number, message?: string}}
 *   A PASS entry, or a FAIL entry carrying the message of whatever was thrown.
 */
function localBuildTargetSetupResourceExclusionCheck() {
  const checkId = "local-build-target-setup-resource-exclusion";
  const checkCommand = "evaluate local-build target setup resource exclusion";
  const report = (status, extra = {}) => ({
    id: checkId,
    status,
    command: checkCommand,
    durationMs: 0,
    ...extra
  });

  try {
    // Setup phase: deliberately far above the 900 MB threshold used below.
    const targetSetupPhase = {
      id: "target-setup",
      results: [{
        command: "ocm runtime build-local kova-local-test --repo /tmp/openclaw --force",
        status: 0,
        durationMs: 60000,
        resourceSamples: syntheticResourceSamples({
          peakRssMb: 2500,
          maxCpuPercent: 350,
          role: "build-tooling"
        })
      }]
    };
    // Measured phase: small figures that are expected to be what gets reported.
    const scenarioPhase = {
      id: "scenario-command",
      results: [{
        command: "ocm @kova-self-check -- status",
        status: 0,
        durationMs: 100,
        resourceSamples: syntheticResourceSamples({
          peakRssMb: 100,
          maxCpuPercent: 20,
          role: "gateway"
        })
      }]
    };
    const record = {
      scenario: "local-build-runtime-resources",
      status: "PASS",
      phases: [targetSetupPhase, scenarioPhase],
      finalMetrics: {
        service: { gatewayState: "disabled" },
        logs: zeroLogMetrics()
      }
    };
    const scenario = { thresholds: { peakRssMb: 900 } };
    const evaluationOptions = {
      surface: { thresholds: {} },
      targetPlan: { kind: "local-build" }
    };

    // The assertions below read fields off `record` itself, so the evaluation
    // result is inspected through the record rather than a return value.
    evaluateRecord(record, scenario, evaluationOptions);

    assertEqual(record.status, "PASS", "local-build target setup resources ignored status");
    assertEqual(record.measurements.peakRssMb, 100, "local-build target setup resources ignored RSS");
    const roleUsage = record.measurements.resourceByRole;
    assertEqual(roleUsage.gateway.peakRssMb, 100, "scenario role RSS retained");
    assertEqual(roleUsage["build-tooling"], undefined, "target setup role excluded");
    assertEqual(record.violations, undefined, "no-service local-build record has no gateway violation");

    return report("PASS");
  } catch (error) {
    return report("FAIL", { message: error.message });
  }
}
/**
 * Builds a one-sample resource summary fixture attributed to a single role.
 *
 * The same RSS figure is used for the total and command-tree peaks; the
 * gateway peak carries it only when `role` is "gateway" and is 0 otherwise.
 *
 * @param {object} params
 * @param {number} params.peakRssMb - Peak RSS to report, in MB.
 * @param {number} params.maxCpuPercent - Maximum CPU percentage to report.
 * @param {string} params.role - Role name used as the `byRole` key and in the
 *   top-role lists.
 * @returns {object} Summary with `byRole`, `topRolesByRss`/`topRolesByCpu`
 *   (one entry each), and empty `topByRss`/`topByCpu` lists.
 */
function syntheticResourceSamples({ peakRssMb, maxCpuPercent, role }) {
  const gatewayPeak = role === "gateway" ? peakRssMb : 0;
  // Produce a fresh list per call so the two ranking fields never share
  // array or entry instances.
  const singleRoleRanking = () => [{ role, peakRssMb, maxCpuPercent }];

  const byRole = {};
  byRole[role] = { peakRssMb, maxCpuPercent, peakProcessCount: 1 };

  return {
    sampleCount: 1,
    peakTotalRssMb: peakRssMb,
    maxTotalCpuPercent: maxCpuPercent,
    peakCommandTreeRssMb: peakRssMb,
    peakGatewayRssMb: gatewayPeak,
    byRole,
    topRolesByRss: singleRoleRanking(),
    topRolesByCpu: singleRoleRanking(),
    topByRss: [],
    topByCpu: []
  };
}
function gatePartialFailureCheck() {
try {
const gate = evaluateGate({