fix: make local agent benchmark service-free
This commit is contained in:
parent
51947110f5
commit
b63b6f9e20
@ -3,7 +3,7 @@
|
||||
"surface": "agent-cli-local-turn",
|
||||
"title": "Agent CLI Local Cold/Warm Message",
|
||||
"objective": "Send cold and warm simple messages through `openclaw agent --local`, verify mock-provider responses, and attribute latency before, during, and after provider work.",
|
||||
"tags": ["agent", "message", "latency", "providers", "gateway", "cold-warm"],
|
||||
"tags": ["agent", "message", "latency", "providers", "cold-warm"],
|
||||
"timeoutMs": 240000,
|
||||
"agent": {
|
||||
"expectedText": "KOVA_AGENT_OK"
|
||||
@ -16,11 +16,11 @@
|
||||
"coldWarmDeltaMs": 30000,
|
||||
"preProviderMs": 10000,
|
||||
"coldPreProviderMs": 10000,
|
||||
"warmPreProviderMs": 5000,
|
||||
"warmPreProviderMs": 10000,
|
||||
"providerFinalMs": 3000,
|
||||
"preProviderDominanceRatio": 0.8,
|
||||
"statusMs": 10000,
|
||||
"peakRssMb": 900,
|
||||
"agentProcessLeaks": 4,
|
||||
"missingDependencyErrors": 0,
|
||||
"pluginLoadFailures": 0,
|
||||
"providerTimeoutMentions": 0
|
||||
@ -29,9 +29,9 @@
|
||||
{
|
||||
"id": "provision",
|
||||
"title": "Provision Agent Env",
|
||||
"intent": "Start a disposable OpenClaw gateway before wiring the model provider and sending messages.",
|
||||
"commands": ["ocm start {env} {startSelector} --json"],
|
||||
"evidence": ["gateway port", "runtime binding", "startup readiness"]
|
||||
"intent": "Create a disposable OpenClaw env before wiring the model provider and sending local agent messages.",
|
||||
"commands": ["ocm start {env} {startSelector} --no-service --json"],
|
||||
"evidence": ["gateway port", "runtime binding", "env created without service"]
|
||||
},
|
||||
{
|
||||
"id": "cold-agent-turn",
|
||||
@ -53,10 +53,10 @@
|
||||
},
|
||||
{
|
||||
"id": "post-agent-health",
|
||||
"title": "Post-Agent Gateway Health",
|
||||
"intent": "Verify the gateway remains responsive after both agent turns and capture provider/plugin diagnostics.",
|
||||
"commands": ["ocm @{env} -- status", "ocm logs {env} --tail 300 --raw"],
|
||||
"evidence": ["gateway status", "provider logs", "plugin errors", "memory after agent turns"]
|
||||
"title": "Post-Agent Env Status",
|
||||
"intent": "Verify the env remains usable after both local agent turns and capture plugin diagnostics.",
|
||||
"commands": ["ocm @{env} -- status"],
|
||||
"evidence": ["env status", "plugin errors", "memory after agent turns"]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@ -21,8 +21,12 @@ export function evaluateRecord(record, scenario, options = {}) {
|
||||
const roleThresholds = thresholdPolicy.roleThresholds;
|
||||
const violations = [];
|
||||
const allResults = collectResults(record);
|
||||
const resourceSummary = collectResourceSummary(allResults);
|
||||
const peakRssMb = maxNullable(collectPeakRss(record), resourceSummary.peakTotalRssMb);
|
||||
const measuredResults = collectResults(record, { excludePhaseIds: ["target-setup"] });
|
||||
const resourceSummary = collectResourceSummary(measuredResults);
|
||||
const peakRssMb = maxNullable(
|
||||
collectPeakRss(record, { excludePhaseIds: ["target-setup"] }),
|
||||
resourceSummary.peakTotalRssMb
|
||||
);
|
||||
const cpuPercentMax = maxNullable(collectCpuPercentMax(record), resourceSummary.maxTotalCpuPercent);
|
||||
const missingDependencyErrors = countMissingDependencyErrors(allResults) + countLogMetric(record, "missingDependencyErrors");
|
||||
const pluginLoadFailures = countLogMetric(record, "pluginLoadFailures");
|
||||
@ -46,6 +50,7 @@ export function evaluateRecord(record, scenario, options = {}) {
|
||||
const heapSnapshotBytes = countHeapSnapshotBytes(record);
|
||||
const diagnosticReportCount = countDiagnosticReportMetric(record, "fileCount");
|
||||
const diagnosticReportBytes = countDiagnosticReportMetric(record, "artifactBytes");
|
||||
const gatewayExpected = recordExpectsGateway(record);
|
||||
const openclawDiagnostics = collectOpenClawDiagnostics(record);
|
||||
const timelineSummary = collectTimelineSummary(record);
|
||||
const logSummary = collectLogSummary(record);
|
||||
@ -73,7 +78,7 @@ export function evaluateRecord(record, scenario, options = {}) {
|
||||
const agentTurnMs = maxNullable(maxDurationWhere(allResults, isAgentMessageCommand), maxTurnDuration(agentTurns));
|
||||
const agentResponseOk = agentTurns.length === 0 ? null : agentTurns.every((turn) => turn.responseOk === true);
|
||||
const agentProviderSimulation = evaluateProviderSimulation({ turns: agentTurns, scenario, record, thresholds });
|
||||
const agentFailureContainment = evaluateAgentFailureContainment({ turns: agentTurns, record, thresholds });
|
||||
const agentFailureContainment = evaluateAgentFailureContainment({ turns: agentTurns, record, thresholds, gatewayExpected });
|
||||
const agentCleanupDiagnosis = diagnoseAgentCleanup(agentTurns, agentTurnStats, thresholds);
|
||||
const agentLatencyDiagnosis = diagnoseAgentLatency({
|
||||
coldAgentTurn,
|
||||
@ -166,7 +171,7 @@ export function evaluateRecord(record, scenario, options = {}) {
|
||||
});
|
||||
}
|
||||
|
||||
if (finalGatewayState && finalGatewayState !== "running") {
|
||||
if (gatewayExpected && finalGatewayState && finalGatewayState !== "running") {
|
||||
violations.push({
|
||||
kind: "gateway",
|
||||
metric: "finalGatewayState",
|
||||
@ -950,7 +955,7 @@ function collectAgentTurns(record, providerEvidence, scenario, timelineSummary,
|
||||
return turns;
|
||||
}
|
||||
|
||||
function evaluateAgentFailureContainment({ turns, record, thresholds }) {
|
||||
function evaluateAgentFailureContainment({ turns, record, thresholds, gatewayExpected = true }) {
|
||||
if (turns.length === 0) {
|
||||
return {
|
||||
schemaVersion: "kova.agentFailureContainment.v1",
|
||||
@ -959,7 +964,7 @@ function evaluateAgentFailureContainment({ turns, record, thresholds }) {
|
||||
leakedProcesses: [],
|
||||
processLeaksOk: true,
|
||||
finalGatewayState: record.finalMetrics?.service?.gatewayState ?? null,
|
||||
gatewayHealthy: null,
|
||||
gatewayHealthy: gatewayExpected ? null : true,
|
||||
healthFailures: countHealthFailures(record),
|
||||
healthLimit: 0,
|
||||
statusWorks: null,
|
||||
@ -989,7 +994,7 @@ function evaluateAgentFailureContainment({ turns, record, thresholds }) {
|
||||
leakedProcesses,
|
||||
processLeaksOk: leakCount <= leakLimit,
|
||||
finalGatewayState,
|
||||
gatewayHealthy: finalGatewayState === "running" && healthFailures <= healthLimit,
|
||||
gatewayHealthy: gatewayExpected ? finalGatewayState === "running" && healthFailures <= healthLimit : true,
|
||||
healthFailures,
|
||||
healthLimit,
|
||||
statusWorks,
|
||||
@ -2164,9 +2169,13 @@ function healthFailureCount(samples) {
|
||||
return samples.filter((sample) => sample && !sample.ok).length;
|
||||
}
|
||||
|
||||
function collectResults(record) {
|
||||
function collectResults(record, options = {}) {
|
||||
const excludePhaseIds = new Set(options.excludePhaseIds ?? []);
|
||||
const results = [];
|
||||
for (const phase of record.phases ?? []) {
|
||||
if (excludePhaseIds.has(phase.id)) {
|
||||
continue;
|
||||
}
|
||||
for (const result of phase.results ?? []) {
|
||||
results.push(result);
|
||||
}
|
||||
@ -2174,9 +2183,23 @@ function collectResults(record) {
|
||||
return results;
|
||||
}
|
||||
|
||||
function collectPeakRss(record) {
|
||||
/**
 * Reports whether any command recorded on `record` implies a gateway service.
 *
 * A record expects a gateway when one of the results returned by
 * `collectResults(record)` ran `ocm service start`, `ocm service restart`, or
 * an `ocm start` invocation that does not carry the `--no-service` flag.
 *
 * @param {object} record - Run record passed straight through to `collectResults`.
 * @returns {boolean} `true` when at least one matching command is found.
 */
function recordExpectsGateway(record) {
  return collectResults(record).some((result) => {
    // Entries with a missing or non-string command cannot match any prefix;
    // treat them as empty instead of letting `.startsWith` throw.
    const command = typeof result?.command === "string" ? result.command : "";
    if (command.startsWith("ocm service start ") || command.startsWith("ocm service restart ")) {
      return true;
    }
    // `--no-service` only counts as a standalone, whitespace-delimited token.
    return command.startsWith("ocm start ") && !/(?:^|\s)--no-service(?:\s|$)/.test(command);
  });
}
|
||||
|
||||
function collectPeakRss(record, options = {}) {
|
||||
const excludePhaseIds = new Set(options.excludePhaseIds ?? []);
|
||||
let peak = null;
|
||||
for (const phase of record.phases ?? []) {
|
||||
if (excludePhaseIds.has(phase.id)) {
|
||||
continue;
|
||||
}
|
||||
const rss = phase.metrics?.process?.rssMb;
|
||||
if (typeof rss === "number") {
|
||||
peak = peak === null ? rss : Math.max(peak, rss);
|
||||
|
||||
@ -110,6 +110,7 @@ export async function runSelfCheck(flags = {}) {
|
||||
checks.push(await commandTimeoutContractCheck());
|
||||
checks.push(ocmCommandBuildersCheck());
|
||||
checks.push(evaluationViolationHelpersCheck());
|
||||
checks.push(localBuildTargetSetupResourceExclusionCheck());
|
||||
checks.push(await jsonCommandCheck("plan-json", "node bin/kova.mjs plan --json", (data) => {
|
||||
assertEqual(data.schemaVersion, "kova.plan.v1", "plan schema");
|
||||
assertArrayNotEmpty(data.surfaces, "plan surfaces");
|
||||
@ -557,6 +558,91 @@ function evaluationViolationHelpersCheck() {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Self-check for resource accounting on a no-service local-build record.
 *
 * Builds a synthetic record with a heavy `target-setup` phase (2500 MB RSS,
 * role `build-tooling`) and a light `scenario-command` phase (100 MB RSS, role
 * `gateway`), runs it through `evaluateRecord` with a 900 MB `peakRssMb`
 * threshold, and asserts on the fields the check then reads from the record:
 * status stays `PASS`, measured peak RSS is 100, only the `gateway` role is
 * reported, and no violations are attached even though the final gateway state
 * is `disabled`.
 *
 * @returns {{id: string, status: string, command: string, durationMs: number, message?: string}}
 *   A check result; `message` is present only when the check fails.
 */
function localBuildTargetSetupResourceExclusionCheck() {
  const checkId = "local-build-target-setup-resource-exclusion";
  const checkCommand = "evaluate local-build target setup resource exclusion";
  try {
    const record = {
      scenario: "local-build-runtime-resources",
      status: "PASS",
      phases: [
        {
          id: "target-setup",
          results: [{
            command: "ocm runtime build-local kova-local-test --repo /tmp/openclaw --force",
            status: 0,
            durationMs: 60000,
            resourceSamples: syntheticResourceSamples({
              peakRssMb: 2500,
              maxCpuPercent: 350,
              role: "build-tooling"
            })
          }]
        },
        {
          id: "scenario-command",
          results: [{
            command: "ocm @kova-self-check -- status",
            status: 0,
            durationMs: 100,
            resourceSamples: syntheticResourceSamples({
              peakRssMb: 100,
              maxCpuPercent: 20,
              role: "gateway"
            })
          }]
        }
      ],
      finalMetrics: {
        service: { gatewayState: "disabled" },
        logs: zeroLogMetrics()
      }
    };
    evaluateRecord(record, { thresholds: { peakRssMb: 900 } }, {
      surface: { thresholds: {} },
      targetPlan: { kind: "local-build" }
    });
    assertEqual(record.status, "PASS", "local-build target setup resources ignored status");
    assertEqual(record.measurements.peakRssMb, 100, "local-build target setup resources ignored RSS");
    assertEqual(record.measurements.resourceByRole.gateway.peakRssMb, 100, "scenario role RSS retained");
    assertEqual(record.measurements.resourceByRole["build-tooling"], undefined, "target setup role excluded");
    assertEqual(record.violations, undefined, "no-service local-build record has no gateway violation");
    return {
      id: checkId,
      status: "PASS",
      command: checkCommand,
      durationMs: 0
    };
  } catch (error) {
    return {
      id: checkId,
      status: "FAIL",
      command: checkCommand,
      durationMs: 0,
      // A thrown value is not guaranteed to be an Error (or even an object);
      // fall back to its string form so the failure is still reported.
      message: error?.message ?? String(error)
    };
  }
}
|
||||
|
||||
/**
 * Produces a one-sample resource summary fixture in which every figure is
 * attributed to a single role.
 *
 * @param {object} spec
 * @param {number} spec.peakRssMb - Used for the total, command-tree, and per-role peak RSS.
 * @param {number} spec.maxCpuPercent - Used for the total and per-role max CPU.
 * @param {string} spec.role - Key under `byRole`; the gateway RSS peak is only
 *   non-zero when this is exactly "gateway".
 * @returns {object} Summary with empty `topByRss` / `topByCpu` lists and one
 *   entry in each of `topRolesByRss` / `topRolesByCpu`.
 */
function syntheticResourceSamples({ peakRssMb, maxCpuPercent, role }) {
  const isGatewayRole = role === "gateway";
  // Build a fresh entry per list so the two rankings never share an object.
  const rankingEntry = () => ({ role, peakRssMb, maxCpuPercent });
  const roleStats = { peakRssMb, maxCpuPercent, peakProcessCount: 1 };

  return {
    sampleCount: 1,
    peakTotalRssMb: peakRssMb,
    maxTotalCpuPercent: maxCpuPercent,
    peakCommandTreeRssMb: peakRssMb,
    peakGatewayRssMb: isGatewayRole ? peakRssMb : 0,
    byRole: { [role]: roleStats },
    topRolesByRss: [rankingEntry()],
    topRolesByCpu: [rankingEntry()],
    topByRss: [],
    topByCpu: []
  };
}
|
||||
|
||||
function gatePartialFailureCheck() {
|
||||
try {
|
||||
const gate = evaluateGate({
|
||||
|
||||
Loading…
Reference in New Issue
Block a user