feat: add calibrated threshold policy

2026-05-01 09:54:47 +01:00 · 2026-05-01 09:54:47 +01:00 · fba1ea8b6d
commit fba1ea8b6d
parent 4819803596
9 changed files with 425 additions and 14 deletions
--- a/metrics/known.json
+++ b/metrics/known.json
@@ -32,6 +32,8 @@
    "maxCpuPercent",
    "missingDependencyErrors",
    "modelsListMs",
+    "openclawSlowestSpanMs",
+    "openclawTimelineParseErrors",
    "peakRssMb",
    "pluginIndexPresent",
    "pluginLoadFailures",
--- a/profiles/diagnostic.json
+++ b/profiles/diagnostic.json
@@ -18,6 +18,52 @@
      "agent.cleanup"
    ]
  },
+  "calibration": {
+    "roles": {
+      "gateway": { "peakRssMb": 1000, "maxCpuPercent": 350 },
+      "command-tree": { "peakRssMb": 1400, "maxCpuPercent": 450 },
+      "runtime-staging": { "peakRssMb": 900, "maxCpuPercent": 350 },
+      "package-manager": { "peakRssMb": 900, "maxCpuPercent": 350 },
+      "agent-cli": { "peakRssMb": 1200, "maxCpuPercent": 350 },
+      "mock-provider": { "peakRssMb": 300, "maxCpuPercent": 150 }
+    },
+    "surfaces": {
+      "release-runtime-startup": {
+        "thresholds": {
+          "gatewayReadyMs": 60000,
+          "gatewayReadyHardTimeoutMs": 180000,
+          "runtimeDepsStagingMs": 45000,
+          "openclawTimelineParseErrors": 0,
+          "openclawSlowestSpanMs": 45000
+        }
+      },
+      "agent-message": {
+        "thresholds": {
+          "coldAgentTurnMs": 60000,
+          "warmAgentTurnMs": 15000,
+          "coldPreProviderMs": 15000,
+          "providerFinalMs": 3000,
+          "agentCleanupMs": 5000,
+          "openclawSlowestSpanMs": 45000
+        }
+      },
+      "bundled-runtime-deps": {
+        "thresholds": {
+          "runtimeDepsStagingMs": 45000,
+          "warmRuntimeDepsRestageCount": 0,
+          "openclawSlowestSpanMs": 45000
+        }
+      },
+      "gateway-performance": {
+        "thresholds": {
+          "gatewayReadyMs": 60000,
+          "healthP95Ms": 1000,
+          "peakRssMb": 1000,
+          "openclawSlowestSpanMs": 45000
+        }
+      }
+    }
+  },
  "gate": {
    "id": "openclaw-diagnostic",
    "coverage": {
--- a/profiles/release.json
+++ b/profiles/release.json
@@ -2,6 +2,74 @@
  "id": "release",
  "title": "Release Matrix",
  "objective": "Broad OpenClaw release confidence across install, upgrade, bundled plugins, model/provider, UI, failure, soak, and platform smoke scenarios.",
+  "calibration": {
+    "roles": {
+      "gateway": { "peakRssMb": 900, "maxCpuPercent": 300 },
+      "command-tree": { "peakRssMb": 1200, "maxCpuPercent": 400 },
+      "runtime-management": { "peakRssMb": 900, "maxCpuPercent": 350 },
+      "package-manager": { "peakRssMb": 900, "maxCpuPercent": 350 },
+      "agent-cli": { "peakRssMb": 1100, "maxCpuPercent": 350 },
+      "agent-process": { "peakRssMb": 900, "maxCpuPercent": 300 },
+      "plugin-cli": { "peakRssMb": 650, "maxCpuPercent": 250 },
+      "model-cli": { "peakRssMb": 650, "maxCpuPercent": 250 },
+      "doctor-cli": { "peakRssMb": 700, "maxCpuPercent": 300 },
+      "tui-cli": { "peakRssMb": 650, "maxCpuPercent": 250 },
+      "dashboard-cli": { "peakRssMb": 650, "maxCpuPercent": 250 },
+      "browser-sidecar": { "peakRssMb": 500, "maxCpuPercent": 250 },
+      "mock-provider": { "peakRssMb": 300, "maxCpuPercent": 150 }
+    },
+    "surfaces": {
+      "release-runtime-startup": {
+        "thresholds": {
+          "gatewayReadyMs": 45000,
+          "gatewayReadyHardTimeoutMs": 120000,
+          "runtimeDepsStagingMs": 30000,
+          "missingDependencyErrors": 0,
+          "pluginLoadFailures": 0,
+          "peakRssMb": 950
+        }
+      },
+      "upgrade-existing-user": {
+        "thresholds": {
+          "upgradeMs": 180000,
+          "gatewayReadyMs": 60000,
+          "statusMs": 10000,
+          "missingDependencyErrors": 0,
+          "pluginLoadFailures": 0
+        }
+      },
+      "agent-message": {
+        "thresholds": {
+          "coldAgentTurnMs": 45000,
+          "warmAgentTurnMs": 15000,
+          "coldPreProviderMs": 10000,
+          "warmPreProviderMs": 2500,
+          "providerFinalMs": 3000,
+          "agentCleanupMs": 5000,
+          "agentProcessLeaks": 0
+        }
+      },
+      "bundled-runtime-deps": {
+        "thresholds": {
+          "runtimeDepsStagingMs": 30000,
+          "warmRuntimeDepsRestageCount": 0,
+          "warmRuntimeDepsStagingMs": 1000,
+          "missingDependencyErrors": 0,
+          "pluginLoadFailures": 0
+        }
+      },
+      "soak": {
+        "thresholds": {
+          "soakMinDurationMs": 60000,
+          "soakCommandFailures": 0,
+          "soakHealthFailures": 0,
+          "soakCommandP95Ms": 10000,
+          "soakHealthP95Ms": 1000,
+          "gatewayRssGrowthMb": 250
+        }
+      }
+    }
+  },
  "gate": {
    "id": "openclaw-release",
    "coverage": {
--- /dev/null
+++ b/src/evaluation/thresholds.mjs
@@ -0,0 +1,83 @@
+export function resolveThresholdPolicy({ profile = null, surface = null, scenario = null } = {}) {
+  const surfaceCalibration = profile?.calibration?.surfaces?.[surface?.id] ?? {};
+  const thresholds = mergeObjects(
+    surface?.thresholds,
+    surfaceCalibration.thresholds,
+    scenario?.thresholds
+  );
+  const roleThresholds = mergeRoleThresholds(
+    profile?.calibration?.roles,
+    surface?.roleThresholds,
+    surfaceCalibration.roleThresholds,
+    scenario?.thresholds?.roleThresholds
+  );
+
+  return {
+    thresholds,
+    roleThresholds,
+    report: {
+      schemaVersion: "kova.thresholdPolicy.v1",
+      profileId: profile?.id ?? null,
+      surfaceId: surface?.id ?? null,
+      scenarioId: scenario?.id ?? null,
+      sources: thresholdSources({ profile, surface, surfaceCalibration, scenario }),
+      thresholds,
+      roleThresholds
+    }
+  };
+}
+
+function thresholdSources({ profile, surface, surfaceCalibration, scenario }) {
+  const sources = [];
+  if (surface?.thresholds && Object.keys(surface.thresholds).length > 0) {
+    sources.push({ kind: "surface", id: surface.id, thresholds: Object.keys(surface.thresholds).sort() });
+  }
+  if (surface?.roleThresholds && Object.keys(surface.roleThresholds).length > 0) {
+    sources.push({ kind: "surface-role", id: surface.id, roles: Object.keys(surface.roleThresholds).sort() });
+  }
+  if (surfaceCalibration?.thresholds && Object.keys(surfaceCalibration.thresholds).length > 0) {
+    sources.push({ kind: "profile-surface", id: `${profile?.id}:${surface?.id}`, thresholds: Object.keys(surfaceCalibration.thresholds).sort() });
+  }
+  if (surfaceCalibration?.roleThresholds && Object.keys(surfaceCalibration.roleThresholds).length > 0) {
+    sources.push({ kind: "profile-surface-role", id: `${profile?.id}:${surface?.id}`, roles: Object.keys(surfaceCalibration.roleThresholds).sort() });
+  }
+  if (profile?.calibration?.roles && Object.keys(profile.calibration.roles).length > 0) {
+    sources.push({ kind: "profile-role", id: profile.id, roles: Object.keys(profile.calibration.roles).sort() });
+  }
+  if (scenario?.thresholds && Object.keys(scenario.thresholds).length > 0) {
+    sources.push({ kind: "scenario", id: scenario.id, thresholds: Object.keys(scenario.thresholds).sort() });
+  }
+  return sources;
+}
+
+function mergeObjects(...objects) {
+  const merged = {};
+  for (const object of objects) {
+    if (!object || typeof object !== "object" || Array.isArray(object)) {
+      continue;
+    }
+    for (const [key, value] of Object.entries(object)) {
+      if (key === "roleThresholds") {
+        continue;
+      }
+      merged[key] = value;
+    }
+  }
+  return merged;
+}
+
+function mergeRoleThresholds(...sets) {
+  const merged = {};
+  for (const set of sets) {
+    if (!set || typeof set !== "object" || Array.isArray(set)) {
+      continue;
+    }
+    for (const [role, thresholds] of Object.entries(set)) {
+      merged[role] = {
+        ...(merged[role] ?? {}),
+        ...(thresholds ?? {})
+      };
+    }
+  }
+  return merged;
+}
--- a/src/evaluator.mjs
+++ b/src/evaluator.mjs
@@ -1,11 +1,17 @@
 import { buildAgentTurnBreakdown } from "./collectors/agent-turns.mjs";
 import { computeProviderTurnAttribution } from "./collectors/provider.mjs";
 import { summarizeRuntimeDepsLogs } from "./collectors/logs.mjs";
+import { resolveThresholdPolicy } from "./evaluation/thresholds.mjs";

 export function evaluateRecord(record, scenario, options = {}) {
  const originalStatus = record.status;
-  const thresholds = { ...(options.surface?.thresholds ?? {}), ...(scenario.thresholds ?? {}) };
-  const roleThresholds = mergeRoleThresholds(options.surface?.roleThresholds, scenario.thresholds?.roleThresholds);
+  const thresholdPolicy = resolveThresholdPolicy({
+    profile: options.profile,
+    surface: options.surface,
+    scenario
+  });
+  const thresholds = thresholdPolicy.thresholds;
+  const roleThresholds = thresholdPolicy.roleThresholds;
  const violations = [];
  const allResults = collectResults(record);
  const resourceSummary = collectResourceSummary(allResults);
@@ -580,6 +586,7 @@ export function evaluateRecord(record, scenario, options = {}) {
      providerModelTimingMs
    })
  };
+  record.thresholdPolicy = thresholdPolicy.report;

  if (violations.length > 0) {
    if (originalStatus === "PASS") {
@@ -1648,17 +1655,6 @@ function checkRoleThresholds(violations, byRole, roleThresholds) {
  }
 }

-function mergeRoleThresholds(base, override) {
-  const merged = {};
-  for (const [sourceRole, sourceThresholds] of Object.entries(base ?? {})) {
-    merged[sourceRole] = { ...sourceThresholds };
-  }
-  for (const [sourceRole, sourceThresholds] of Object.entries(override ?? {})) {
-    merged[sourceRole] = { ...(merged[sourceRole] ?? {}), ...sourceThresholds };
-  }
-  return merged;
-}
-
 function collectResults(record) {
  const results = [];
  for (const phase of record.phases ?? []) {
--- a/src/main.mjs
+++ b/src/main.mjs
@@ -712,6 +712,10 @@ function profileSummary(profile) {
    entryCount: profile.entries.length,
    targetKinds: profile.targetKinds ?? null,
    diagnostics: profile.diagnostics ?? null,
+    calibration: profile.calibration ? {
+      surfaceCount: Object.keys(profile.calibration.surfaces ?? {}).length,
+      roleCount: Object.keys(profile.calibration.roles ?? {}).length
+    } : null,
    gate: profile.gate ? {
      id: profile.gate.id ?? `${profile.id}-gate`,
      blockingCount: Array.isArray(profile.gate.blocking) ? profile.gate.blocking.length : profile.entries.length,
--- a/src/registries/profiles.mjs
+++ b/src/registries/profiles.mjs
@@ -24,6 +24,7 @@ export function validateProfileShape(profile, sourceName = "profile") {
  requireArray(profile, "entries", errors);
  validateStringArray(profile.targetKinds, "targetKinds", errors, { optional: true });
  validateDiagnostics(profile.diagnostics, "diagnostics", errors);
+  validateCalibration(profile.calibration, "calibration", errors);
  validateEntries(profile.entries, errors);

  if (profile.gate !== undefined) {
@@ -33,6 +34,50 @@ export function validateProfileShape(profile, sourceName = "profile") {
  assertNoShapeErrors(errors, sourceName);
 }

+function validateCalibration(calibration, prefix, errors) {
+  if (calibration === undefined) {
+    return;
+  }
+  if (!calibration || typeof calibration !== "object" || Array.isArray(calibration)) {
+    errors.push(`${prefix} must be an object when set`);
+    return;
+  }
+  validateThresholdMap(calibration.roles, `${prefix}.roles`, errors, { keyed: true });
+  if (calibration.surfaces !== undefined) {
+    if (!calibration.surfaces || typeof calibration.surfaces !== "object" || Array.isArray(calibration.surfaces)) {
+      errors.push(`${prefix}.surfaces must be an object when set`);
+    } else {
+      for (const [surfaceId, surfaceCalibration] of Object.entries(calibration.surfaces)) {
+        if (!surfaceCalibration || typeof surfaceCalibration !== "object" || Array.isArray(surfaceCalibration)) {
+          errors.push(`${prefix}.surfaces.${surfaceId} must be an object`);
+          continue;
+        }
+        validateThresholdMap(surfaceCalibration.thresholds, `${prefix}.surfaces.${surfaceId}.thresholds`, errors);
+        validateThresholdMap(surfaceCalibration.roleThresholds, `${prefix}.surfaces.${surfaceId}.roleThresholds`, errors, { keyed: true });
+      }
+    }
+  }
+}
+
+function validateThresholdMap(map, prefix, errors, options = {}) {
+  if (map === undefined) {
+    return;
+  }
+  if (!map || typeof map !== "object" || Array.isArray(map)) {
+    errors.push(`${prefix} must be an object when set`);
+    return;
+  }
+  for (const [key, value] of Object.entries(map)) {
+    if (options.keyed) {
+      validateThresholdMap(value, `${prefix}.${key}`, errors);
+      continue;
+    }
+    if (typeof value !== "number" || !Number.isFinite(value)) {
+      errors.push(`${prefix}.${key} must be a finite number`);
+    }
+  }
+}
+
 function validateDiagnostics(diagnostics, prefix, errors) {
  if (diagnostics === undefined) {
    return;
--- a/src/registries/validate.mjs
+++ b/src/registries/validate.mjs
@@ -83,7 +83,7 @@ export function validateRegistryReferences({ scenarios, states, profiles, surfac
  }

  for (const profile of profiles) {
-    validateProfileReferences(profile, { scenarioIds, stateIds, surfaceIds, traitIds, scenarioById, stateById, surfaceById }, errors);
+    validateProfileReferences(profile, { scenarioIds, stateIds, surfaceIds, processRoleIds, metricIds, traitIds, scenarioById, stateById, surfaceById }, errors);
  }

  if (errors.length > 0) {
@@ -178,6 +178,35 @@ function validateProfileReferences(profile, refs, errors) {
  validateCoverageRefs(profile, refs, errors, "traits", refs.traitIds);
  validatePlatformCoverageRefs(profile, errors);
  validateStateSurfaceCoverageRefs(profile, refs, errors);
+  validateCalibrationRefs(profile, refs, errors);
+}
+
+function validateCalibrationRefs(profile, refs, errors) {
+  const calibration = profile.calibration;
+  if (!calibration) {
+    return;
+  }
+  for (const role of Object.keys(calibration.roles ?? {})) {
+    if (!refs.processRoleIds.has(role)) {
+      errors.push(`profile '${profile.id}' calibration.roles references unknown process role '${role}'`);
+      continue;
+    }
+    validateThresholdMetrics(calibration.roles[role], refs.metricIds, errors, `profile '${profile.id}' calibration.roles.${role}`);
+  }
+  for (const [surfaceId, surfaceCalibration] of Object.entries(calibration.surfaces ?? {})) {
+    if (!refs.surfaceIds.has(surfaceId)) {
+      errors.push(`profile '${profile.id}' calibration.surfaces references unknown surface '${surfaceId}'`);
+      continue;
+    }
+    validateThresholdMetrics(surfaceCalibration.thresholds ?? {}, refs.metricIds, errors, `profile '${profile.id}' calibration.surfaces.${surfaceId}.thresholds`);
+    for (const [role, thresholds] of Object.entries(surfaceCalibration.roleThresholds ?? {})) {
+      if (!refs.processRoleIds.has(role)) {
+        errors.push(`profile '${profile.id}' calibration.surfaces.${surfaceId}.roleThresholds references unknown process role '${role}'`);
+        continue;
+      }
+      validateThresholdMetrics(thresholds, refs.metricIds, errors, `profile '${profile.id}' calibration.surfaces.${surfaceId}.roleThresholds.${role}`);
+    }
+  }
 }

 function validatePlatformCoverageRefs(profile, errors) {
--- a/src/selfcheck.mjs
+++ b/src/selfcheck.mjs
@@ -92,8 +92,11 @@ export async function runSelfCheck(flags = {}) {
      assertEqual(data.coverage?.schemaVersion, "kova.coverage.v1", "coverage schema");
      assertArrayNotEmpty(data.coverage?.scenarioSurfaceMap, "scenario surface map");
      const releaseCoverage = data.coverage?.profiles?.find((profile) => profile.id === "release");
+      const releaseProfile = data.profiles?.find((profile) => profile.id === "release");
      assertArrayNotEmpty(releaseCoverage?.required?.platforms, "release required platform coverage");
      assertArrayNotEmpty(releaseCoverage?.currentPlatformKeys, "current platform coverage keys");
+      assertEqual((releaseProfile?.calibration?.surfaceCount ?? 0) > 0, true, "release profile calibrated surfaces");
+      assertEqual((releaseProfile?.calibration?.roleCount ?? 0) > 0, true, "release profile calibrated roles");
      if (data.scenarios.some((scenario) => typeof scenario.surface !== "string" || scenario.surface.length === 0)) {
        throw new Error("every scenario must expose a surface");
      }
@@ -167,6 +170,7 @@ export async function runSelfCheck(flags = {}) {
    checks.push(await resourceRoleAttributionCheck(tmp));
    checks.push(await processSnapshotCheck(tmp));
    checks.push(roleThresholdEvaluationCheck());
+    checks.push(thresholdPolicyCalibrationCheck());
    checks.push(stateRegistryValidationCheck());
    checks.push(scenarioStateCompatibilityCheck());
    checks.push(await cpuProfileParserCheck());
@@ -2751,6 +2755,99 @@ function roleThresholdEvaluationCheck() {
  }
 }

+function thresholdPolicyCalibrationCheck() {
+  try {
+    const record = {
+      scenario: "synthetic-threshold-policy",
+      title: "Synthetic Threshold Policy",
+      status: "PASS",
+      phases: [{
+        id: "sample",
+        results: [{
+          command: "ocm start kova-threshold-test",
+          status: 0,
+          durationMs: 150,
+          resourceSamples: {
+            schemaVersion: "kova.resourceSamples.v1",
+            sampleCount: 1,
+            peakTotalRssMb: 250,
+            maxTotalCpuPercent: 80,
+            byRole: {
+              gateway: {
+                peakRssMb: 250,
+                maxCpuPercent: 80,
+                peakRssAtMs: 10,
+                peakCpuAtMs: 10,
+                peakProcessCount: 1
+              }
+            },
+            topRolesByRss: [{ role: "gateway", peakRssMb: 250, maxCpuPercent: 80 }],
+            topRolesByCpu: [{ role: "gateway", peakRssMb: 250, maxCpuPercent: 80 }],
+            topByRss: [],
+            topByCpu: []
+          }
+        }],
+        metrics: { logs: zeroLogMetrics() }
+      }],
+      finalMetrics: {
+        service: { gatewayState: "running" },
+        logs: zeroLogMetrics()
+      }
+    };
+    evaluateRecord(record, {
+      id: "synthetic-threshold-policy",
+      thresholds: {}
+    }, {
+      profile: {
+        id: "release",
+        calibration: {
+          roles: {
+            gateway: { peakRssMb: 200 }
+          },
+          surfaces: {
+            "release-runtime-startup": {
+              thresholds: { coldReadyMs: 100 }
+            }
+          }
+        }
+      },
+      surface: {
+        id: "release-runtime-startup",
+        thresholds: { coldReadyMs: 1000 },
+        roleThresholds: {}
+      }
+    });
+    assertEqual(record.status, "FAIL", "profile calibration threshold should fail record");
+    assertEqual(record.thresholdPolicy?.profileId, "release", "threshold policy profile id");
+    assertEqual(record.thresholdPolicy?.thresholds?.coldReadyMs, 100, "profile surface threshold override");
+    assertEqual(record.thresholdPolicy?.roleThresholds?.gateway?.peakRssMb, 200, "profile role threshold");
+    assertEqual(
+      record.violations.some((violation) => violation.metric === "coldReadyMs"),
+      true,
+      "profile calibrated duration violation"
+    );
+    assertEqual(
+      record.violations.some((violation) => violation.metric === "resourceByRole.gateway.peakRssMb"),
+      true,
+      "profile calibrated role violation"
+    );
+    return {
+      id: "threshold-policy-calibration",
+      status: "PASS",
+      command: "evaluate synthetic profile threshold calibration",
+      durationMs: 0
+    };
+  } catch (error) {
+    return {
+      id: "threshold-policy-calibration",
+      status: "FAIL",
+      command: "evaluate synthetic profile threshold calibration",
+      durationMs: 0,
+      message: error.message
+    };
+  }
+}
+
 function stateRegistryValidationCheck() {
  try {
    let rejectedTrait = false;
@@ -2904,6 +3001,47 @@ function stateRegistryValidationCheck() {
    }
    assertEqual(rejectedMetric, true, "unknown scenario metric rejected");

+    let rejectedCalibration = false;
+    try {
+      validateRegistryReferences({
+        scenarios: [],
+        states: [],
+        profiles: [{
+          id: "profile",
+          entries: [],
+          calibration: {
+            roles: {
+              missingRole: { peakRssMb: 100 }
+            },
+            surfaces: {
+              missingSurface: {
+                thresholds: { peakRssMb: 100 }
+              },
+              knownSurface: {
+                thresholds: { madeUpMetric: 1 },
+                roleThresholds: {
+                  knownRole: { peakRssMb: 100 }
+                }
+              }
+            }
+          }
+        }],
+        surfaces: [{
+          id: "knownSurface",
+          processRoles: [],
+          requiredStates: [],
+          targetKinds: []
+        }],
+        processRoles: [{ id: "knownRole" }],
+        metrics: [{ id: "peakRssMb" }]
+      });
+    } catch (error) {
+      rejectedCalibration = /calibration\.roles references unknown process role/.test(error.message) &&
+        /calibration\.surfaces references unknown surface/.test(error.message) &&
+        /unknown metric 'madeUpMetric'/.test(error.message);
+    }
+    assertEqual(rejectedCalibration, true, "invalid profile calibration rejected");
+
    let rejectedPlatform = false;
    try {
      validateRegistryReferences({