diff --git a/.github/workflows/commit-review.yml b/.github/workflows/commit-review.yml index 253dc41de8..c9f8867c90 100644 --- a/.github/workflows/commit-review.yml +++ b/.github/workflows/commit-review.yml @@ -159,9 +159,13 @@ jobs: ENABLED: ${{ steps.mode.outputs.enabled }} SOURCE_REF: ${{ github.event.client_payload.ref || 'refs/heads/main' }} SETTLE_SECONDS: ${{ vars.CLAWSWEEPER_COMMIT_REVIEW_SETTLE_SECONDS || '60' }} - PAGE_SIZE: ${{ vars.CLAWSWEEPER_COMMIT_REVIEW_PAGE_SIZE || '6' }} + PAGE_SIZE: ${{ vars.CLAWSWEEPER_COMMIT_REVIEW_PAGE_SIZE || '' }} run: | set -euo pipefail + if [ -z "$PAGE_SIZE" ]; then + PAGE_SIZE="$(pnpm --dir clawsweeper run --silent workflow -- limit commit_review.page_size_default)" + fi + page_size_hard_cap="$(pnpm --dir clawsweeper run --silent workflow -- limit commit_review.page_size_hard_cap)" if [ "$ENABLED" = "false" ]; then { echo "matrix=[]" @@ -203,8 +207,8 @@ jobs: if [ "$PAGE_SIZE" -lt 1 ]; then PAGE_SIZE=1 fi - if [ "$PAGE_SIZE" -gt 256 ]; then - PAGE_SIZE=256 + if [ "$PAGE_SIZE" -gt "$page_size_hard_cap" ]; then + PAGE_SIZE="$page_size_hard_cap" fi if [ "$SETTLE_SECONDS" -gt 0 ]; then echo "Waiting ${SETTLE_SECONDS}s for target main to settle before selecting commits." diff --git a/.github/workflows/repair-issue-implementation-intake.yml b/.github/workflows/repair-issue-implementation-intake.yml index aec2fc6eb5..a3e15149f3 100644 --- a/.github/workflows/repair-issue-implementation-intake.yml +++ b/.github/workflows/repair-issue-implementation-intake.yml @@ -143,12 +143,15 @@ jobs: if: ${{ steps.prepare.outputs.should_repair == 'true' }} env: GH_TOKEN: ${{ steps.app_token.outputs.token }} - MAX_LIVE_WORKERS: ${{ vars.CLAWSWEEPER_AUTO_IMPLEMENT_MAX_LIVE_WORKERS || '40' }} + MAX_LIVE_WORKERS: ${{ vars.CLAWSWEEPER_AUTO_IMPLEMENT_MAX_LIVE_WORKERS || '' }} RUNNER: ${{ github.event.inputs.runner || vars.CLAWSWEEPER_WORKER_RUNNER || 'blacksmith-4vcpu-ubuntu-2404' }} EXECUTION_RUNNER: ${{ github.event.inputs.execution_runner || vars.CLAWSWEEPER_EXECUTION_RUNNER || 'blacksmith-16vcpu-ubuntu-2404' }} MODEL: ${{ github.event.inputs.model || vars.CLAWSWEEPER_MODEL || 'gpt-5.5' }} run: | set -euo pipefail + if [ -z "$MAX_LIVE_WORKERS" ]; then + MAX_LIVE_WORKERS="$(pnpm run --silent workflow -- limit repair_live_runs.issue_implementation_default)" + fi git pull --rebase pnpm run repair:dispatch -- "${{ steps.prepare.outputs.job_path }}" \ --mode autonomous \ diff --git a/.github/workflows/sweep.yml b/.github/workflows/sweep.yml index e4c0c342a3..2c67be1869 100644 --- a/.github/workflows/sweep.yml +++ b/.github/workflows/sweep.yml @@ -596,16 +596,24 @@ jobs: - id: mode run: | + limit() { + pnpm --dir clawsweeper run --silent workflow -- limit "$1" + } + exact_item_shards="$(limit review_shards.exact_item_default)" + hot_intake_shards="$(limit review_shards.hot_intake_default)" + normal_shards="$(limit review_shards.normal_default)" + normal_active_floor="$(limit review_shards.normal_active_floor)" + hard_shard_cap="$(limit review_shards.hard_cap)" hot_intake="${{ ((github.event_name == 'workflow_dispatch' && github.event.inputs.hot_intake == 'true') || (github.event_name == 'schedule' && (github.event.schedule == '*/5 * * * *' || github.event.schedule == '2/5 * * * *'))) && 'true' || 'false' }}" exact_item="${{ github.event.client_payload.item_number || github.event.inputs.item_number || github.event.inputs.item_numbers || '' }}" target_repo="${{ steps.target.outputs.target_repo }}" if [ "$hot_intake" = "true" ] && [ -n "$exact_item" ]; then batch_size="1" - shard_count="1" + shard_count="$exact_item_shards" max_pages="1" elif [ "$hot_intake" = "true" ]; then batch_size="1" - shard_count="40" + shard_count="$hot_intake_shards" max_pages="10" min_active_shards="0" min_backfill_review_age_minutes="30" @@ -616,11 +624,14 @@ jobs: batch_size="${{ github.event.inputs.batch_size || '3' }}" fi if [ "$target_repo" = "openclaw/openclaw" ]; then - min_active_shards="32" + min_active_shards="$normal_active_floor" else min_active_shards="0" fi - shard_count="${{ github.event.inputs.shard_count || '64' }}" + shard_count="${{ github.event.inputs.shard_count || '' }}" + if [ -z "$shard_count" ]; then + shard_count="$normal_shards" + fi max_pages="250" min_backfill_review_age_minutes="30" fi @@ -629,10 +640,10 @@ jobs: min_backfill_review_age_minutes="30" fi if ! [[ "$shard_count" =~ ^[0-9]+$ ]]; then - shard_count="64" + shard_count="$normal_shards" fi - if [ "$shard_count" -gt 100 ]; then - shard_count="100" + if [ "$shard_count" -gt "$hard_shard_cap" ]; then + shard_count="$hard_shard_cap" fi { echo "batch_size=$batch_size" @@ -1001,9 +1012,12 @@ jobs: GH_TOKEN: ${{ github.token }} TARGET_REPO: ${{ needs.plan.outputs.target_repo }} ENABLED: ${{ vars.CLAWSWEEPER_AUTO_IMPLEMENT_REPRO_BUGS == '1' && 'true' || 'false' }} - MAX_DISPATCH: ${{ vars.CLAWSWEEPER_AUTO_IMPLEMENT_MAX_DISPATCH_PER_SWEEP || '4' }} + MAX_DISPATCH: ${{ vars.CLAWSWEEPER_AUTO_IMPLEMENT_MAX_DISPATCH_PER_SWEEP || '' }} run: | set -euo pipefail + if [ -z "$MAX_DISPATCH" ]; then + MAX_DISPATCH="$(pnpm run --silent workflow -- limit issue_implementation.dispatches_per_sweep_default)" + fi candidate_output="$(pnpm run --silent repair:issue-implementation-intake -- candidates \ --enabled "$ENABLED" \ --target-repo "$TARGET_REPO" \ @@ -1020,7 +1034,7 @@ jobs: fi CANDIDATES_JSON="$candidates_json" MAX_DISPATCH="$MAX_DISPATCH" node <<'NODE' > /tmp/issue-implementation-candidates.tsv const candidates = JSON.parse(process.env.CANDIDATES_JSON || "[]"); - const limit = Math.max(0, Number(process.env.MAX_DISPATCH || "4")); + const limit = Math.max(0, Number(process.env.MAX_DISPATCH || "0")); for (const candidate of candidates.slice(0, limit)) { console.log([ candidate.item_number, @@ -1721,6 +1735,8 @@ jobs: GH_TOKEN: ${{ github.token }} run: | set -euo pipefail + hot_intake_shards="$(pnpm run --silent workflow -- limit review_shards.hot_intake_default)" + normal_shards="$(pnpm run --silent workflow -- limit review_shards.normal_default)" runs_json="$(gh run list --repo "${{ github.repository }}" --workflow sweep.yml --limit 80 --json displayTitle,status,createdAt)" eval "$( RUNS_JSON="$runs_json" node <<'NODE' @@ -1762,7 +1778,7 @@ jobs: -f hot_intake=true \ -f target_repo=openclaw/openclaw \ -f batch_size=1 \ - -f shard_count=40 \ + -f shard_count="$hot_intake_shards" \ -f codex_timeout_ms=600000 fi @@ -1774,6 +1790,6 @@ jobs: -f hot_intake=false \ -f target_repo=openclaw/openclaw \ -f batch_size=3 \ - -f shard_count=64 \ + -f shard_count="$normal_shards" \ -f codex_timeout_ms=600000 fi diff --git a/CHANGELOG.md b/CHANGELOG.md index cdf42418df..c137a5ccb4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ checkpoint, and status-only commits are intentionally omitted. ### Added +- Added `config/automation-limits.json` plus docs and a drift check so review, + commit-review, repair, and issue-implementation capacity defaults have one + checked-in source of truth. - Added a generated 1200x630 social preview card plus large-image Open Graph and Twitter metadata for the docs site. @@ -37,6 +40,9 @@ checkpoint, and status-only commits are intentionally omitted. and push timeouts as blocked repair outcomes. - Skipped self-heal repair redispatches when the same repair job is already queued or running, avoiding duplicate pending workers for active PR repairs. +- Let self-heal rediscover recent failed repair workers from live GitHub run + metadata when a hard execute failure happens before durable run records are + published. - Expanded validation-failure detail passed into Codex repair follow-up prompts so lint/typecheck failures keep the actionable diagnostic instead of only the package-manager epilogue. diff --git a/README.md b/README.md index 6175050b2b..f5bb4a8da0 100644 --- a/README.md +++ b/README.md @@ -458,7 +458,8 @@ reports live under `records/openclaw-clawhub/` without colliding with default repo records. `openclaw/clawsweeper` has a scheduled read-only audit row and is available for manual and event self-review smoke tests. Broad hot-intake sweeps cap scheduled fan-out at 40 one-item shards per run; exact event reviews still -use one shard. +use one shard. Throughput defaults live in +[docs/limits.md](docs/limits.md) and `config/automation-limits.json`. Target repositories can opt into event-level latency by installing the dispatcher workflow in [docs/target-dispatcher.md](docs/target-dispatcher.md). diff --git a/config/automation-limits.json b/config/automation-limits.json new file mode 100644 index 0000000000..de57679f6a --- /dev/null +++ b/config/automation-limits.json @@ -0,0 +1,22 @@ +{ + "review_shards": { + "normal_default": 64, + "normal_active_floor": 32, + "hot_intake_default": 40, + "exact_item_default": 1, + "hard_cap": 100 + }, + "commit_review": { + "page_size_default": 6, + "page_size_hard_cap": 256 + }, + "repair_live_runs": { + "default": 40, + "hard_cap": 100, + "automerge_default": 40, + "issue_implementation_default": 40 + }, + "issue_implementation": { + "dispatches_per_sweep_default": 4 + } +} diff --git a/docs/commit-dispatcher.md b/docs/commit-dispatcher.md index fa2c686e80..cfb97befc0 100644 --- a/docs/commit-dispatcher.md +++ b/docs/commit-dispatcher.md @@ -103,8 +103,10 @@ CLAWSWEEPER_COMMIT_REVIEW_SETTLE_SECONDS=60 Use `0` for settled manual backfills or a larger value during GitHub event lag incidents. -Commit review runs at most 6 Codex workers per workflow page by default. Adjust -on `openclaw/clawsweeper` only when the org has enough rate-limit headroom: +Commit review runs at most 6 Codex workers per workflow page by default. The +checked-in default lives in `config/automation-limits.json`; adjust the live +workflow on `openclaw/clawsweeper` only when the org has enough rate-limit +headroom: ```text CLAWSWEEPER_COMMIT_REVIEW_PAGE_SIZE=6 diff --git a/docs/commit-sweeper.md b/docs/commit-sweeper.md index e63b863ff3..208ad0085d 100644 --- a/docs/commit-sweeper.md +++ b/docs/commit-sweeper.md @@ -90,9 +90,9 @@ by SHA/range rather than detaching the whole target repository at the commit. ## Scaling -Commit Sweeper defaults to 6 commits per workflow page. The receiver clamps -`CLAWSWEEPER_COMMIT_REVIEW_PAGE_SIZE` between 1 and 256, then pages large -ranges: +Commit Sweeper defaults to 6 commits per workflow page. The checked-in default +lives in `config/automation-limits.json`. The receiver clamps +`CLAWSWEEPER_COMMIT_REVIEW_PAGE_SIZE` between 1 and 256, then pages large ranges: - select up to the configured page size - classify them cheaply diff --git a/docs/limits.md b/docs/limits.md new file mode 100644 index 0000000000..7365155818 --- /dev/null +++ b/docs/limits.md @@ -0,0 +1,47 @@ +# Automation Limits + +Read when changing ClawSweeper throughput, Codex fan-out, commit review paging, +or repair dispatch capacity. + +`config/automation-limits.json` is the source of truth for automation capacity +defaults. It covers throughput and fan-out limits only. Safety thresholds such +as close age floors, apply delays, retry counts, and comment caps stay near the +code that owns those decisions. + +GitHub repository variables still override these defaults in live workflows. +When a variable is unset, workflows read the checked-in limit after checkout. +The one exception is the `workflow_dispatch.inputs.shard_count.default` value in +`.github/workflows/sweep.yml`: GitHub renders that UI before checkout, so it +must remain a YAML literal. `pnpm run check:limits` verifies that literal and +the docs stay in sync with `config/automation-limits.json`. + +## Names + +| Name | Current | Meaning | +| --- | ---: | --- | +| `review_shards.normal_default` | 64 | Default normal review shard jobs per sweep. | +| `review_shards.normal_active_floor` | 32 | Minimum active normal review shards to keep queued for `openclaw/openclaw`. | +| `review_shards.hot_intake_default` | 40 | Broad hot-intake review shard jobs. | +| `review_shards.exact_item_default` | 1 | Exact-item hot-intake shard count. | +| `review_shards.hard_cap` | 100 | Maximum accepted review shard count. | +| `commit_review.page_size_default` | 6 | Commits selected per commit-review page. | +| `commit_review.page_size_hard_cap` | 256 | Maximum commit-review page size. | +| `repair_live_runs.default` | 40 | Default live repair workflow run cap for manual dispatch/requeue/self-heal. | +| `repair_live_runs.hard_cap` | 100 | Absolute live repair run cap accepted by the CLI. | +| `repair_live_runs.automerge_default` | 40 | Live repair run cap for automerge comment-router dispatches. | +| `repair_live_runs.issue_implementation_default` | 40 | Live repair run cap for issue-to-PR implementation intake. | +| `issue_implementation.dispatches_per_sweep_default` | 4 | Maximum implementation intake jobs queued from one review publish run. | + +## Runtime Overrides + +- `CLAWSWEEPER_COMMIT_REVIEW_PAGE_SIZE` overrides + `commit_review.page_size_default`. +- `CLAWSWEEPER_MAX_LIVE_WORKERS` overrides `repair_live_runs.default`. +- `CLAWSWEEPER_AUTOMERGE_MAX_LIVE_WORKERS` overrides + `repair_live_runs.automerge_default`. +- `CLAWSWEEPER_AUTO_IMPLEMENT_MAX_LIVE_WORKERS` overrides + `repair_live_runs.issue_implementation_default`. +- `CLAWSWEEPER_AUTO_IMPLEMENT_MAX_DISPATCH_PER_SWEEP` overrides + `issue_implementation.dispatches_per_sweep_default`. +- Manual `sweep.yml` dispatch `shard_count` overrides + `review_shards.normal_default`, then clamps to `review_shards.hard_cap`. diff --git a/docs/repair/README.md b/docs/repair/README.md index 256990878f..fbeb8f0f9b 100644 --- a/docs/repair/README.md +++ b/docs/repair/README.md @@ -230,7 +230,8 @@ pnpm run repair:import-gitcrawl-low-signal -- --limit 20 --batch-size 5 --mode a pnpm run repair:import-gitcrawl -- --from-gitcrawl --limit 40 --mode autonomous --suffix autonomous-smoke --allow-instant-close --allow-merge --allow-fix-pr --allow-post-merge-close # Dispatch reviewed jobs. Dispatch, requeue, and self-heal refuse to exceed -# 40 live cluster-worker runs by default; tune with CLAWSWEEPER_MAX_LIVE_WORKERS +# 40 live cluster-worker runs by default. The checked-in default lives in +# config/automation-limits.json; tune live runs with CLAWSWEEPER_MAX_LIVE_WORKERS # or --max-live-workers. With --wait-for-capacity, dispatch can drain a larger # file list in capacity-sized waves instead of refusing the whole batch. CLAWSWEEPER_MAX_LIVE_WORKERS=40 pnpm run repair:dispatch -- jobs/openclaw/inbox/cluster-example.md \ diff --git a/docs/scheduler.md b/docs/scheduler.md index 9c6f827d02..22a0380262 100644 --- a/docs/scheduler.md +++ b/docs/scheduler.md @@ -4,6 +4,9 @@ Read when changing `.github/workflows/sweep.yml`, `src/clawsweeper.ts` planner selection, review cadence, dashboard capacity fields, or GitHub Actions concurrency for issue/PR review and apply. +Throughput defaults come from `config/automation-limits.json`; see +[Automation Limits](limits.md) for the naming and GitHub variable overrides. + ClawSweeper has three issue/PR scheduler paths: - exact event review for one target issue or pull request @@ -128,7 +131,7 @@ Capacity is shard-level. A review shard processes its selected item numbers sequentially, so maximum concurrent Codex sessions equals the number of nonempty review shard jobs, not `batch_size * shard_count`. -Defaults: +Current defaults: - exact event review: 1 shard, 1 item - exact manual hot intake: 1 shard, 1 item diff --git a/package.json b/package.json index 1827b42eeb..2ea2f65bad 100644 --- a/package.json +++ b/package.json @@ -58,15 +58,16 @@ "test:coverage": "pnpm run build:all && node --test --experimental-test-coverage --test-coverage-include='dist/**/*.js' --test-coverage-exclude='dist/repair/*.test.js' --test-coverage-lines=49 --test-coverage-branches=66 --test-coverage-functions=57 test/*.test.ts test/repair/*.test.ts dist/repair/*.test.js", "test:coverage:changed": "pnpm run build:all && node --test --experimental-test-coverage --test-coverage-include='dist/repair/fix-prompt-builder.js' --test-coverage-lines=85 --test-coverage-branches=85 --test-coverage-functions=85 test/repair/*.test.ts dist/repair/*.test.js", "check:active-surface": "node scripts/check-active-surface.ts", + "check:limits": "node scripts/check-limits.ts", "lint": "pnpm run lint:src && pnpm run lint:repair && pnpm run lint:scripts", "lint:src": "oxlint src/*.ts --tsconfig tsconfig.json --type-aware --deny-warnings --report-unused-disable-directives -D correctness", "lint:repair": "oxlint src/repair --tsconfig tsconfig.repair.json --deny-warnings --report-unused-disable-directives -D correctness", "lint:scripts": "oxlint scripts test --deny-warnings --report-unused-disable-directives -D correctness", - "format": "oxfmt --write src scripts test package.json tsconfig.json tsconfig.repair.json .oxfmtrc.json schema .github/actions .github/workflows", - "format:check": "oxfmt --check src scripts test package.json tsconfig.json tsconfig.repair.json .oxfmtrc.json schema .github/actions .github/workflows", + "format": "oxfmt --write src scripts test package.json tsconfig.json tsconfig.repair.json .oxfmtrc.json config schema .github/actions .github/workflows", + "format:check": "oxfmt --check src scripts test package.json tsconfig.json tsconfig.repair.json .oxfmtrc.json config schema .github/actions .github/workflows", "oxformat": "pnpm run format", "oxformat:check": "pnpm run format:check", - "check": "pnpm run check:active-surface && pnpm run build:all && pnpm run lint && pnpm run test:unit && pnpm run test:repair && pnpm run test:coverage:changed && pnpm run test:coverage && pnpm run format:check" + "check": "pnpm run check:active-surface && pnpm run check:limits && pnpm run build:all && pnpm run lint && pnpm run test:unit && pnpm run test:repair && pnpm run test:coverage:changed && pnpm run test:coverage && pnpm run format:check" }, "devDependencies": { "@types/node": "^25.6.0", diff --git a/scripts/check-active-surface.ts b/scripts/check-active-surface.ts index 5fac64b705..91c34958ec 100644 --- a/scripts/check-active-surface.ts +++ b/scripts/check-active-surface.ts @@ -5,6 +5,7 @@ import path from "node:path"; const root = process.cwd(); const activeRoots: string[] = [ ".github/workflows", + "config", "src", "test", "docs", diff --git a/scripts/check-limits.ts b/scripts/check-limits.ts new file mode 100644 index 0000000000..7cdf6bdcf0 --- /dev/null +++ b/scripts/check-limits.ts @@ -0,0 +1,127 @@ +#!/usr/bin/env node +import fs from "node:fs"; +import path from "node:path"; + +type AutomationLimits = { + review_shards: { + normal_default: number; + normal_active_floor: number; + hot_intake_default: number; + exact_item_default: number; + hard_cap: number; + }; + commit_review: { + page_size_default: number; + page_size_hard_cap: number; + }; + repair_live_runs: { + default: number; + hard_cap: number; + automerge_default: number; + issue_implementation_default: number; + }; + issue_implementation: { + dispatches_per_sweep_default: number; + }; +}; + +const root = process.cwd(); +const limits = JSON.parse( + fs.readFileSync(path.join(root, "config", "automation-limits.json"), "utf8"), +) as AutomationLimits; + +const expectations: { file: string; label: string; pattern: RegExp }[] = [ + { + file: ".github/workflows/sweep.yml", + label: "manual workflow_dispatch shard_count default", + pattern: new RegExp( + `shard_count:[\\s\\S]{0,180}default: "${limits.review_shards.normal_default}"`, + ), + }, + { + file: "README.md", + label: "manual plan shard-count example", + pattern: new RegExp(`--shard-count ${limits.review_shards.normal_default}\\b`), + }, + { + file: "docs/commit-dispatcher.md", + label: "commit review page size env example", + pattern: new RegExp( + `CLAWSWEEPER_COMMIT_REVIEW_PAGE_SIZE=${limits.commit_review.page_size_default}\\b`, + ), + }, + { + file: "docs/commit-sweeper.md", + label: "commit review page size default", + pattern: new RegExp(`defaults to ${limits.commit_review.page_size_default}\\b`), + }, + { + file: "docs/repair/README.md", + label: "repair live run default", + pattern: new RegExp(`CLAWSWEEPER_MAX_LIVE_WORKERS=${limits.repair_live_runs.default}\\b`), + }, + { + file: "docs/scheduler.md", + label: "normal review shard default", + pattern: new RegExp(`${limits.review_shards.normal_default} concurrent Codex\\s+review shards`), + }, + { + file: "docs/scheduler.md", + label: "normal active shard floor", + pattern: new RegExp(`fewer than ${limits.review_shards.normal_active_floor} items are due`), + }, + { + file: "docs/scheduler.md", + label: "hot intake shard default", + pattern: new RegExp(`broad hot intake: ${limits.review_shards.hot_intake_default} shards`), + }, + { + file: "docs/limits.md", + label: "limits documentation references source file", + pattern: /config\/automation-limits\.json/, + }, +]; + +for (const [limitPath, value] of Object.entries(flattenLimits(limits))) { + expectations.push({ + file: "docs/limits.md", + label: `${limitPath} documented current value`, + pattern: new RegExp(`\\| \`${escapeRegExp(limitPath)}\` \\| ${value} \\|`), + }); +} + +const missing: string[] = []; +for (const expectation of expectations) { + const text = fs.readFileSync(path.join(root, expectation.file), "utf8"); + if (!expectation.pattern.test(text)) { + missing.push(`${expectation.file}: ${expectation.label}`); + } +} + +if (missing.length > 0) { + console.error("Automation limits drift check failed:"); + for (const item of missing) console.error(`- ${item}`); + process.exit(1); +} + +function flattenLimits(value: unknown, prefix = ""): Record { + const out: Record = {}; + if (!isRecord(value)) return out; + for (const [key, child] of Object.entries(value)) { + const childPath = prefix ? `${prefix}.${key}` : key; + if (Number.isInteger(child)) { + out[childPath] = child; + } else { + Object.assign(out, flattenLimits(child, childPath)); + } + } + return out; +} + +function isRecord(value: unknown): value is Record { + return typeof value === "object" && value !== null && !Array.isArray(value); +} + +function escapeRegExp(value: string): string { + return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); +} diff --git a/src/clawsweeper.ts b/src/clawsweeper.ts index df62cdaa39..771b1af579 100644 --- a/src/clawsweeper.ts +++ b/src/clawsweeper.ts @@ -36,6 +36,7 @@ import { import { parseGhJson, parseGhJsonLines } from "./github-json.js"; import { stableJson } from "./stable-json.js"; import { runText } from "./command.js"; +import { AUTOMATION_LIMITS } from "./limits.js"; import { boolArg, itemNumbersArg, @@ -463,8 +464,8 @@ interface PlanCandidateResult { } const DEFAULT_PLAN_BATCH_SIZE = 3; -const DEFAULT_PLAN_SHARD_COUNT = 64; -const MAX_PLAN_SHARD_COUNT = 100; +const DEFAULT_PLAN_SHARD_COUNT = AUTOMATION_LIMITS.review_shards.normal_default; +const MAX_PLAN_SHARD_COUNT = AUTOMATION_LIMITS.review_shards.hard_cap; type SchedulerBucket = | "hot_issue" diff --git a/src/limits.ts b/src/limits.ts new file mode 100644 index 0000000000..1b116ed41d --- /dev/null +++ b/src/limits.ts @@ -0,0 +1,95 @@ +import { readFileSync } from "node:fs"; +import { dirname, join, resolve } from "node:path"; +import { fileURLToPath } from "node:url"; + +export type AutomationLimits = { + review_shards: { + normal_default: number; + normal_active_floor: number; + hot_intake_default: number; + exact_item_default: number; + hard_cap: number; + }; + commit_review: { + page_size_default: number; + page_size_hard_cap: number; + }; + repair_live_runs: { + default: number; + hard_cap: number; + automerge_default: number; + issue_implementation_default: number; + }; + issue_implementation: { + dispatches_per_sweep_default: number; + }; +}; + +export const AUTOMATION_LIMITS = readAutomationLimits(); + +export function readAutomationLimits( + filePath = join(repoRoot(), "config", "automation-limits.json"), +): AutomationLimits { + const parsed = JSON.parse(readFileSync(filePath, "utf8")) as unknown; + return validateAutomationLimits(parsed); +} + +function validateAutomationLimits(value: unknown): AutomationLimits { + if (!isRecord(value)) throw new Error("automation limits must be an object"); + const limits = value as Record; + return { + review_shards: { + normal_default: positiveInteger(limits, "review_shards.normal_default"), + normal_active_floor: positiveInteger(limits, "review_shards.normal_active_floor"), + hot_intake_default: positiveInteger(limits, "review_shards.hot_intake_default"), + exact_item_default: positiveInteger(limits, "review_shards.exact_item_default"), + hard_cap: positiveInteger(limits, "review_shards.hard_cap"), + }, + commit_review: { + page_size_default: positiveInteger(limits, "commit_review.page_size_default"), + page_size_hard_cap: positiveInteger(limits, "commit_review.page_size_hard_cap"), + }, + repair_live_runs: { + default: positiveInteger(limits, "repair_live_runs.default"), + hard_cap: positiveInteger(limits, "repair_live_runs.hard_cap"), + automerge_default: positiveInteger(limits, "repair_live_runs.automerge_default"), + issue_implementation_default: positiveInteger( + limits, + "repair_live_runs.issue_implementation_default", + ), + }, + issue_implementation: { + dispatches_per_sweep_default: positiveInteger( + limits, + "issue_implementation.dispatches_per_sweep_default", + ), + }, + }; +} + +function positiveInteger(root: Record, path: string): number { + const value = getPath(root, path); + if (typeof value !== "number" || !Number.isInteger(value) || value < 1) { + throw new Error(`automation limit ${path} must be a positive integer`); + } + return value; +} + +function getPath(root: Record, path: string): unknown { + let cursor: unknown = root; + for (const segment of path.split(".")) { + if (!isRecord(cursor) || !(segment in cursor)) { + throw new Error(`automation limit ${path} is missing`); + } + cursor = cursor[segment]; + } + return cursor; +} + +function isRecord(value: unknown): value is Record { + return typeof value === "object" && value !== null && !Array.isArray(value); +} + +function repoRoot(): string { + return resolve(dirname(fileURLToPath(import.meta.url)), ".."); +} diff --git a/src/repair/config.ts b/src/repair/config.ts index 33d49658eb..ad2ca6b9e6 100644 --- a/src/repair/config.ts +++ b/src/repair/config.ts @@ -2,6 +2,7 @@ import type { JsonValue, LooseRecord } from "./json-types.js"; import { DEFAULT_ALLOWED_REPOSITORY_PERMISSIONS } from "./comment-router-core.js"; import { currentProjectRepo, readMaxLiveWorkers } from "./lib.js"; import { assertRepo, commaSet, positiveInteger } from "./comment-router-utils.js"; +import { AUTOMATION_LIMITS } from "./limits.js"; import { DEFAULT_HEAD_PREFIX, DEFAULT_TARGET_REPO, @@ -110,7 +111,7 @@ export function readCommentRouterConfig(args: LooseRecord): CommentRouterConfig "max-live-workers": args["automerge-max-live-workers"] ?? process.env.CLAWSWEEPER_AUTOMERGE_MAX_LIVE_WORKERS ?? - 40, + AUTOMATION_LIMITS.repair_live_runs.automerge_default, }), automergeRunNamePrefix: stringSetting( args["automerge-run-name-prefix"] ?? process.env.CLAWSWEEPER_AUTOMERGE_RUN_NAME_PREFIX, diff --git a/src/repair/dispatch-jobs.ts b/src/repair/dispatch-jobs.ts index 77150f512f..cddeb73ba5 100755 --- a/src/repair/dispatch-jobs.ts +++ b/src/repair/dispatch-jobs.ts @@ -17,6 +17,7 @@ import { } from "./lib.js"; import { sleepMs } from "./timing.js"; import { REPAIR_CLUSTER_WORKFLOW } from "./constants.js"; +import { AUTOMATION_LIMITS } from "./limits.js"; const args = parseArgs(process.argv.slice(2)); const defaultRunner = process.env.CLAWSWEEPER_WORKER_RUNNER ?? "blacksmith-4vcpu-ubuntu-2404"; @@ -36,7 +37,7 @@ const activeRepairRunsByPrefix = new Map(); if (files.length === 0) { console.error( - "usage: node scripts/dispatch-jobs.ts [...] [--mode plan|execute|autonomous] [--runner label] [--execution-runner label] [--model model] [--max-live-workers 40] [--wait-for-capacity]", + `usage: node scripts/dispatch-jobs.ts [...] [--mode plan|execute|autonomous] [--runner label] [--execution-runner label] [--model model] [--max-live-workers ${AUTOMATION_LIMITS.repair_live_runs.default}] [--wait-for-capacity]`, ); process.exit(2); } diff --git a/src/repair/limits.ts b/src/repair/limits.ts new file mode 100644 index 0000000000..3f36bd7dab --- /dev/null +++ b/src/repair/limits.ts @@ -0,0 +1,91 @@ +import { readFileSync } from "node:fs"; +import { join } from "node:path"; +import { repoRoot } from "./paths.js"; + +export type AutomationLimits = { + review_shards: { + normal_default: number; + normal_active_floor: number; + hot_intake_default: number; + exact_item_default: number; + hard_cap: number; + }; + commit_review: { + page_size_default: number; + page_size_hard_cap: number; + }; + repair_live_runs: { + default: number; + hard_cap: number; + automerge_default: number; + issue_implementation_default: number; + }; + issue_implementation: { + dispatches_per_sweep_default: number; + }; +}; + +export const AUTOMATION_LIMITS = readAutomationLimits(); + +export function readAutomationLimits( + filePath = join(repoRoot(), "config", "automation-limits.json"), +): AutomationLimits { + const parsed = JSON.parse(readFileSync(filePath, "utf8")) as unknown; + return validateAutomationLimits(parsed); +} + +function validateAutomationLimits(value: unknown): AutomationLimits { + if (!isRecord(value)) throw new Error("automation limits must be an object"); + const limits = value as Record; + return { + review_shards: { + normal_default: positiveInteger(limits, "review_shards.normal_default"), + normal_active_floor: positiveInteger(limits, "review_shards.normal_active_floor"), + hot_intake_default: positiveInteger(limits, "review_shards.hot_intake_default"), + exact_item_default: positiveInteger(limits, "review_shards.exact_item_default"), + hard_cap: positiveInteger(limits, "review_shards.hard_cap"), + }, + commit_review: { + page_size_default: positiveInteger(limits, "commit_review.page_size_default"), + page_size_hard_cap: positiveInteger(limits, "commit_review.page_size_hard_cap"), + }, + repair_live_runs: { + default: positiveInteger(limits, "repair_live_runs.default"), + hard_cap: positiveInteger(limits, "repair_live_runs.hard_cap"), + automerge_default: positiveInteger(limits, "repair_live_runs.automerge_default"), + issue_implementation_default: positiveInteger( + limits, + "repair_live_runs.issue_implementation_default", + ), + }, + issue_implementation: { + dispatches_per_sweep_default: positiveInteger( + limits, + "issue_implementation.dispatches_per_sweep_default", + ), + }, + }; +} + +function positiveInteger(root: Record, path: string): number { + const value = getPath(root, path); + if (typeof value !== "number" || !Number.isInteger(value) || value < 1) { + throw new Error(`automation limit ${path} must be a positive integer`); + } + return value; +} + +function getPath(root: Record, path: string): unknown { + let cursor: unknown = root; + for (const segment of path.split(".")) { + if (!isRecord(cursor) || !(segment in cursor)) { + throw new Error(`automation limit ${path} is missing`); + } + cursor = cursor[segment]; + } + return cursor; +} + +function isRecord(value: unknown): value is Record { + return typeof value === "object" && value !== null && !Array.isArray(value); +} diff --git a/src/repair/live-worker-capacity.ts b/src/repair/live-worker-capacity.ts index c4c7849f2e..3fcdfc0039 100644 --- a/src/repair/live-worker-capacity.ts +++ b/src/repair/live-worker-capacity.ts @@ -1,11 +1,12 @@ import { ghJson } from "./github-cli.js"; import type { JsonValue, LooseRecord } from "./json-types.js"; import { REPAIR_CLUSTER_WORKFLOW } from "./constants.js"; +import { AUTOMATION_LIMITS } from "./limits.js"; import { currentProjectRepo } from "./project-repo.js"; import { sleepMs } from "./timing.js"; -const DEFAULT_MAX_LIVE_WORKERS = 40; -export const MAX_LIVE_WORKERS = 100; +const DEFAULT_MAX_LIVE_WORKERS = AUTOMATION_LIMITS.repair_live_runs.default; +export const MAX_LIVE_WORKERS = AUTOMATION_LIMITS.repair_live_runs.hard_cap; export const DEFAULT_AUTOMERGE_REPAIR_RUN_NAME_PREFIX = "automerge repair "; export const DEFAULT_REPAIR_RUN_NAME_PREFIX = "repair cluster "; const DEFAULT_CAPACITY_POLL_MS = 30_000; diff --git a/src/repair/requeue-job.ts b/src/repair/requeue-job.ts index c415c9fedf..20ef1ec595 100644 --- a/src/repair/requeue-job.ts +++ b/src/repair/requeue-job.ts @@ -17,6 +17,7 @@ import { import { ghJson, ghText } from "./github-cli.js"; import { sleepMs } from "./timing.js"; import { REPAIR_CLUSTER_WORKFLOW } from "./constants.js"; +import { AUTOMATION_LIMITS } from "./limits.js"; const DEFAULT_REPO = currentProjectRepo(); const DEFAULT_WORKFLOW = REPAIR_CLUSTER_WORKFLOW; @@ -46,7 +47,7 @@ const resolved = requestedRunId if (!resolved.source_job) { console.error( - "usage: node scripts/requeue-job.ts [--mode plan|execute|autonomous] [--execute] [--open-execute-window] [--runner label] [--execution-runner label] [--model model] [--max-live-workers 40] [--wait-for-capacity]", + `usage: node scripts/requeue-job.ts [--mode plan|execute|autonomous] [--execute] [--open-execute-window] [--runner label] [--execution-runner label] [--model model] [--max-live-workers ${AUTOMATION_LIMITS.repair_live_runs.default}] [--wait-for-capacity]`, ); process.exit(2); } diff --git a/src/repair/workflow-utils.ts b/src/repair/workflow-utils.ts index 0e53771f1b..075dea1f80 100644 --- a/src/repair/workflow-utils.ts +++ b/src/repair/workflow-utils.ts @@ -5,6 +5,7 @@ import path from "node:path"; import { pathToFileURL } from "node:url"; import { parseArgs } from "./lib.js"; import { isJsonObject } from "./json-types.js"; +import { AUTOMATION_LIMITS } from "./limits.js"; type ApplyAction = { action: string; @@ -49,6 +50,9 @@ function runCli(): void { case "count-requeue-required": console.log(countRequeueRequired(requiredString("dir"))); break; + case "limit": + process.stdout.write(String(automationLimit(requiredString("path")))); + break; case "proposed-item-numbers": process.stdout.write(proposedItemNumbers(proposedItemOptions()).join(",")); break; @@ -60,10 +64,28 @@ function runCli(): void { } } +export function automationLimit(limitPath: string): number { + let cursor: unknown = AUTOMATION_LIMITS; + for (const segment of limitPath.split(".")) { + if (!segment) throw new Error(`invalid automation limit path: ${limitPath}`); + if (!isJsonObject(cursor) || !(segment in cursor)) { + throw new Error(`unknown automation limit: ${limitPath}`); + } + cursor = cursor[segment]; + } + if (typeof cursor !== "number" || !Number.isInteger(cursor) || cursor < 1) { + throw new Error(`automation limit ${limitPath} must resolve to a positive integer`); + } + return cursor; +} + function printPlanOutput(): void { const plan = readJsonObject(requiredString("plan")); const batchSize = positiveNumber(optionalString("batch-size"), 5); - const shardCount = positiveNumber(optionalString("shard-count"), 64); + const shardCount = positiveNumber( + optionalString("shard-count"), + AUTOMATION_LIMITS.review_shards.normal_default, + ); printOutput(planOutputFields(plan, { batchSize, shardCount })); } diff --git a/test/repair/workflow-utils.test.ts b/test/repair/workflow-utils.test.ts index 459700317a..b92d6df5e8 100644 --- a/test/repair/workflow-utils.test.ts +++ b/test/repair/workflow-utils.test.ts @@ -6,6 +6,7 @@ import test from "node:test"; import { artifactItemNumbers, + automationLimit, countActions, countCommandActions, countRequeueRequired, @@ -15,6 +16,12 @@ import { proposedItemNumbers, } from "../../dist/repair/workflow-utils.js"; +test("workflow utilities expose automation limits", () => { + assert.equal(automationLimit("review_shards.normal_default"), 64); + assert.equal(automationLimit("repair_live_runs.default"), 40); + assert.throws(() => automationLimit("missing.default"), /unknown automation limit/); +}); + test("workflow utilities derive artifact item numbers and action counts", () => { const root = fs.mkdtempSync(path.join(os.tmpdir(), "clawsweeper-workflow-")); write(path.join(root, "artifacts/shard-a/openclaw-openclaw-42.md"), "report\n");