From b033b7343591359fedef5f4b04f4d48baa25dda6 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 5 May 2026 04:34:50 +0100 Subject: [PATCH] fix: count active workflow runs for worker scheduling --- .github/workflows/commit-review.yml | 14 +++++----- .github/workflows/sweep.yml | 23 ++++++++++----- CHANGELOG.md | 3 ++ docs/limits.md | 3 +- docs/scheduler.md | 14 ++++++---- scripts/check-active-surface.ts | 1 + src/repair/requeue-job.ts | 16 +++++++---- src/repair/self-heal-failed-runs.ts | 16 +++++++---- src/repair/sweep-openclaw-jobs.ts | 43 +++++++++++++---------------- 9 files changed, 76 insertions(+), 57 deletions(-) diff --git a/.github/workflows/commit-review.yml b/.github/workflows/commit-review.yml index d160c916da..12b2edb43d 100644 --- a/.github/workflows/commit-review.yml +++ b/.github/workflows/commit-review.yml @@ -164,25 +164,25 @@ jobs: run: | set -euo pipefail active_run_count() { - gh run list --repo "${{ github.repository }}" --workflow "$1" --limit 100 --json status 2>/dev/null \ - | jq '[.[] | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested")] | length' 2>/dev/null \ + gh run list --repo "${{ github.repository }}" --limit 100 --json workflowName,status 2>/dev/null \ + | WORKFLOW_NAME="$1" jq '[.[] | select(.workflowName == env.WORKFLOW_NAME) | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested")] | length' 2>/dev/null \ || printf '0' } active_sweep_background_workers() { local normal_limit hot_limit normal_limit="$(pnpm --dir clawsweeper run --silent workflow -- limit review_shards.normal_default)" hot_limit="$(pnpm --dir clawsweeper run --silent workflow -- limit review_shards.hot_intake_default)" - gh run list --repo "${{ github.repository }}" --workflow sweep.yml --limit 100 --json displayTitle,status 2>/dev/null \ - | NORMAL_LIMIT="$normal_limit" HOT_LIMIT="$hot_limit" jq '[.[] | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") | if .displayTitle == "Review ClawSweeper items" then (env.NORMAL_LIMIT | tonumber) elif .displayTitle == "Review hot ClawSweeper items" then (env.HOT_LIMIT | tonumber) else 0 end] | add // 0' 2>/dev/null \ + gh run list --repo "${{ github.repository }}" --limit 100 --json workflowName,displayTitle,status 2>/dev/null \ + | NORMAL_LIMIT="$normal_limit" HOT_LIMIT="$hot_limit" jq '[.[] | select(.workflowName == "ClawSweeper") | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") | if .displayTitle == "Review ClawSweeper items" then (env.NORMAL_LIMIT | tonumber) elif .displayTitle == "Review hot ClawSweeper items" then (env.HOT_LIMIT | tonumber) else 0 end] | add // 0' 2>/dev/null \ || printf '0' } active_sweep_exact_count() { - gh run list --repo "${{ github.repository }}" --workflow sweep.yml --limit 100 --json displayTitle,status 2>/dev/null \ - | jq '[.[] | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") | select(.displayTitle | startswith("Review event item "))] | length' 2>/dev/null \ + gh run list --repo "${{ github.repository }}" --limit 100 --json workflowName,displayTitle,status 2>/dev/null \ + | jq '[.[] | select(.workflowName == "ClawSweeper") | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") | select(.displayTitle | startswith("Review event item "))] | length' 2>/dev/null \ || printf '0' } if [ -z "$PAGE_SIZE" ]; then - active_critical_workers="$(( $(active_run_count repair-cluster-worker.yml) + $(active_sweep_exact_count) ))" + active_critical_workers="$(( $(active_run_count "repair cluster worker") + $(active_sweep_exact_count) ))" active_background_workers="$(active_sweep_background_workers)" PAGE_SIZE="$(pnpm --dir clawsweeper run --silent workflow -- worker-limit commit_review --active-critical "$active_critical_workers" --active-background "$active_background_workers")" fi diff --git a/.github/workflows/sweep.yml b/.github/workflows/sweep.yml index 65d3deb2e1..eaf9b73e49 100644 --- a/.github/workflows/sweep.yml +++ b/.github/workflows/sweep.yml @@ -605,21 +605,29 @@ jobs: pnpm --dir clawsweeper run --silent workflow -- worker-limit "$@" } active_run_count() { - gh run list --repo "${{ github.repository }}" --workflow "$1" --limit 100 --json status 2>/dev/null \ - | jq '[.[] | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested")] | length' 2>/dev/null \ + gh run list --repo "${{ github.repository }}" --limit 100 --json workflowName,status 2>/dev/null \ + | WORKFLOW_NAME="$1" jq '[.[] | select(.workflowName == env.WORKFLOW_NAME) | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested")] | length' 2>/dev/null \ || printf '0' } active_sweep_exact_count() { - gh run list --repo "${{ github.repository }}" --workflow sweep.yml --limit 100 --json displayTitle,status 2>/dev/null \ - | jq '[.[] | select((.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") and (.displayTitle | startswith("Review event item ")))] | length' 2>/dev/null \ + gh run list --repo "${{ github.repository }}" --limit 100 --json workflowName,displayTitle,status 2>/dev/null \ + | jq '[.[] | select(.workflowName == "ClawSweeper") | select((.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") and (.displayTitle | startswith("Review event item ")))] | length' 2>/dev/null \ + || printf '0' + } + active_sweep_background_workers() { + local normal_limit hot_limit + normal_limit="$(limit review_shards.normal_default)" + hot_limit="$(limit review_shards.hot_intake_default)" + gh run list --repo "${{ github.repository }}" --limit 100 --json databaseId,workflowName,displayTitle,status 2>/dev/null \ + | CURRENT_RUN_ID="${GITHUB_RUN_ID:-0}" NORMAL_LIMIT="$normal_limit" HOT_LIMIT="$hot_limit" jq '[.[] | select((.databaseId | tostring) != env.CURRENT_RUN_ID) | select(.workflowName == "ClawSweeper") | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") | if .displayTitle == "Review ClawSweeper items" then (env.NORMAL_LIMIT | tonumber) elif .displayTitle == "Review hot ClawSweeper items" then (env.HOT_LIMIT | tonumber) else 0 end] | add // 0' 2>/dev/null \ || printf '0' } exact_item_shards="$(limit review_shards.exact_item_default)" normal_active_floor="$(limit review_shards.normal_active_floor)" hard_shard_cap="$(limit review_shards.hard_cap)" commit_page_size="$(limit commit_review.page_size_default)" - active_critical_workers="$(( $(active_run_count repair-cluster-worker.yml) + $(active_sweep_exact_count) ))" - active_background_workers="$(( $(active_run_count commit-review.yml) * commit_page_size ))" + active_critical_workers="$(( $(active_run_count "repair cluster worker") + $(active_sweep_exact_count) ))" + active_background_workers="$(( $(active_run_count "ClawSweeper Commit Review") * commit_page_size + $(active_sweep_background_workers) ))" hot_intake_shards="$(worker_limit hot_intake --active-critical "$active_critical_workers" --active-background "$active_background_workers")" normal_shards="$(worker_limit normal_review --active-critical "$active_critical_workers" --active-background "$active_background_workers")" hot_intake="${{ ((github.event_name == 'workflow_dispatch' && github.event.inputs.hot_intake == 'true') || (github.event_name == 'schedule' && (github.event.schedule == '*/5 * * * *' || github.event.schedule == '2/5 * * * *'))) && 'true' || 'false' }}" @@ -1755,7 +1763,7 @@ jobs: set -euo pipefail hot_intake_shards="$(pnpm run --silent workflow -- limit review_shards.hot_intake_default)" normal_shards="$(pnpm run --silent workflow -- limit review_shards.normal_default)" - runs_json="$(gh run list --repo "${{ github.repository }}" --workflow sweep.yml --limit 80 --json displayTitle,status,createdAt)" + runs_json="$(gh run list --repo "${{ github.repository }}" --limit 100 --json workflowName,displayTitle,status,createdAt)" eval "$( RUNS_JSON="$runs_json" node <<'NODE' const runs = JSON.parse(process.env.RUNS_JSON || "[]"); @@ -1763,6 +1771,7 @@ jobs: const active = new Set(["in_progress", "pending", "queued", "waiting", "requested"]); function recent(title, windowMs) { return runs.some((run) => { + if (run.workflowName !== "ClawSweeper") return false; if (run.displayTitle !== title) return false; if (active.has(String(run.status))) return true; const createdAt = Date.parse(String(run.createdAt || "")); diff --git a/CHANGELOG.md b/CHANGELOG.md index a49d9ddcbc..52f65d4a6a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,9 @@ checkpoint, and status-only commits are intentionally omitted. dispatches. - Made background review lanes yield to active repair and exact-item work to lower GitHub and Codex rate-limit pressure during busy periods. +- Fixed live worker scheduling to filter GitHub Actions runs through supported + `workflowName` JSON fields instead of silently falling back to zero active + workers when `gh run list --workflow` is unavailable. - Retried Codex edit workers after TPM/rate-limit exits and collapsed JSONL failure transcripts into concise repair status reasons. - Added deterministic merged closing-PR provenance to issue close reports and public close comments when GitHub exposes a high-confidence closing PR. diff --git a/docs/limits.md b/docs/limits.md index c69327f2cd..3d2ec14ec4 100644 --- a/docs/limits.md +++ b/docs/limits.md @@ -76,7 +76,8 @@ The scheduler does this for background lanes: 1. start with `workers.max` 2. subtract active priority work, currently repair workers plus exact-item sweep runs -3. subtract active background work already known to the workflow +3. subtract active background work already known to the workflow, including + commit-review pages and other active normal/hot sweep runs 4. reserve `workers.reserve_for_interactive` 5. cap the result at the lane's derived quiet-system ceiling 6. return at least 1 so an enabled lane can still make slow progress diff --git a/docs/scheduler.md b/docs/scheduler.md index 7867e23bcc..dc074c791c 100644 --- a/docs/scheduler.md +++ b/docs/scheduler.md @@ -310,10 +310,11 @@ count, inspect active review shard jobs on the current workflow run. The live scheduler estimate happens before planning and is intentionally coarse: it counts active repair-cluster workflow runs as priority work, active exact-item -sweep runs as priority work, and active commit-review workflow runs as -background work weighted by the configured commit page size. GitHub Actions can -start or finish jobs after that estimate, so the scheduler is a throttle, not a -distributed lock. +sweep runs as priority work, active commit-review workflow runs as background +work weighted by the configured commit page size, and other active normal/hot +sweep runs as background work weighted by their quiet-system ceilings. GitHub +Actions can start or finish jobs after that estimate, so the scheduler is a +throttle, not a distributed lock. Planning status intentionally does not run `pnpm run reconcile`. Reconciliation can scan many live GitHub pages and has delayed review shard startup. The @@ -409,8 +410,9 @@ schedule remains the fallback if dispatch is delayed. Useful commands: ```bash -gh run list --repo openclaw/clawsweeper --workflow sweep.yml --limit 20 \ - --json databaseId,displayTitle,event,status,conclusion,createdAt,headSha,url +gh run list --repo openclaw/clawsweeper --limit 100 \ + --json databaseId,workflowName,displayTitle,event,status,conclusion,createdAt,headSha,url \ + --jq '.[] | select(.workflowName == "ClawSweeper")' gh run view --repo openclaw/clawsweeper --json jobs \ --jq '[.jobs[] | select(.name | startswith("Review shard")) | select(.status=="in_progress")] | length' diff --git a/scripts/check-active-surface.ts b/scripts/check-active-surface.ts index 91c34958ec..57817dace3 100644 --- a/scripts/check-active-surface.ts +++ b/scripts/check-active-surface.ts @@ -51,6 +51,7 @@ const retiredPatterns: { label: string; pattern: RegExp }[] = [ { label: "retired ClawSweeper read token", pattern: /\bCLAWSWEEPER_READ_GH_TOKEN\b/ }, { label: "retired repair Codex token", pattern: /\bCLAWSWEEPER_CODEX_GH_TOKEN\b/ }, { label: "retired review token", pattern: /\bCLAWSWEEPER_REVIEW_GH_TOKEN\b/ }, + { label: "unsupported gh run list workflow flag", pattern: /\bgh run list\b.*--workflow\b/ }, ]; type Finding = { diff --git a/src/repair/requeue-job.ts b/src/repair/requeue-job.ts index 20ef1ec595..1c21628ffa 100644 --- a/src/repair/requeue-job.ts +++ b/src/repair/requeue-job.ts @@ -215,18 +215,22 @@ function assertGateOpenIfNeeded(mode: string) { } function listClusterRuns() { - return ghJson([ + const workflowName = workflowDisplayName(workflow); + return ghJson([ "run", "list", "--repo", repo, - "--workflow", - workflow, "--limit", - "50", + "200", "--json", - "databaseId,headSha,status,conclusion,createdAt,url", - ]); + "databaseId,workflowName,headSha,status,conclusion,createdAt,url", + ]).filter((run: LooseRecord) => run.workflowName === workflowName); +} + +function workflowDisplayName(workflowNameOrFile: string): string { + if (workflowNameOrFile === "repair-cluster-worker.yml") return "repair cluster worker"; + return workflowNameOrFile; } function readGate(name: string) { diff --git a/src/repair/self-heal-failed-runs.ts b/src/repair/self-heal-failed-runs.ts index dba9049ec1..b8cd9b8a09 100644 --- a/src/repair/self-heal-failed-runs.ts +++ b/src/repair/self-heal-failed-runs.ts @@ -384,18 +384,22 @@ function selfHealLedgerPath() { } function listClusterRuns() { - return ghJson([ + const workflowName = workflowDisplayName(workflow); + return ghJson([ "run", "list", "--repo", repo, - "--workflow", - workflow, "--limit", - "50", + "200", "--json", - "databaseId,displayTitle,headSha,status,conclusion,createdAt,updatedAt,url", - ]); + "databaseId,workflowName,displayTitle,headSha,status,conclusion,createdAt,updatedAt,url", + ]).filter((run: LooseRecord) => run.workflowName === workflowName); +} + +function workflowDisplayName(workflowNameOrFile: string): string { + if (workflowNameOrFile === "repair-cluster-worker.yml") return "repair cluster worker"; + return workflowNameOrFile; } function readExecuteGate() { diff --git a/src/repair/sweep-openclaw-jobs.ts b/src/repair/sweep-openclaw-jobs.ts index 4bf55f461d..85b5c368d1 100644 --- a/src/repair/sweep-openclaw-jobs.ts +++ b/src/repair/sweep-openclaw-jobs.ts @@ -281,30 +281,20 @@ function readOpenClawSweeperPrClusters() { function readActiveClusterRuns() { const repo = process.env.CLAWSWEEPER_REPO ?? "openclaw/clawsweeper"; - const statuses = ["queued", "in_progress", "waiting", "requested", "pending"]; - const runs: LooseRecord[] = []; - for (const status of statuses) { - try { - runs.push( - ...ghJson([ - "run", - "list", - "--repo", - repo, - "--workflow", - REPAIR_CLUSTER_WORKFLOW, - "--status", - status, - "--limit", - "100", - "--json", - "databaseId,status,conclusion,createdAt,updatedAt,url,displayTitle", - ]), - ); - } catch { - // Some statuses are not accepted on older gh versions; active PR detection is still useful. - } - } + const statuses = new Set(["queued", "in_progress", "waiting", "requested", "pending"]); + const workflowName = workflowDisplayName(REPAIR_CLUSTER_WORKFLOW); + const runs = ghJson([ + "run", + "list", + "--repo", + repo, + "--limit", + "200", + "--json", + "databaseId,workflowName,status,conclusion,createdAt,updatedAt,url,displayTitle", + ]).filter((run: LooseRecord) => { + return run.workflowName === workflowName && statuses.has(String(run.status)); + }); const byId = new Map(); for (const run of runs) byId.set(String(run.databaseId), run); return [...byId.values()].sort((left: JsonValue, right: JsonValue) => @@ -312,6 +302,11 @@ function readActiveClusterRuns() { ); } +function workflowDisplayName(workflow: string): string { + if (workflow === "repair-cluster-worker.yml") return "repair cluster worker"; + return workflow; +} + function publicRow(row: LooseRecord) { return Object.fromEntries( Object.entries(row).filter(([, value]: JsonValue[]) => value !== undefined),