fix: count active workflow runs for worker scheduling

This commit is contained in:
Peter Steinberger 2026-05-05 04:34:50 +01:00
parent e923eac212
commit b033b73435
No known key found for this signature in database
9 changed files with 76 additions and 57 deletions

View File

@ -164,25 +164,25 @@ jobs:
run: |
set -euo pipefail
active_run_count() {
gh run list --repo "${{ github.repository }}" --workflow "$1" --limit 100 --json status 2>/dev/null \
| jq '[.[] | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested")] | length' 2>/dev/null \
gh run list --repo "${{ github.repository }}" --limit 100 --json workflowName,status 2>/dev/null \
| WORKFLOW_NAME="$1" jq '[.[] | select(.workflowName == env.WORKFLOW_NAME) | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested")] | length' 2>/dev/null \
|| printf '0'
}
active_sweep_background_workers() {
local normal_limit hot_limit
normal_limit="$(pnpm --dir clawsweeper run --silent workflow -- limit review_shards.normal_default)"
hot_limit="$(pnpm --dir clawsweeper run --silent workflow -- limit review_shards.hot_intake_default)"
gh run list --repo "${{ github.repository }}" --workflow sweep.yml --limit 100 --json displayTitle,status 2>/dev/null \
| NORMAL_LIMIT="$normal_limit" HOT_LIMIT="$hot_limit" jq '[.[] | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") | if .displayTitle == "Review ClawSweeper items" then (env.NORMAL_LIMIT | tonumber) elif .displayTitle == "Review hot ClawSweeper items" then (env.HOT_LIMIT | tonumber) else 0 end] | add // 0' 2>/dev/null \
gh run list --repo "${{ github.repository }}" --limit 100 --json workflowName,displayTitle,status 2>/dev/null \
| NORMAL_LIMIT="$normal_limit" HOT_LIMIT="$hot_limit" jq '[.[] | select(.workflowName == "ClawSweeper") | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") | if .displayTitle == "Review ClawSweeper items" then (env.NORMAL_LIMIT | tonumber) elif .displayTitle == "Review hot ClawSweeper items" then (env.HOT_LIMIT | tonumber) else 0 end] | add // 0' 2>/dev/null \
|| printf '0'
}
active_sweep_exact_count() {
gh run list --repo "${{ github.repository }}" --workflow sweep.yml --limit 100 --json displayTitle,status 2>/dev/null \
| jq '[.[] | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") | select(.displayTitle | startswith("Review event item "))] | length' 2>/dev/null \
gh run list --repo "${{ github.repository }}" --limit 100 --json workflowName,displayTitle,status 2>/dev/null \
| jq '[.[] | select(.workflowName == "ClawSweeper") | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") | select(.displayTitle | startswith("Review event item "))] | length' 2>/dev/null \
|| printf '0'
}
if [ -z "$PAGE_SIZE" ]; then
active_critical_workers="$(( $(active_run_count repair-cluster-worker.yml) + $(active_sweep_exact_count) ))"
active_critical_workers="$(( $(active_run_count "repair cluster worker") + $(active_sweep_exact_count) ))"
active_background_workers="$(active_sweep_background_workers)"
PAGE_SIZE="$(pnpm --dir clawsweeper run --silent workflow -- worker-limit commit_review --active-critical "$active_critical_workers" --active-background "$active_background_workers")"
fi

View File

@ -605,21 +605,29 @@ jobs:
pnpm --dir clawsweeper run --silent workflow -- worker-limit "$@"
}
active_run_count() {
gh run list --repo "${{ github.repository }}" --workflow "$1" --limit 100 --json status 2>/dev/null \
| jq '[.[] | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested")] | length' 2>/dev/null \
gh run list --repo "${{ github.repository }}" --limit 100 --json workflowName,status 2>/dev/null \
| WORKFLOW_NAME="$1" jq '[.[] | select(.workflowName == env.WORKFLOW_NAME) | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested")] | length' 2>/dev/null \
|| printf '0'
}
active_sweep_exact_count() {
gh run list --repo "${{ github.repository }}" --workflow sweep.yml --limit 100 --json displayTitle,status 2>/dev/null \
| jq '[.[] | select((.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") and (.displayTitle | startswith("Review event item ")))] | length' 2>/dev/null \
gh run list --repo "${{ github.repository }}" --limit 100 --json workflowName,displayTitle,status 2>/dev/null \
| jq '[.[] | select(.workflowName == "ClawSweeper") | select((.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") and (.displayTitle | startswith("Review event item ")))] | length' 2>/dev/null \
|| printf '0'
}
# Print an estimate of the worker slots held by other active normal/hot
# "ClawSweeper" workflow runs. Each matching run is charged its lane's
# configured default shard count; the weights are summed into one integer
# on stdout. Prints 0 when nothing matches or when the pipeline fails.
active_sweep_background_workers() {
local normal_limit hot_limit
# Per-run weights, read through the `limit` helper (defined outside this view).
normal_limit="$(limit review_shards.normal_default)"
hot_limit="$(limit review_shards.hot_intake_default)"
# Fetch up to 100 runs across all workflows and filter in jq:
#   1. skip the run executing this script (databaseId == GITHUB_RUN_ID; the
#      "0" default applies when GITHUB_RUN_ID is unset),
#   2. keep only runs whose workflowName is "ClawSweeper",
#   3. keep only runs in an active status,
#   4. map the run title to NORMAL_LIMIT / HOT_LIMIT (any other title -> 0),
#   5. sum the weights; `add // 0` yields 0 for an empty list.
# stderr is discarded on both commands, and `|| printf '0'` supplies the
# fallback value if the pipeline exits non-zero.
gh run list --repo "${{ github.repository }}" --limit 100 --json databaseId,workflowName,displayTitle,status 2>/dev/null \
| CURRENT_RUN_ID="${GITHUB_RUN_ID:-0}" NORMAL_LIMIT="$normal_limit" HOT_LIMIT="$hot_limit" jq '[.[] | select((.databaseId | tostring) != env.CURRENT_RUN_ID) | select(.workflowName == "ClawSweeper") | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") | if .displayTitle == "Review ClawSweeper items" then (env.NORMAL_LIMIT | tonumber) elif .displayTitle == "Review hot ClawSweeper items" then (env.HOT_LIMIT | tonumber) else 0 end] | add // 0' 2>/dev/null \
|| printf '0'
}
exact_item_shards="$(limit review_shards.exact_item_default)"
normal_active_floor="$(limit review_shards.normal_active_floor)"
hard_shard_cap="$(limit review_shards.hard_cap)"
commit_page_size="$(limit commit_review.page_size_default)"
active_critical_workers="$(( $(active_run_count repair-cluster-worker.yml) + $(active_sweep_exact_count) ))"
active_background_workers="$(( $(active_run_count commit-review.yml) * commit_page_size ))"
active_critical_workers="$(( $(active_run_count "repair cluster worker") + $(active_sweep_exact_count) ))"
active_background_workers="$(( $(active_run_count "ClawSweeper Commit Review") * commit_page_size + $(active_sweep_background_workers) ))"
hot_intake_shards="$(worker_limit hot_intake --active-critical "$active_critical_workers" --active-background "$active_background_workers")"
normal_shards="$(worker_limit normal_review --active-critical "$active_critical_workers" --active-background "$active_background_workers")"
hot_intake="${{ ((github.event_name == 'workflow_dispatch' && github.event.inputs.hot_intake == 'true') || (github.event_name == 'schedule' && (github.event.schedule == '*/5 * * * *' || github.event.schedule == '2/5 * * * *'))) && 'true' || 'false' }}"
@ -1755,7 +1763,7 @@ jobs:
set -euo pipefail
hot_intake_shards="$(pnpm run --silent workflow -- limit review_shards.hot_intake_default)"
normal_shards="$(pnpm run --silent workflow -- limit review_shards.normal_default)"
runs_json="$(gh run list --repo "${{ github.repository }}" --workflow sweep.yml --limit 80 --json displayTitle,status,createdAt)"
runs_json="$(gh run list --repo "${{ github.repository }}" --limit 100 --json workflowName,displayTitle,status,createdAt)"
eval "$(
RUNS_JSON="$runs_json" node <<'NODE'
const runs = JSON.parse(process.env.RUNS_JSON || "[]");
@ -1763,6 +1771,7 @@ jobs:
const active = new Set(["in_progress", "pending", "queued", "waiting", "requested"]);
function recent(title, windowMs) {
return runs.some((run) => {
if (run.workflowName !== "ClawSweeper") return false;
if (run.displayTitle !== title) return false;
if (active.has(String(run.status))) return true;
const createdAt = Date.parse(String(run.createdAt || ""));

View File

@ -24,6 +24,9 @@ checkpoint, and status-only commits are intentionally omitted.
dispatches.
- Made background review lanes yield to active repair and exact-item work to
lower GitHub and Codex rate-limit pressure during busy periods.
- Fixed live worker scheduling to filter GitHub Actions runs through supported
`workflowName` JSON fields instead of silently falling back to zero active
workers when `gh run list --workflow` is unavailable.
- Retried Codex edit workers after TPM/rate-limit exits and collapsed JSONL failure transcripts into concise repair status reasons.
- Added deterministic merged closing-PR provenance to issue close reports and
public close comments when GitHub exposes a high-confidence closing PR.

View File

@ -76,7 +76,8 @@ The scheduler does this for background lanes:
1. start with `workers.max`
2. subtract active priority work, currently repair workers plus exact-item sweep
runs
3. subtract active background work already known to the workflow
3. subtract active background work already known to the workflow, including
commit-review pages and other active normal/hot sweep runs
4. reserve `workers.reserve_for_interactive`
5. cap the result at the lane's derived quiet-system ceiling
6. return at least 1 so an enabled lane can still make slow progress

View File

@ -310,10 +310,11 @@ count, inspect active review shard jobs on the current workflow run.
The live scheduler estimate happens before planning and is intentionally coarse:
it counts active repair-cluster workflow runs as priority work, active exact-item
sweep runs as priority work, and active commit-review workflow runs as
background work weighted by the configured commit page size. GitHub Actions can
start or finish jobs after that estimate, so the scheduler is a throttle, not a
distributed lock.
sweep runs as priority work, active commit-review workflow runs as background
work weighted by the configured commit page size, and other active normal/hot
sweep runs as background work weighted by their quiet-system ceilings. GitHub
Actions can start or finish jobs after that estimate, so the scheduler is a
throttle, not a distributed lock.
Planning status intentionally does not run `pnpm run reconcile`. Reconciliation
can scan many live GitHub pages and has delayed review shard startup. The
@ -409,8 +410,9 @@ schedule remains the fallback if dispatch is delayed.
Useful commands:
```bash
gh run list --repo openclaw/clawsweeper --workflow sweep.yml --limit 20 \
--json databaseId,displayTitle,event,status,conclusion,createdAt,headSha,url
gh run list --repo openclaw/clawsweeper --limit 100 \
--json databaseId,workflowName,displayTitle,event,status,conclusion,createdAt,headSha,url \
--jq '.[] | select(.workflowName == "ClawSweeper")'
gh run view <run-id> --repo openclaw/clawsweeper --json jobs \
--jq '[.jobs[] | select(.name | startswith("Review shard")) | select(.status=="in_progress")] | length'

View File

@ -51,6 +51,7 @@ const retiredPatterns: { label: string; pattern: RegExp }[] = [
{ label: "retired ClawSweeper read token", pattern: /\bCLAWSWEEPER_READ_GH_TOKEN\b/ },
{ label: "retired repair Codex token", pattern: /\bCLAWSWEEPER_CODEX_GH_TOKEN\b/ },
{ label: "retired review token", pattern: /\bCLAWSWEEPER_REVIEW_GH_TOKEN\b/ },
{ label: "unsupported gh run list workflow flag", pattern: /\bgh run list\b.*--workflow\b/ },
];
type Finding = {

View File

@ -215,18 +215,22 @@ function assertGateOpenIfNeeded(mode: string) {
}
function listClusterRuns() {
return ghJson([
const workflowName = workflowDisplayName(workflow);
return ghJson<LooseRecord[]>([
"run",
"list",
"--repo",
repo,
"--workflow",
workflow,
"--limit",
"50",
"200",
"--json",
"databaseId,headSha,status,conclusion,createdAt,url",
]);
"databaseId,workflowName,headSha,status,conclusion,createdAt,url",
]).filter((run: LooseRecord) => run.workflowName === workflowName);
}
/**
 * Translate a workflow file name into the display name that
 * `gh run list --json workflowName` reports for its runs.
 *
 * Filtering runs by `workflowName` only works with the display name, so a
 * file name that is left untranslated silently matches no runs.
 *
 * @param workflowNameOrFile - A known workflow file name (e.g.
 *   `repair-cluster-worker.yml`) or an already-resolved display name.
 * @returns The display name for a known workflow file; any other value is
 *   returned unchanged so callers may pass a display name directly.
 */
function workflowDisplayName(workflowNameOrFile: string): string {
  switch (workflowNameOrFile) {
    case "repair-cluster-worker.yml":
      return "repair cluster worker";
    case "sweep.yml":
      return "ClawSweeper";
    case "commit-review.yml":
      return "ClawSweeper Commit Review";
    default:
      // Unknown input: assume it is already a display name.
      return workflowNameOrFile;
  }
}
function readGate(name: string) {

View File

@ -384,18 +384,22 @@ function selfHealLedgerPath() {
}
function listClusterRuns() {
return ghJson([
const workflowName = workflowDisplayName(workflow);
return ghJson<LooseRecord[]>([
"run",
"list",
"--repo",
repo,
"--workflow",
workflow,
"--limit",
"50",
"200",
"--json",
"databaseId,displayTitle,headSha,status,conclusion,createdAt,updatedAt,url",
]);
"databaseId,workflowName,displayTitle,headSha,status,conclusion,createdAt,updatedAt,url",
]).filter((run: LooseRecord) => run.workflowName === workflowName);
}
/**
 * Resolve the display name GitHub Actions assigns to a workflow, given either
 * its file name or the display name itself.
 *
 * Run listings are filtered on the `workflowName` JSON field, which holds the
 * display name; an unmapped file name would therefore filter out every run.
 *
 * @param workflowNameOrFile - Workflow file name such as
 *   `repair-cluster-worker.yml`, or a display name.
 * @returns The mapped display name for known workflow files, otherwise the
 *   input unchanged.
 */
function workflowDisplayName(workflowNameOrFile: string): string {
  switch (workflowNameOrFile) {
    case "repair-cluster-worker.yml":
      return "repair cluster worker";
    case "sweep.yml":
      return "ClawSweeper";
    case "commit-review.yml":
      return "ClawSweeper Commit Review";
    default:
      // Not a known file name: pass through as-is.
      return workflowNameOrFile;
  }
}
function readExecuteGate() {

View File

@ -281,30 +281,20 @@ function readOpenClawSweeperPrClusters() {
function readActiveClusterRuns() {
const repo = process.env.CLAWSWEEPER_REPO ?? "openclaw/clawsweeper";
const statuses = ["queued", "in_progress", "waiting", "requested", "pending"];
const runs: LooseRecord[] = [];
for (const status of statuses) {
try {
runs.push(
...ghJson([
"run",
"list",
"--repo",
repo,
"--workflow",
REPAIR_CLUSTER_WORKFLOW,
"--status",
status,
"--limit",
"100",
"--json",
"databaseId,status,conclusion,createdAt,updatedAt,url,displayTitle",
]),
);
} catch {
// Some statuses are not accepted on older gh versions; active PR detection is still useful.
}
}
const statuses = new Set(["queued", "in_progress", "waiting", "requested", "pending"]);
const workflowName = workflowDisplayName(REPAIR_CLUSTER_WORKFLOW);
const runs = ghJson<LooseRecord[]>([
"run",
"list",
"--repo",
repo,
"--limit",
"200",
"--json",
"databaseId,workflowName,status,conclusion,createdAt,updatedAt,url,displayTitle",
]).filter((run: LooseRecord) => {
return run.workflowName === workflowName && statuses.has(String(run.status));
});
const byId = new Map();
for (const run of runs) byId.set(String(run.databaseId), run);
return [...byId.values()].sort((left: JsonValue, right: JsonValue) =>
@ -312,6 +302,11 @@ function readActiveClusterRuns() {
);
}
/**
 * Map a workflow file name to the display name found in the `workflowName`
 * field of `gh run list --json` output.
 *
 * Known workflow files are translated; anything else is assumed to already be
 * a display name and is returned untouched. Without the translation, a
 * `workflowName` comparison against a file name never matches.
 *
 * @param workflow - Workflow file name (e.g. `repair-cluster-worker.yml`) or
 *   display name.
 * @returns The display name to compare against `run.workflowName`.
 */
function workflowDisplayName(workflow: string): string {
  switch (workflow) {
    case "repair-cluster-worker.yml":
      return "repair cluster worker";
    case "sweep.yml":
      return "ClawSweeper";
    case "commit-review.yml":
      return "ClawSweeper Commit Review";
    default:
      return workflow;
  }
}
function publicRow(row: LooseRecord) {
return Object.fromEntries(
Object.entries(row).filter(([, value]: JsonValue[]) => value !== undefined),