fix: count active workflow runs for worker scheduling
This commit is contained in:
parent
e923eac212
commit
b033b73435
14
.github/workflows/commit-review.yml
vendored
14
.github/workflows/commit-review.yml
vendored
@ -164,25 +164,25 @@ jobs:
|
||||
run: |
|
||||
set -euo pipefail
|
||||
active_run_count() {
|
||||
gh run list --repo "${{ github.repository }}" --workflow "$1" --limit 100 --json status 2>/dev/null \
|
||||
| jq '[.[] | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested")] | length' 2>/dev/null \
|
||||
gh run list --repo "${{ github.repository }}" --limit 100 --json workflowName,status 2>/dev/null \
|
||||
| WORKFLOW_NAME="$1" jq '[.[] | select(.workflowName == env.WORKFLOW_NAME) | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested")] | length' 2>/dev/null \
|
||||
|| printf '0'
|
||||
}
|
||||
active_sweep_background_workers() {
|
||||
local normal_limit hot_limit
|
||||
normal_limit="$(pnpm --dir clawsweeper run --silent workflow -- limit review_shards.normal_default)"
|
||||
hot_limit="$(pnpm --dir clawsweeper run --silent workflow -- limit review_shards.hot_intake_default)"
|
||||
gh run list --repo "${{ github.repository }}" --workflow sweep.yml --limit 100 --json displayTitle,status 2>/dev/null \
|
||||
| NORMAL_LIMIT="$normal_limit" HOT_LIMIT="$hot_limit" jq '[.[] | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") | if .displayTitle == "Review ClawSweeper items" then (env.NORMAL_LIMIT | tonumber) elif .displayTitle == "Review hot ClawSweeper items" then (env.HOT_LIMIT | tonumber) else 0 end] | add // 0' 2>/dev/null \
|
||||
gh run list --repo "${{ github.repository }}" --limit 100 --json workflowName,displayTitle,status 2>/dev/null \
|
||||
| NORMAL_LIMIT="$normal_limit" HOT_LIMIT="$hot_limit" jq '[.[] | select(.workflowName == "ClawSweeper") | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") | if .displayTitle == "Review ClawSweeper items" then (env.NORMAL_LIMIT | tonumber) elif .displayTitle == "Review hot ClawSweeper items" then (env.HOT_LIMIT | tonumber) else 0 end] | add // 0' 2>/dev/null \
|
||||
|| printf '0'
|
||||
}
|
||||
active_sweep_exact_count() {
|
||||
gh run list --repo "${{ github.repository }}" --workflow sweep.yml --limit 100 --json displayTitle,status 2>/dev/null \
|
||||
| jq '[.[] | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") | select(.displayTitle | startswith("Review event item "))] | length' 2>/dev/null \
|
||||
gh run list --repo "${{ github.repository }}" --limit 100 --json workflowName,displayTitle,status 2>/dev/null \
|
||||
| jq '[.[] | select(.workflowName == "ClawSweeper") | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") | select(.displayTitle | startswith("Review event item "))] | length' 2>/dev/null \
|
||||
|| printf '0'
|
||||
}
|
||||
if [ -z "$PAGE_SIZE" ]; then
|
||||
active_critical_workers="$(( $(active_run_count repair-cluster-worker.yml) + $(active_sweep_exact_count) ))"
|
||||
active_critical_workers="$(( $(active_run_count "repair cluster worker") + $(active_sweep_exact_count) ))"
|
||||
active_background_workers="$(active_sweep_background_workers)"
|
||||
PAGE_SIZE="$(pnpm --dir clawsweeper run --silent workflow -- worker-limit commit_review --active-critical "$active_critical_workers" --active-background "$active_background_workers")"
|
||||
fi
|
||||
|
||||
23
.github/workflows/sweep.yml
vendored
23
.github/workflows/sweep.yml
vendored
@ -605,21 +605,29 @@ jobs:
|
||||
pnpm --dir clawsweeper run --silent workflow -- worker-limit "$@"
|
||||
}
|
||||
active_run_count() {
|
||||
gh run list --repo "${{ github.repository }}" --workflow "$1" --limit 100 --json status 2>/dev/null \
|
||||
| jq '[.[] | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested")] | length' 2>/dev/null \
|
||||
gh run list --repo "${{ github.repository }}" --limit 100 --json workflowName,status 2>/dev/null \
|
||||
| WORKFLOW_NAME="$1" jq '[.[] | select(.workflowName == env.WORKFLOW_NAME) | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested")] | length' 2>/dev/null \
|
||||
|| printf '0'
|
||||
}
|
||||
active_sweep_exact_count() {
|
||||
gh run list --repo "${{ github.repository }}" --workflow sweep.yml --limit 100 --json displayTitle,status 2>/dev/null \
|
||||
| jq '[.[] | select((.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") and (.displayTitle | startswith("Review event item ")))] | length' 2>/dev/null \
|
||||
gh run list --repo "${{ github.repository }}" --limit 100 --json workflowName,displayTitle,status 2>/dev/null \
|
||||
| jq '[.[] | select(.workflowName == "ClawSweeper") | select((.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") and (.displayTitle | startswith("Review event item ")))] | length' 2>/dev/null \
|
||||
|| printf '0'
|
||||
}
|
||||
active_sweep_background_workers() {
|
||||
local normal_limit hot_limit
|
||||
normal_limit="$(limit review_shards.normal_default)"
|
||||
hot_limit="$(limit review_shards.hot_intake_default)"
|
||||
gh run list --repo "${{ github.repository }}" --limit 100 --json databaseId,workflowName,displayTitle,status 2>/dev/null \
|
||||
| CURRENT_RUN_ID="${GITHUB_RUN_ID:-0}" NORMAL_LIMIT="$normal_limit" HOT_LIMIT="$hot_limit" jq '[.[] | select((.databaseId | tostring) != env.CURRENT_RUN_ID) | select(.workflowName == "ClawSweeper") | select(.status == "in_progress" or .status == "pending" or .status == "queued" or .status == "waiting" or .status == "requested") | if .displayTitle == "Review ClawSweeper items" then (env.NORMAL_LIMIT | tonumber) elif .displayTitle == "Review hot ClawSweeper items" then (env.HOT_LIMIT | tonumber) else 0 end] | add // 0' 2>/dev/null \
|
||||
|| printf '0'
|
||||
}
|
||||
exact_item_shards="$(limit review_shards.exact_item_default)"
|
||||
normal_active_floor="$(limit review_shards.normal_active_floor)"
|
||||
hard_shard_cap="$(limit review_shards.hard_cap)"
|
||||
commit_page_size="$(limit commit_review.page_size_default)"
|
||||
active_critical_workers="$(( $(active_run_count repair-cluster-worker.yml) + $(active_sweep_exact_count) ))"
|
||||
active_background_workers="$(( $(active_run_count commit-review.yml) * commit_page_size ))"
|
||||
active_critical_workers="$(( $(active_run_count "repair cluster worker") + $(active_sweep_exact_count) ))"
|
||||
active_background_workers="$(( $(active_run_count "ClawSweeper Commit Review") * commit_page_size + $(active_sweep_background_workers) ))"
|
||||
hot_intake_shards="$(worker_limit hot_intake --active-critical "$active_critical_workers" --active-background "$active_background_workers")"
|
||||
normal_shards="$(worker_limit normal_review --active-critical "$active_critical_workers" --active-background "$active_background_workers")"
|
||||
hot_intake="${{ ((github.event_name == 'workflow_dispatch' && github.event.inputs.hot_intake == 'true') || (github.event_name == 'schedule' && (github.event.schedule == '*/5 * * * *' || github.event.schedule == '2/5 * * * *'))) && 'true' || 'false' }}"
|
||||
@ -1755,7 +1763,7 @@ jobs:
|
||||
set -euo pipefail
|
||||
hot_intake_shards="$(pnpm run --silent workflow -- limit review_shards.hot_intake_default)"
|
||||
normal_shards="$(pnpm run --silent workflow -- limit review_shards.normal_default)"
|
||||
runs_json="$(gh run list --repo "${{ github.repository }}" --workflow sweep.yml --limit 80 --json displayTitle,status,createdAt)"
|
||||
runs_json="$(gh run list --repo "${{ github.repository }}" --limit 100 --json workflowName,displayTitle,status,createdAt)"
|
||||
eval "$(
|
||||
RUNS_JSON="$runs_json" node <<'NODE'
|
||||
const runs = JSON.parse(process.env.RUNS_JSON || "[]");
|
||||
@ -1763,6 +1771,7 @@ jobs:
|
||||
const active = new Set(["in_progress", "pending", "queued", "waiting", "requested"]);
|
||||
function recent(title, windowMs) {
|
||||
return runs.some((run) => {
|
||||
if (run.workflowName !== "ClawSweeper") return false;
|
||||
if (run.displayTitle !== title) return false;
|
||||
if (active.has(String(run.status))) return true;
|
||||
const createdAt = Date.parse(String(run.createdAt || ""));
|
||||
|
||||
@ -24,6 +24,9 @@ checkpoint, and status-only commits are intentionally omitted.
|
||||
dispatches.
|
||||
- Made background review lanes yield to active repair and exact-item work to
|
||||
lower GitHub and Codex rate-limit pressure during busy periods.
|
||||
- Fixed live worker scheduling to filter GitHub Actions runs through supported
|
||||
`workflowName` JSON fields instead of silently falling back to zero active
|
||||
workers when `gh run list --workflow` is unavailable.
|
||||
- Retried Codex edit workers after TPM/rate-limit exits and collapsed JSONL failure transcripts into concise repair status reasons.
|
||||
- Added deterministic merged closing-PR provenance to issue close reports and
|
||||
public close comments when GitHub exposes a high-confidence closing PR.
|
||||
|
||||
@ -76,7 +76,8 @@ The scheduler does this for background lanes:
|
||||
1. start with `workers.max`
|
||||
2. subtract active priority work, currently repair workers plus exact-item sweep
|
||||
runs
|
||||
3. subtract active background work already known to the workflow
|
||||
3. subtract active background work already known to the workflow, including
|
||||
commit-review pages and other active normal/hot sweep runs
|
||||
4. reserve `workers.reserve_for_interactive`
|
||||
5. cap the result at the lane's derived quiet-system ceiling
|
||||
6. return at least 1 so an enabled lane can still make slow progress
|
||||
|
||||
@ -310,10 +310,11 @@ count, inspect active review shard jobs on the current workflow run.
|
||||
|
||||
The live scheduler estimate happens before planning and is intentionally coarse:
|
||||
it counts active repair-cluster workflow runs as priority work, active exact-item
|
||||
sweep runs as priority work, and active commit-review workflow runs as
|
||||
background work weighted by the configured commit page size. GitHub Actions can
|
||||
start or finish jobs after that estimate, so the scheduler is a throttle, not a
|
||||
distributed lock.
|
||||
sweep runs as priority work, active commit-review workflow runs as background
|
||||
work weighted by the configured commit page size, and other active normal/hot
|
||||
sweep runs as background work weighted by their quiet-system ceilings. GitHub
|
||||
Actions can start or finish jobs after that estimate, so the scheduler is a
|
||||
throttle, not a distributed lock.
|
||||
|
||||
Planning status intentionally does not run `pnpm run reconcile`. Reconciliation
|
||||
can scan many live GitHub pages and has delayed review shard startup. The
|
||||
@ -409,8 +410,9 @@ schedule remains the fallback if dispatch is delayed.
|
||||
Useful commands:
|
||||
|
||||
```bash
|
||||
gh run list --repo openclaw/clawsweeper --workflow sweep.yml --limit 20 \
|
||||
--json databaseId,displayTitle,event,status,conclusion,createdAt,headSha,url
|
||||
gh run list --repo openclaw/clawsweeper --limit 100 \
|
||||
--json databaseId,workflowName,displayTitle,event,status,conclusion,createdAt,headSha,url \
|
||||
--jq '.[] | select(.workflowName == "ClawSweeper")'
|
||||
|
||||
gh run view <run-id> --repo openclaw/clawsweeper --json jobs \
|
||||
--jq '[.jobs[] | select(.name | startswith("Review shard")) | select(.status=="in_progress")] | length'
|
||||
|
||||
@ -51,6 +51,7 @@ const retiredPatterns: { label: string; pattern: RegExp }[] = [
|
||||
{ label: "retired ClawSweeper read token", pattern: /\bCLAWSWEEPER_READ_GH_TOKEN\b/ },
|
||||
{ label: "retired repair Codex token", pattern: /\bCLAWSWEEPER_CODEX_GH_TOKEN\b/ },
|
||||
{ label: "retired review token", pattern: /\bCLAWSWEEPER_REVIEW_GH_TOKEN\b/ },
|
||||
{ label: "unsupported gh run list workflow flag", pattern: /\bgh run list\b.*--workflow\b/ },
|
||||
];
|
||||
|
||||
type Finding = {
|
||||
|
||||
@ -215,18 +215,22 @@ function assertGateOpenIfNeeded(mode: string) {
|
||||
}
|
||||
|
||||
function listClusterRuns() {
|
||||
return ghJson([
|
||||
const workflowName = workflowDisplayName(workflow);
|
||||
return ghJson<LooseRecord[]>([
|
||||
"run",
|
||||
"list",
|
||||
"--repo",
|
||||
repo,
|
||||
"--workflow",
|
||||
workflow,
|
||||
"--limit",
|
||||
"50",
|
||||
"200",
|
||||
"--json",
|
||||
"databaseId,headSha,status,conclusion,createdAt,url",
|
||||
]);
|
||||
"databaseId,workflowName,headSha,status,conclusion,createdAt,url",
|
||||
]).filter((run: LooseRecord) => run.workflowName === workflowName);
|
||||
}
|
||||
|
||||
function workflowDisplayName(workflowNameOrFile: string): string {
|
||||
if (workflowNameOrFile === "repair-cluster-worker.yml") return "repair cluster worker";
|
||||
return workflowNameOrFile;
|
||||
}
|
||||
|
||||
function readGate(name: string) {
|
||||
|
||||
@ -384,18 +384,22 @@ function selfHealLedgerPath() {
|
||||
}
|
||||
|
||||
function listClusterRuns() {
|
||||
return ghJson([
|
||||
const workflowName = workflowDisplayName(workflow);
|
||||
return ghJson<LooseRecord[]>([
|
||||
"run",
|
||||
"list",
|
||||
"--repo",
|
||||
repo,
|
||||
"--workflow",
|
||||
workflow,
|
||||
"--limit",
|
||||
"50",
|
||||
"200",
|
||||
"--json",
|
||||
"databaseId,displayTitle,headSha,status,conclusion,createdAt,updatedAt,url",
|
||||
]);
|
||||
"databaseId,workflowName,displayTitle,headSha,status,conclusion,createdAt,updatedAt,url",
|
||||
]).filter((run: LooseRecord) => run.workflowName === workflowName);
|
||||
}
|
||||
|
||||
function workflowDisplayName(workflowNameOrFile: string): string {
|
||||
if (workflowNameOrFile === "repair-cluster-worker.yml") return "repair cluster worker";
|
||||
return workflowNameOrFile;
|
||||
}
|
||||
|
||||
function readExecuteGate() {
|
||||
|
||||
@ -281,30 +281,20 @@ function readOpenClawSweeperPrClusters() {
|
||||
|
||||
function readActiveClusterRuns() {
|
||||
const repo = process.env.CLAWSWEEPER_REPO ?? "openclaw/clawsweeper";
|
||||
const statuses = ["queued", "in_progress", "waiting", "requested", "pending"];
|
||||
const runs: LooseRecord[] = [];
|
||||
for (const status of statuses) {
|
||||
try {
|
||||
runs.push(
|
||||
...ghJson([
|
||||
"run",
|
||||
"list",
|
||||
"--repo",
|
||||
repo,
|
||||
"--workflow",
|
||||
REPAIR_CLUSTER_WORKFLOW,
|
||||
"--status",
|
||||
status,
|
||||
"--limit",
|
||||
"100",
|
||||
"--json",
|
||||
"databaseId,status,conclusion,createdAt,updatedAt,url,displayTitle",
|
||||
]),
|
||||
);
|
||||
} catch {
|
||||
// Some statuses are not accepted on older gh versions; active PR detection is still useful.
|
||||
}
|
||||
}
|
||||
const statuses = new Set(["queued", "in_progress", "waiting", "requested", "pending"]);
|
||||
const workflowName = workflowDisplayName(REPAIR_CLUSTER_WORKFLOW);
|
||||
const runs = ghJson<LooseRecord[]>([
|
||||
"run",
|
||||
"list",
|
||||
"--repo",
|
||||
repo,
|
||||
"--limit",
|
||||
"200",
|
||||
"--json",
|
||||
"databaseId,workflowName,status,conclusion,createdAt,updatedAt,url,displayTitle",
|
||||
]).filter((run: LooseRecord) => {
|
||||
return run.workflowName === workflowName && statuses.has(String(run.status));
|
||||
});
|
||||
const byId = new Map();
|
||||
for (const run of runs) byId.set(String(run.databaseId), run);
|
||||
return [...byId.values()].sort((left: JsonValue, right: JsonValue) =>
|
||||
@ -312,6 +302,11 @@ function readActiveClusterRuns() {
|
||||
);
|
||||
}
|
||||
|
||||
function workflowDisplayName(workflow: string): string {
|
||||
if (workflow === "repair-cluster-worker.yml") return "repair cluster worker";
|
||||
return workflow;
|
||||
}
|
||||
|
||||
function publicRow(row: LooseRecord) {
|
||||
return Object.fromEntries(
|
||||
Object.entries(row).filter(([, value]: JsonValue[]) => value !== undefined),
|
||||
|
||||
Loading…
Reference in New Issue
Block a user