"""Per-run 1-to-1 audit across every (model, task, run_idx) triple.

Flags issues beyond aggregate coverage:
  - Tasks where ALL models score 0 (task broken / verifier rejects everyone)
  - Tasks where models produce output but all get C=0 (verifier bug)
  - Tasks with suspiciously high cross-model infra-failure rates (harness bug)
  - Specific runs with harness errors (timeout, handshake)
  - Models with task-specific pathology (e.g., always fails on t3-X)
  - Judge failures per-task that haven't been rejudged
  - Missing runs in archive (logged but not cached)

Usage: python3 scripts/audit_per_run.py
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import re
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
# Directory two levels above this file (the module's usage line runs it as
# scripts/audit_per_run.py, which would make this the project root).
ROOT = Path(__file__).resolve().parent.parent
# Directory holding the per-model run logs; main() reads
# docker_<label>_v2026-4-19-full.log from here.
DRIFT = ROOT / "data" / "drift_2026-04-19-full"
# Directory holding the archived run records; main() scans one sub-directory
# per model (see the first element of each MODEL_MAP value).
ARCH = ROOT / "data" / "run_cache_archive" / "v2026-4-19-full"
|
|
|
|
# Maps a short model label to (archive sub-directory name, display name).
# main() uses the label to build the log file name under DRIFT, the first tuple
# element as the sub-directory under ARCH, and the second as the name printed
# in the report.
MODEL_MAP = {
    "opus46": ("anthropic_claude-opus-4-6", "opus-4-6"),
    "opus47": ("anthropic_claude-opus-4-7", "opus-4-7"),
    "sonnet46": ("anthropic_claude-sonnet-4-6", "sonnet-4-6"),
    "gpt54": ("openai_gpt-5.4", "gpt-5.4"),
    "gemini": ("google_gemini-3.1-pro-preview", "gemini-3.1-pro"),
    "glm": ("openrouter_z-ai_glm-5.1", "glm-5.1"),
    "minimax": ("openrouter_minimax_minimax-m2.7", "minimax-m2.7"),
    "kimi": ("openrouter_moonshotai_kimi-k2.5", "kimi-k2.5"),
    "qwen": ("openrouter_qwen_qwen3.6-plus", "qwen-3.6-plus"),
}
|
|
|
|
# One per-run result line of a model log, anchored at the start of the line:
#   [<seq>/120] <task> (<anything>) run <n>: <+|-|~> <score>
# Groups: seq, task, run number, outcome marker, score. parse_log() subtracts 1
# from the run number before using it as a key.
LOG_LINE = re.compile(
    r"^\[(\d+)/120\]\s+(\S+)\s+\([^)]+\)\s+run\s+(\d+):\s+([+\-~])\s+([\d.]+)"
)
# Harness failure message, searched anywhere in a log line.
# Groups: task, run index (used as-is by parse_log()).
HARNESS_ERR = re.compile(r"ERROR clawbench\.harness: Run (\S+)/(\d+) failed")
# Lower-case substrings; scan_archive() looks for them in the lower-cased judge
# "reason" text to decide that a judge call failed for infrastructure reasons.
JUDGE_INFRA_PHRASES = [
    "gateway is restarting", "judge execution failed", "judge failed to run",
    "judge call failed", "judge timed out",
]
|
|
|
|
|
|
def parse_log(log_path: Path):
    """Extract per-run scores and harness failures from one model log.

    Args:
        log_path: Path of the log file. A missing file yields two empty dicts.

    Returns:
        A ``(runs, errors)`` tuple:

        * ``runs`` maps ``(task, run_number - 1)`` to
          ``{"score": float, "outcome": marker}`` for every line matching
          ``LOG_LINE``; a later line for the same key replaces an earlier one.
        * ``errors`` maps ``(task, run_index)`` to ``"harness_error"`` for
          every line containing a ``HARNESS_ERR`` match.

    NOTE(review): result-line run numbers are shifted down by one while
    harness-error indices are stored unchanged -- confirm that the harness
    message really is 0-based.
    """
    scored = {}
    failed = {}
    if log_path.exists():
        # Undecodable bytes are dropped rather than aborting the parse.
        for raw in log_path.read_text(errors="ignore").splitlines():
            # Result lines are matched from the first non-blank character.
            result = LOG_LINE.match(raw.strip())
            if result is not None:
                task, run_no, outcome, score = result.group(2, 3, 4, 5)
                scored[(task, int(run_no) - 1)] = {
                    "score": float(score),
                    "outcome": outcome,
                }
            # Harness failures may appear anywhere in the (unstripped) line.
            failure = HARNESS_ERR.search(raw)
            if failure is not None:
                task, run_no = failure.groups()
                failed[(task, int(run_no))] = "harness_error"
    return scored, failed
|
|
|
|
|
|
def scan_archive(cache_dir: Path):
    """Index the cached run records stored under ``cache_dir``.

    Every sub-directory of ``cache_dir`` is treated as one task, and every
    ``run<N>.json`` file inside it as one run of that task. Files that cannot
    be read, are not valid JSON, or whose top level is not a JSON object are
    skipped (best effort) instead of aborting the whole scan.

    Args:
        cache_dir: Directory holding one sub-directory per task.

    Returns:
        A dict mapping ``(task_dir_name, N)`` to a summary dict with the keys
        ``run_score``, ``c``, ``t``, ``b`` (completion / trajectory / behavior
        scores), ``j`` (judge score, or ``None`` when the judge is not
        enabled), ``judge_infra_failed``, ``rejudged``, ``delivery``,
        ``failure_mode``, ``error``, ``n_messages`` and ``has_assistant_text``.
        Empty when ``cache_dir`` is not an existing directory.
    """
    out = {}
    if not cache_dir.is_dir():
        return out
    for tdir in cache_dir.iterdir():
        if not tdir.is_dir():
            continue
        for rf in tdir.glob("run*.json"):
            m = re.match(r"run(\d+)\.json", rf.name)
            if not m:
                continue
            try:
                # Context manager closes the handle; JSON is read as UTF-8.
                with open(rf, encoding="utf-8") as fh:
                    d = json.load(fh)
            except (OSError, ValueError):
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
                continue
            if not isinstance(d, dict):
                continue

            # ``or {}`` also covers sections stored as an explicit JSON null.
            jr = d.get("judge_result") or {}
            completion = d.get("completion_result") or {}
            trajectory = d.get("trajectory_result") or {}
            behavior = d.get("behavior_result") or {}
            messages = (d.get("transcript") or {}).get("messages") or []

            reason = (jr.get("reason") or "").lower()
            # Don't flag rejudged runs as infra-failed even if reason is empty:
            # a rejudged run has a real judge call behind it (rejudged_at field).
            judge_infra = (
                jr.get("enabled")
                and "rejudged_at" not in jr
                and (
                    any(p in reason for p in JUDGE_INFRA_PHRASES)
                    or jr.get("error")
                    or (not reason.strip() and jr.get("score", 0) == 0)
                )
            )
            out[(tdir.name, int(m.group(1)))] = {
                "run_score": d.get("run_score", 0),
                "c": completion.get("score", 0),
                "t": trajectory.get("score", 0),
                "b": behavior.get("score", 0),
                "j": jr.get("score", 0) if jr.get("enabled") else None,
                "judge_infra_failed": bool(judge_infra),
                "rejudged": "rejudged_at" in jr,
                "delivery": d.get("delivery_outcome"),
                "failure_mode": d.get("failure_mode"),
                "error": d.get("error"),
                "n_messages": len(messages),
                "has_assistant_text": any(
                    msg.get("role") == "assistant" and msg.get("text")
                    for msg in messages
                ),
            }
    return out
|
|
|
|
|
|
_RUNS_PER_TASK = 3        # run indices 0..2 are audited for every task
_EXPECTED_RUNS = 120      # per-model run count that counts as full coverage
_CLUSTER_THRESHOLD = 5    # cross-model failure count that flags a task
_ZERO_SCORE_EPS = 0.01    # scores at or below this count as zero
_MAX_LISTED_TASKS = 6     # all-zero tasks listed per model before truncating
_RULE = "=" * 70


def _load_per_model():
    """Parse the log and scan the archive of every model in MODEL_MAP."""
    per_model = {}
    for label, (sub, pretty) in MODEL_MAP.items():
        logged, errors = parse_log(DRIFT / f"docker_{label}_v2026-4-19-full.log")
        archived = scan_archive(ARCH / sub)
        per_model[pretty] = {
            "logged": logged, "errors": errors, "archived": archived,
        }
    return per_model


def _collect_tasks(per_model):
    """Return the set of task names seen in any model's archive or log."""
    tasks = set()
    for data in per_model.values():
        tasks.update(task for task, _ in data["archived"])
        tasks.update(task for task, _ in data["logged"])
    return tasks


def _classify_tasks(per_model, all_tasks):
    """Compute cross-model statistics per task and bucket suspicious tasks."""
    issues = defaultdict(list)
    for task in sorted(all_tasks):
        scores = []
        completions = []
        outputs = []  # did the model produce assistant text?
        judge_infra = 0
        harness_err = 0
        for data in per_model.values():
            for run_idx in range(_RUNS_PER_TASK):
                key = (task, run_idx)
                archived = data["archived"].get(key)
                logged = data["logged"].get(key)
                if archived:
                    # The archive is preferred; it carries the full breakdown.
                    scores.append(archived["run_score"])
                    completions.append(archived["c"])
                    outputs.append(archived["has_assistant_text"])
                    if archived["judge_infra_failed"]:
                        judge_infra += 1
                elif logged:
                    # Fall back to the logged score when the run is not cached.
                    scores.append(logged["score"])
                if key in data["errors"]:
                    harness_err += 1

        if not scores:
            continue
        mean_score = sum(scores) / len(scores)
        mean_c = sum(completions) / len(completions) if completions else 0
        output_rate = sum(outputs) / len(outputs) if outputs else 0

        if mean_score < 0.1:
            issues["task_fails_all_models"].append((task, mean_score, output_rate))
        if mean_c < 0.05 and output_rate > 0.5:
            issues["verifier_rejects_valid_outputs"].append((task, mean_c, output_rate))
        if harness_err >= _CLUSTER_THRESHOLD:
            issues["harness_errors_cluster"].append((task, harness_err))
        if judge_infra >= _CLUSTER_THRESHOLD:
            issues["judge_infra_cluster"].append((task, judge_infra))
    return issues


def _print_header(title):
    """Print a section title framed by two rule lines."""
    print(_RULE)
    print(title)
    print(_RULE)


def _print_task_issues(issues):
    """Print the four task-level issue sections."""
    _print_header("ISSUE: Tasks where ALL models score near-zero (broken verifier or task)")
    for task, mean, out_rate in sorted(issues["task_fails_all_models"]):
        print(f" {task:<40} mean_score={mean:.3f} assistant_output_rate={out_rate:.1%}")

    print()
    _print_header("ISSUE: Verifier rejects valid outputs (model produced text but C=0)")
    for task, mean_c, out_rate in sorted(issues["verifier_rejects_valid_outputs"]):
        print(f" {task:<40} mean_completion={mean_c:.3f} assistant_output_rate={out_rate:.1%}")

    print()
    _print_header("ISSUE: Harness-error clusters (gateway failures per task)")
    for task, n in sorted(issues["harness_errors_cluster"], key=lambda x: -x[1]):
        print(f" {task:<40} harness_error_count={n}")

    print()
    _print_header("ISSUE: Judge-infra clusters (judge failing per task)")
    for task, n in sorted(issues["judge_infra_cluster"], key=lambda x: -x[1]):
        print(f" {task:<40} judge_infra_failures={n} (should be rejudged)")


def _all_zero_tasks(data, all_tasks):
    """Return the tasks on which one model scored ~0 in every audited run."""
    zero_tasks = []
    for task in sorted(all_tasks):
        scores = []
        for run_idx in range(_RUNS_PER_TASK):
            key = (task, run_idx)
            archived = data["archived"].get(key)
            logged = data["logged"].get(key)
            if archived:
                scores.append(archived["run_score"])
            elif logged:
                scores.append(logged["score"])
            else:
                # A missing run means the all-zero claim can't be confirmed.
                break
        else:
            if not any(score > _ZERO_SCORE_EPS for score in scores):
                zero_tasks.append(task)
    return zero_tasks


def _print_model_pathologies(per_model, all_tasks):
    """Print, per model, the tasks where every audited run scored ~0."""
    print()
    _print_header(
        "ISSUE: Model-specific task pathologies (all 3 runs of a task scored 0 on one model)"
    )
    for pretty, data in per_model.items():
        zero_tasks = _all_zero_tasks(data, all_tasks)
        if not zero_tasks:
            continue
        print(f" {pretty:<18}: all-zero on {len(zero_tasks)} tasks")
        for t in zero_tasks[:_MAX_LISTED_TASKS]:
            print(f" - {t}")
        hidden = len(zero_tasks) - _MAX_LISTED_TASKS
        if hidden > 0:
            # Make the truncation visible instead of silently dropping tasks.
            print(f" ... (+{hidden} more)")


def _print_coverage(per_model):
    """Print the models whose logged or archived run count is below the target."""
    print()
    _print_header(
        "COVERAGE: Models with non-complete coverage (logged != 120 or archived != 120)"
    )
    for pretty, data in per_model.items():
        n_log = len(data["logged"])
        n_arch = len(data["archived"])
        if n_log < _EXPECTED_RUNS or n_arch < _EXPECTED_RUNS:
            print(f" {pretty:<18} logged={n_log:<4} archived={n_arch:<4} missing={_EXPECTED_RUNS - max(n_log, n_arch)}")


def main():
    """Run the per-run audit and print the report to stdout.

    Loads each model's log and archive, classifies tasks by cross-model
    statistics, then prints the task-level issues, the per-model all-zero
    tasks and the coverage gaps, in that order.
    """
    per_model = _load_per_model()
    all_tasks = _collect_tasks(per_model)
    issues = _classify_tasks(per_model, all_tasks)
    _print_task_issues(issues)
    _print_model_pathologies(per_model, all_tasks)
    _print_coverage(per_model)
|
|
|
|
|
|
# Run the audit only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|