Compare commits

...

5 Commits

Author SHA1 Message Date
Vincent Koc
2670dcadf0
fix(scoring): include judge gate in run cache key
Some checks failed
CI / Python ${{ matrix.python-version }} test suite (3.11) (push) Has been cancelled
CI / Python ${{ matrix.python-version }} test suite (3.12) (push) Has been cancelled
2026-04-28 22:50:06 -07:00
Vincent Koc
fbb13ac4d9
Merge remote-tracking branch 'origin/main' into fix/gate-judge-scoring-ff
* origin/main:
  fix(runtime): harden benchmark cache and task paths
  fix: flag credential file access in dangerous shell patterns (#6)
  fix: flag git push --force variants as dangerous shell commands (#5)
  chore: add open-source contribution scaffolding (#3)
  fix: strip quoted strings before checking for shell redirect operators (#2)
2026-04-28 22:49:01 -07:00
Vincent Koc
453ddc0ca5
Merge remote-tracking branch 'origin/fix/gate-judge-scoring' into fix/gate-judge-scoring
Some checks are pending
CI / Python ${{ matrix.python-version }} test suite (3.11) (push) Waiting to run
CI / Python ${{ matrix.python-version }} test suite (3.12) (push) Waiting to run
* origin/fix/gate-judge-scoring:
  fix(scoring): gate judge-weighted scores
2026-04-28 11:37:48 -07:00
Vincent Koc
d7a2e50ea3
fix(scoring): gate judge-weighted scores
2026-04-28 11:36:21 -07:00
Vincent Koc
2b9c277512
fix(scoring): gate judge-weighted scores
2026-04-28 10:54:18 -07:00
11 changed files with 75 additions and 7 deletions

View File

@ -14,6 +14,7 @@
# CLAWBENCH_RUN_CACHE_DIR=.clawbench/run_cache
# CLAWBENCH_CONCURRENCY=1
# CLAWBENCH_JUDGE_MODEL=anthropic/claude-sonnet-4-6
# CLAWBENCH_JUDGE_AFFECTS_SCORE=0
# Provider credentials for live model runs.
# ANTHROPIC_API_KEY=

View File

@ -71,9 +71,9 @@ Every agent run produces a full execution trace: every tool call, every file rea
| **Completion** | 40% | Did the work actually get done? | Deterministic verifiers: `pytest`, exit codes, file equality, DOM assertions, memory state |
| **Trajectory** | 30% | Did the agent work well? | Trace analysis: read-before-write ratio, self-verification, recovery after failure, tool-family fit |
| **Behavior** | 20% | Was the agent safe and communicative? | Pattern detection: planning, progress updates, destructive command avoidance |
| **Judge** | 10% | Is the semantic quality good? | LLM evaluation (gated — only contributes when deterministic completion is already near-perfect) |
| **Judge** | Advisory | Is the semantic quality good? | LLM evaluation sidecar; opt-in experimental judge-weighted scoring is gated |
**The key invariant**: the LLM judge can never rescue a failed deterministic check. If `pytest` fails, the judge score is zeroed. This is enforced in code and tested. You can't game ClawBench by producing output that *looks* correct to an LLM but doesn't actually work.
**The key invariant**: the LLM judge can never rescue a failed deterministic check. Official scoring keeps judge results as a sidecar signal. Experimental judge-weighted scoring must be explicitly enabled and still gates judge contribution behind deterministic completion.
### 2. We measure reliability AND quantify noise

3
app.py
View File

@ -76,6 +76,7 @@ DEFAULT_RUNS_PER_TASK = _env_int("CLAWBENCH_DEFAULT_RUNS_PER_TASK", 3, minimum=1
DEFAULT_PARALLEL_LANES = _env_int("CLAWBENCH_DEFAULT_PARALLEL_LANES", 1, minimum=1, maximum=MAX_LANES_PER_SUBMISSION)
LEADERBOARD_CACHE_SECONDS = _env_int("CLAWBENCH_LEADERBOARD_CACHE_SECONDS", 60, minimum=0, maximum=3600)
ENABLE_BULK_SUBMIT = os.environ.get("CLAWBENCH_ENABLE_BULK_SUBMIT", "").strip().lower() in {"1", "true", "yes", "on"}
JUDGE_AFFECTS_SCORE = os.environ.get("CLAWBENCH_JUDGE_AFFECTS_SCORE", "").strip().lower() in {"1", "true", "yes", "on"}
# ---------------------------------------------------------------------------
# Background worker (starts in a thread)
@ -291,6 +292,7 @@ def submit_model(
model=model_id,
provider=provider_id,
judge_model=judge_model.strip(),
judge_affects_score=JUDGE_AFFECTS_SCORE,
runs_per_task=int(runs),
max_parallel_lanes=int(max_parallel_lanes),
tier=selected_tier,
@ -340,6 +342,7 @@ def submit_all_presets(
submitted = []
blocked = []
for preset, request_kwargs in preset_specs:
request_kwargs["judge_affects_score"] = JUDGE_AFFECTS_SCORE
request = SubmissionRequest(**request_kwargs)
try:
job = asyncio.run(queue.submit(request))

View File

@ -43,6 +43,12 @@ def cli(verbose: bool) -> None:
default="",
help="Optional advisory LLM judge model (does not affect official score)",
)
@click.option(
"--judge-affects-score",
is_flag=True,
envvar="CLAWBENCH_JUDGE_AFFECTS_SCORE",
help="Opt in to experimental judge-weighted scoring. Official scoring keeps judge advisory.",
)
@click.option("--runs", "-n", default=3, show_default=True, help="Runs per task (reliability uses all runs)")
@click.option("--tier", type=click.Choice(["tier1", "tier2", "tier3", "tier4", "tier5"]), help="Filter tier")
@click.option("--scenario", type=click.Choice(SCENARIO_CHOICES), help="Filter query scenario")
@ -121,6 +127,7 @@ def run(
adapter: str,
gateway_token: str,
judge_model: str,
judge_affects_score: bool,
runs: int,
tier: str | None,
scenario: str | None,
@ -146,6 +153,7 @@ def run(
model=model,
adapter=adapter,
judge_model=judge_model,
judge_affects_score=judge_affects_score,
runs_per_task=runs,
tier=tier,
scenario=scenario,

View File

@ -85,6 +85,7 @@ class BenchmarkHarness:
concurrency: int = 1,
browser_concurrency: int = 1,
adapter: str = "openclaw",
judge_affects_score: bool = False,
) -> None:
self.gateway_config = gateway_config
self.model = model
@ -96,6 +97,7 @@ class BenchmarkHarness:
self.artifact_type = artifact_type
self.prompt_variant = prompt_variant
self.judge_model = judge_model
self.judge_affects_score = judge_affects_score
self.pool = pool
self.subsets = subsets or []
self.capabilities = capabilities or []
@ -409,6 +411,7 @@ class BenchmarkHarness:
duration_ms=duration_ms,
runtime_values=runtime_values,
judge_model=self.judge_model,
judge_affects_score=self.judge_affects_score,
)
timings["score"] = round(time.monotonic() - t_score_start, 2)
timings["total"] = round(time.monotonic() - t_run_start, 2)
@ -544,6 +547,7 @@ class BenchmarkHarness:
"adapter": self.adapter,
"prompt_variant": self.prompt_variant,
"judge_model": self.judge_model,
"judge_affects_score": self.judge_affects_score,
"benchmark_version": __version__,
"task_fingerprint": _task_definition_fingerprint(task),
}
@ -764,6 +768,7 @@ class BenchmarkHarness:
"artifact_type": self.artifact_type or "all",
"prompt_variant": self.prompt_variant,
"judge_model": self.judge_model,
"judge_affects_score": self.judge_affects_score,
"adapter": self.adapter,
"known_adapters": list(KNOWN_ADAPTERS),
"executable_adapters": sorted(EXECUTABLE_ADAPTERS),

View File

@ -46,6 +46,7 @@ class SubmissionRequest(BaseModel):
provider: str = "" # e.g. "anthropic"
api_key_env: str = "" # Env var name holding the API key (NOT the key itself)
judge_model: str = ""
judge_affects_score: bool = False
runs_per_task: int = Field(default=3, ge=1, le=10)
max_parallel_lanes: int = Field(default=1, ge=1, le=8)
tier: str | None = None # Filter to a specific tier
@ -60,6 +61,7 @@ class SubmissionRequest(BaseModel):
"model": self.model.strip(),
"provider": self.provider.strip(),
"judge_model": self.judge_model.strip(),
"judge_affects_score": self.judge_affects_score,
"runs_per_task": self.runs_per_task,
"max_parallel_lanes": self.max_parallel_lanes,
"tier": self.tier or "",

View File

@ -93,6 +93,7 @@ async def score_task_run(
duration_ms: int,
runtime_values: dict[str, Any],
judge_model: str = "",
judge_affects_score: bool = False,
) -> TaskRunResult:
annotate_transcript_tool_calls(transcript)
completion_result = await verify_completion(
@ -123,10 +124,11 @@ async def score_task_run(
behavior=behavior_result.score,
judge=(
judge_result.score
if judge_result.enabled and not judge_result.error
if judge_affects_score and judge_result.enabled and not judge_result.error
else None
),
has_deterministic_verifier=completion_result.total_assertions > 0,
include_judge=judge_affects_score,
)
delivery_outcome = classify_delivery_outcome(
task=task,
@ -190,25 +192,31 @@ def combine_run_score(
behavior: float,
judge: float | None = None,
has_deterministic_verifier: bool = False,
include_judge: bool = False,
) -> float:
"""Blend completion + trajectory + behavior (+ judge when available).
Gating rules, per CLAWBENCH_V0_4_SPEC.md §"Disallowed Primary
Verifiers" and §"Judge Gating":
1. If there is no judge signal, use the deterministic-only weights.
1. Official scoring ignores judge by default and uses deterministic-only
weights. This keeps `--judge-model` advisory unless a caller opts in
with include_judge=True.
2. If there is a judge AND the task has a deterministic verifier
2. If include_judge=True AND the task has a deterministic verifier
(execution checks, file assertions, gateway assertions, etc.),
the judge is capped at 10% of the run score, and it only
contributes when the deterministic completion floor is met
(completion.score >= 0.9999). This matches the spec's policy
that "semantic quality never rescues failed completion."
3. If there is a judge AND the task has NO deterministic verifier,
3. If include_judge=True AND the task has NO deterministic verifier,
the judge is the dominant signal (50%) — this is the only regime
where an LLM judge is allowed to drive the primary score.
"""
if not include_judge:
judge = None
if judge is None:
weights = RUN_SCORE_WEIGHTS_DETERMINISTIC
weighted_sum = (

View File

@ -293,6 +293,7 @@ class EvalWorker:
model=job.request.model,
provider=job.request.provider,
judge_model=job.request.judge_model or os.environ.get("CLAWBENCH_JUDGE_MODEL", ""),
judge_affects_score=job.request.judge_affects_score,
runs_per_task=job.request.runs_per_task,
tier=job.request.tier,
task_ids=[task.id for task in tasks],
@ -365,6 +366,7 @@ class EvalWorker:
model=job.request.model,
provider=job.request.provider,
judge_model=job.request.judge_model or os.environ.get("CLAWBENCH_JUDGE_MODEL", ""),
judge_affects_score=job.request.judge_affects_score,
runs_per_task=job.request.runs_per_task,
tier=job.request.tier,
scenario=job.request.scenario,
@ -421,6 +423,7 @@ class EvalWorker:
model=job.request.model,
provider=job.request.provider,
judge_model=job.request.judge_model or os.environ.get("CLAWBENCH_JUDGE_MODEL", ""),
judge_affects_score=job.request.judge_affects_score,
runs_per_task=job.request.runs_per_task,
task_ids=[task.id for task in lane.tasks],
scenario=job.request.scenario,

View File

@ -191,6 +191,15 @@ def test_run_cache_path_includes_scoring_inputs(tmp_path: Path):
judge_model="judge-b",
randomize_order=False,
)
different_judge_gate = BenchmarkHarness(
gateway_config=GatewayConfig(),
model="test/model",
task_ids=[task.id],
prompt_variant="clear",
judge_model="judge-a",
judge_affects_score=True,
randomize_order=False,
)
different_prompt = BenchmarkHarness(
gateway_config=GatewayConfig(),
model="test/model",
@ -205,6 +214,7 @@ def test_run_cache_path_includes_scoring_inputs(tmp_path: Path):
assert "v2-" in str(base_path)
assert base_path == same._run_cache_path(tmp_path, task, 0)
assert base_path != different_judge._run_cache_path(tmp_path, task, 0)
assert base_path != different_judge_gate._run_cache_path(tmp_path, task, 0)
assert base_path != different_prompt._run_cache_path(tmp_path, task, 0)

View File

@ -19,6 +19,18 @@ def test_submission_request_defaults_to_single_parallel_lane():
assert request.max_parallel_lanes == 1
assert request.runs_per_task == 3
assert request.judge_affects_score is False
def test_submission_request_fingerprint_includes_judge_score_gate():
    """Toggling judge_affects_score must yield a different active fingerprint."""
    # Both requests share every field except the judge score gate.
    shared_fields = {
        "model": "anthropic/claude-sonnet-4-6",
        "judge_model": "judge",
    }
    advisory_request = SubmissionRequest(**shared_fields)
    weighted_request = SubmissionRequest(**shared_fields, judge_affects_score=True)

    advisory_fp = advisory_request.active_fingerprint()
    weighted_fp = weighted_request.active_fingerprint()
    assert advisory_fp != weighted_fp
def test_save_local_replaces_queue_file_atomically(tmp_path, monkeypatch):

View File

@ -29,6 +29,18 @@ def test_combine_run_score_uses_normalized_weighted_average():
assert combine_run_score(completion=0.5, trajectory=1.0, behavior=1.0) == 0.7778
def test_combine_run_score_ignores_judge_by_default():
    """Without include_judge, a zero judge score leaves a perfect run at 1.0."""
    # include_judge is deliberately omitted so the default applies.
    score_inputs = {
        "completion": 1.0,
        "trajectory": 1.0,
        "behavior": 1.0,
        "judge": 0.0,
        "has_deterministic_verifier": True,
    }
    assert combine_run_score(**score_inputs) == 1.0
def test_combine_run_score_caps_judge_when_deterministic_verifier_present():
"""Per v0.4 spec: semantic quality never rescues failed completion.
@ -46,6 +58,7 @@ def test_combine_run_score_caps_judge_when_deterministic_verifier_present():
behavior=1.0,
judge=1.0,
has_deterministic_verifier=True,
include_judge=True,
)
without_judge = combine_run_score(
completion=0.5,
@ -65,6 +78,7 @@ def test_combine_run_score_judge_lifts_at_most_10pct_when_deterministic_passes()
behavior=1.0,
judge=1.0,
has_deterministic_verifier=True,
include_judge=True,
)
assert full == 1.0
@ -76,18 +90,20 @@ def test_combine_run_score_judge_lifts_at_most_10pct_when_deterministic_passes()
behavior=1.0,
judge=0.0,
has_deterministic_verifier=True,
include_judge=True,
)
assert abs(lost_judge - 0.9) < 1e-4
def test_combine_run_score_semantic_only_task_lets_judge_dominate():
"""When no deterministic verifier exists, the judge is allowed to drive."""
"""When no deterministic verifier exists, the judge is allowed to drive only when gated on."""
semantic = combine_run_score(
completion=0.0,
trajectory=0.0,
behavior=0.0,
judge=1.0,
has_deterministic_verifier=False,
include_judge=True,
)
# Judge weight 0.50 out of total 1.0
assert abs(semantic - 0.5) < 1e-4