fix(scoring): include judge gate in run cache key

Merge remote-tracking branch 'origin/main' into fix/gate-judge-scoring-ff
* origin/main: fix(runtime): harden benchmark cache and task paths fix: flag credential file access in dangerous shell patterns (#6) fix: flag git push --force variants as dangerous shell commands (#5) chore: add open-source contribution scaffolding (#3) fix: strip quoted strings before checking for shell redirect operators (#2)
2026-04-28 22:50:06 -07:00 · 2026-04-28 22:49:01 -07:00 · 2026-04-28 11:37:48 -07:00 · 2026-04-28 11:36:21 -07:00 · 2026-04-28 10:54:18 -07:00
11 changed files with 75 additions and 7 deletions
--- a/.env.example
+++ b/.env.example
@@ -14,6 +14,7 @@
 # CLAWBENCH_RUN_CACHE_DIR=.clawbench/run_cache
 # CLAWBENCH_CONCURRENCY=1
 # CLAWBENCH_JUDGE_MODEL=anthropic/claude-sonnet-4-6
+# CLAWBENCH_JUDGE_AFFECTS_SCORE=0

 # Provider credentials for live model runs.
 # ANTHROPIC_API_KEY=
--- a/README.md
+++ b/README.md
@@ -71,9 +71,9 @@ Every agent run produces a full execution trace: every tool call, every file rea
 | **Completion** | 40% | Did the work actually get done? | Deterministic verifiers: `pytest`, exit codes, file equality, DOM assertions, memory state |
 | **Trajectory** | 30% | Did the agent work well? | Trace analysis: read-before-write ratio, self-verification, recovery after failure, tool-family fit |
 | **Behavior** | 20% | Was the agent safe and communicative? | Pattern detection: planning, progress updates, destructive command avoidance |
-| **Judge** | 10% | Is the semantic quality good? | LLM evaluation (gated — only contributes when deterministic completion is already near-perfect) |
+| **Judge** | Advisory | Is the semantic quality good? | LLM evaluation sidecar; opt-in experimental judge-weighted scoring is gated |

-**The key invariant**: the LLM judge can never rescue a failed deterministic check. If `pytest` fails, the judge score is zeroed. This is enforced in code and tested. You can't game ClawBench by producing output that *looks* correct to an LLM but doesn't actually work.
+**The key invariant**: the LLM judge can never rescue a failed deterministic check. Official scoring keeps judge results as a sidecar signal. Experimental judge-weighted scoring must be explicitly enabled and still gates judge contribution behind deterministic completion.

 ### 2. We measure reliability AND quantify noise

--- a/app.py
+++ b/app.py
@@ -76,6 +76,7 @@ DEFAULT_RUNS_PER_TASK = _env_int("CLAWBENCH_DEFAULT_RUNS_PER_TASK", 3, minimum=1
 DEFAULT_PARALLEL_LANES = _env_int("CLAWBENCH_DEFAULT_PARALLEL_LANES", 1, minimum=1, maximum=MAX_LANES_PER_SUBMISSION)
 LEADERBOARD_CACHE_SECONDS = _env_int("CLAWBENCH_LEADERBOARD_CACHE_SECONDS", 60, minimum=0, maximum=3600)
 ENABLE_BULK_SUBMIT = os.environ.get("CLAWBENCH_ENABLE_BULK_SUBMIT", "").strip().lower() in {"1", "true", "yes", "on"}
+JUDGE_AFFECTS_SCORE = os.environ.get("CLAWBENCH_JUDGE_AFFECTS_SCORE", "").strip().lower() in {"1", "true", "yes", "on"}

 # ---------------------------------------------------------------------------
 # Background worker (starts in a thread)
@@ -291,6 +292,7 @@ def submit_model(
        model=model_id,
        provider=provider_id,
        judge_model=judge_model.strip(),
+        judge_affects_score=JUDGE_AFFECTS_SCORE,
        runs_per_task=int(runs),
        max_parallel_lanes=int(max_parallel_lanes),
        tier=selected_tier,
@@ -340,6 +342,7 @@ def submit_all_presets(
    submitted = []
    blocked = []
    for preset, request_kwargs in preset_specs:
+        request_kwargs["judge_affects_score"] = JUDGE_AFFECTS_SCORE
        request = SubmissionRequest(**request_kwargs)
        try:
            job = asyncio.run(queue.submit(request))
--- a/clawbench/cli.py
+++ b/clawbench/cli.py
@@ -43,6 +43,12 @@ def cli(verbose: bool) -> None:
    default="",
    help="Optional advisory LLM judge model (does not affect official score)",
 )
+@click.option(
+    "--judge-affects-score",
+    is_flag=True,
+    envvar="CLAWBENCH_JUDGE_AFFECTS_SCORE",
+    help="Opt in to experimental judge-weighted scoring. Official scoring keeps judge advisory.",
+)
 @click.option("--runs", "-n", default=3, show_default=True, help="Runs per task (reliability uses all runs)")
 @click.option("--tier", type=click.Choice(["tier1", "tier2", "tier3", "tier4", "tier5"]), help="Filter tier")
 @click.option("--scenario", type=click.Choice(SCENARIO_CHOICES), help="Filter query scenario")
@@ -121,6 +127,7 @@ def run(
    adapter: str,
    gateway_token: str,
    judge_model: str,
+    judge_affects_score: bool,
    runs: int,
    tier: str | None,
    scenario: str | None,
@@ -146,6 +153,7 @@ def run(
        model=model,
        adapter=adapter,
        judge_model=judge_model,
+        judge_affects_score=judge_affects_score,
        runs_per_task=runs,
        tier=tier,
        scenario=scenario,
--- a/clawbench/harness.py
+++ b/clawbench/harness.py
@@ -85,6 +85,7 @@ class BenchmarkHarness:
        concurrency: int = 1,
        browser_concurrency: int = 1,
        adapter: str = "openclaw",
+        judge_affects_score: bool = False,
    ) -> None:
        self.gateway_config = gateway_config
        self.model = model
@@ -96,6 +97,7 @@ class BenchmarkHarness:
        self.artifact_type = artifact_type
        self.prompt_variant = prompt_variant
        self.judge_model = judge_model
+        self.judge_affects_score = judge_affects_score
        self.pool = pool
        self.subsets = subsets or []
        self.capabilities = capabilities or []
@@ -409,6 +411,7 @@ class BenchmarkHarness:
                    duration_ms=duration_ms,
                    runtime_values=runtime_values,
                    judge_model=self.judge_model,
+                    judge_affects_score=self.judge_affects_score,
                )
                timings["score"] = round(time.monotonic() - t_score_start, 2)
                timings["total"] = round(time.monotonic() - t_run_start, 2)
@@ -544,6 +547,7 @@ class BenchmarkHarness:
            "adapter": self.adapter,
            "prompt_variant": self.prompt_variant,
            "judge_model": self.judge_model,
+            "judge_affects_score": self.judge_affects_score,
            "benchmark_version": __version__,
            "task_fingerprint": _task_definition_fingerprint(task),
        }
@@ -764,6 +768,7 @@ class BenchmarkHarness:
                "artifact_type": self.artifact_type or "all",
                "prompt_variant": self.prompt_variant,
                "judge_model": self.judge_model,
+                "judge_affects_score": self.judge_affects_score,
                "adapter": self.adapter,
                "known_adapters": list(KNOWN_ADAPTERS),
                "executable_adapters": sorted(EXECUTABLE_ADAPTERS),
--- a/clawbench/queue.py
+++ b/clawbench/queue.py
@@ -46,6 +46,7 @@ class SubmissionRequest(BaseModel):
    provider: str = ""  # e.g. "anthropic"
    api_key_env: str = ""  # Env var name holding the API key (NOT the key itself)
    judge_model: str = ""
+    judge_affects_score: bool = False
    runs_per_task: int = Field(default=3, ge=1, le=10)
    max_parallel_lanes: int = Field(default=1, ge=1, le=8)
    tier: str | None = None  # Filter to a specific tier
@@ -60,6 +61,7 @@ class SubmissionRequest(BaseModel):
            "model": self.model.strip(),
            "provider": self.provider.strip(),
            "judge_model": self.judge_model.strip(),
+            "judge_affects_score": self.judge_affects_score,
            "runs_per_task": self.runs_per_task,
            "max_parallel_lanes": self.max_parallel_lanes,
            "tier": self.tier or "",
--- a/clawbench/scorer.py
+++ b/clawbench/scorer.py
@@ -93,6 +93,7 @@ async def score_task_run(
    duration_ms: int,
    runtime_values: dict[str, Any],
    judge_model: str = "",
+    judge_affects_score: bool = False,
 ) -> TaskRunResult:
    annotate_transcript_tool_calls(transcript)
    completion_result = await verify_completion(
@@ -123,10 +124,11 @@ async def score_task_run(
        behavior=behavior_result.score,
        judge=(
            judge_result.score
-            if judge_result.enabled and not judge_result.error
+            if judge_affects_score and judge_result.enabled and not judge_result.error
            else None
        ),
        has_deterministic_verifier=completion_result.total_assertions > 0,
+        include_judge=judge_affects_score,
    )
    delivery_outcome = classify_delivery_outcome(
        task=task,
@@ -190,25 +192,31 @@ def combine_run_score(
    behavior: float,
    judge: float | None = None,
    has_deterministic_verifier: bool = False,
+    include_judge: bool = False,
 ) -> float:
    """Blend completion + trajectory + behavior (+ judge when available).

    Gating rules, per CLAWBENCH_V0_4_SPEC.md §"Disallowed Primary
    Verifiers" and §"Judge Gating":

-    1. If there is no judge signal, use the deterministic-only weights.
+    1. Official scoring ignores judge by default and uses deterministic-only
+       weights. This keeps `--judge-model` advisory unless a caller opts in
+       with include_judge=True.

-    2. If there is a judge AND the task has a deterministic verifier
+    2. If include_judge=True AND the task has a deterministic verifier
       (execution checks, file assertions, gateway assertions, etc.),
       the judge is capped at 10% of the run score, and it only
       contributes when the deterministic completion floor is met
       (completion.score >= 0.9999). This matches the spec's policy
       that "semantic quality never rescues failed completion."

-    3. If there is a judge AND the task has NO deterministic verifier,
+    3. If include_judge=True AND the task has NO deterministic verifier,
       the judge is the dominant signal (50%) — this is the only regime
       where an LLM judge is allowed to drive the primary score.
    """
+    if not include_judge:
+        judge = None
+
    if judge is None:
        weights = RUN_SCORE_WEIGHTS_DETERMINISTIC
        weighted_sum = (
--- a/clawbench/worker.py
+++ b/clawbench/worker.py
@@ -293,6 +293,7 @@ class EvalWorker:
            model=job.request.model,
            provider=job.request.provider,
            judge_model=job.request.judge_model or os.environ.get("CLAWBENCH_JUDGE_MODEL", ""),
+            judge_affects_score=job.request.judge_affects_score,
            runs_per_task=job.request.runs_per_task,
            tier=job.request.tier,
            task_ids=[task.id for task in tasks],
@@ -365,6 +366,7 @@ class EvalWorker:
                model=job.request.model,
                provider=job.request.provider,
                judge_model=job.request.judge_model or os.environ.get("CLAWBENCH_JUDGE_MODEL", ""),
+                judge_affects_score=job.request.judge_affects_score,
                runs_per_task=job.request.runs_per_task,
                tier=job.request.tier,
                scenario=job.request.scenario,
@@ -421,6 +423,7 @@ class EvalWorker:
            model=job.request.model,
            provider=job.request.provider,
            judge_model=job.request.judge_model or os.environ.get("CLAWBENCH_JUDGE_MODEL", ""),
+            judge_affects_score=job.request.judge_affects_score,
            runs_per_task=job.request.runs_per_task,
            task_ids=[task.id for task in lane.tasks],
            scenario=job.request.scenario,
--- a/tests/test_harness.py
+++ b/tests/test_harness.py
@@ -191,6 +191,15 @@ def test_run_cache_path_includes_scoring_inputs(tmp_path: Path):
        judge_model="judge-b",
        randomize_order=False,
    )
+    different_judge_gate = BenchmarkHarness(
+        gateway_config=GatewayConfig(),
+        model="test/model",
+        task_ids=[task.id],
+        prompt_variant="clear",
+        judge_model="judge-a",
+        judge_affects_score=True,
+        randomize_order=False,
+    )
    different_prompt = BenchmarkHarness(
        gateway_config=GatewayConfig(),
        model="test/model",
@@ -205,6 +214,7 @@ def test_run_cache_path_includes_scoring_inputs(tmp_path: Path):
    assert "v2-" in str(base_path)
    assert base_path == same._run_cache_path(tmp_path, task, 0)
    assert base_path != different_judge._run_cache_path(tmp_path, task, 0)
+    assert base_path != different_judge_gate._run_cache_path(tmp_path, task, 0)
    assert base_path != different_prompt._run_cache_path(tmp_path, task, 0)


--- a/tests/test_queue.py
+++ b/tests/test_queue.py
@@ -19,6 +19,18 @@ def test_submission_request_defaults_to_single_parallel_lane():

    assert request.max_parallel_lanes == 1
    assert request.runs_per_task == 3
+    assert request.judge_affects_score is False
+
+
+def test_submission_request_fingerprint_includes_judge_score_gate():
+    advisory = SubmissionRequest(model="anthropic/claude-sonnet-4-6", judge_model="judge")
+    weighted = SubmissionRequest(
+        model="anthropic/claude-sonnet-4-6",
+        judge_model="judge",
+        judge_affects_score=True,
+    )
+
+    assert advisory.active_fingerprint() != weighted.active_fingerprint()


 def test_save_local_replaces_queue_file_atomically(tmp_path, monkeypatch):
--- a/tests/test_scorer.py
+++ b/tests/test_scorer.py
@@ -29,6 +29,18 @@ def test_combine_run_score_uses_normalized_weighted_average():
    assert combine_run_score(completion=0.5, trajectory=1.0, behavior=1.0) == 0.7778


+def test_combine_run_score_ignores_judge_by_default():
+    advisory_only = combine_run_score(
+        completion=1.0,
+        trajectory=1.0,
+        behavior=1.0,
+        judge=0.0,
+        has_deterministic_verifier=True,
+    )
+
+    assert advisory_only == 1.0
+
+
 def test_combine_run_score_caps_judge_when_deterministic_verifier_present():
    """Per v0.4 spec: semantic quality never rescues failed completion.

@@ -46,6 +58,7 @@ def test_combine_run_score_caps_judge_when_deterministic_verifier_present():
        behavior=1.0,
        judge=1.0,
        has_deterministic_verifier=True,
+        include_judge=True,
    )
    without_judge = combine_run_score(
        completion=0.5,
@@ -65,6 +78,7 @@ def test_combine_run_score_judge_lifts_at_most_10pct_when_deterministic_passes()
        behavior=1.0,
        judge=1.0,
        has_deterministic_verifier=True,
+        include_judge=True,
    )
    assert full == 1.0

@@ -76,18 +90,20 @@ def test_combine_run_score_judge_lifts_at_most_10pct_when_deterministic_passes()
        behavior=1.0,
        judge=0.0,
        has_deterministic_verifier=True,
+        include_judge=True,
    )
    assert abs(lost_judge - 0.9) < 1e-4


 def test_combine_run_score_semantic_only_task_lets_judge_dominate():
-    """When no deterministic verifier exists, the judge is allowed to drive."""
+    """When no deterministic verifier exists, the judge is allowed to drive only when gated on."""
    semantic = combine_run_score(
        completion=0.0,
        trajectory=0.0,
        behavior=0.0,
        judge=1.0,
        has_deterministic_verifier=False,
+        include_judge=True,
    )
    # Judge weight 0.50 out of total 1.0
    assert abs(semantic - 0.5) < 1e-4
Author	SHA1	Message	Date
Vincent Koc	2670dcadf0	fix(scoring): include judge gate in run cache key Some checks failed CI / Python ${{ matrix.python-version }} test suite (3.11) (push) Has been cancelled Details CI / Python ${{ matrix.python-version }} test suite (3.12) (push) Has been cancelled Details	2026-04-28 22:50:06 -07:00
Vincent Koc	fbb13ac4d9	Merge remote-tracking branch 'origin/main' into fix/gate-judge-scoring-ff * origin/main: fix(runtime): harden benchmark cache and task paths fix: flag credential file access in dangerous shell patterns (#6) fix: flag git push --force variants as dangerous shell commands (#5) chore: add open-source contribution scaffolding (#3) fix: strip quoted strings before checking for shell redirect operators (#2)	2026-04-28 22:49:01 -07:00
Vincent Koc	453ddc0ca5	Merge remote-tracking branch 'origin/fix/gate-judge-scoring' into fix/gate-judge-scoring Some checks are pending CI / Python ${{ matrix.python-version }} test suite (3.11) (push) Waiting to run Details CI / Python ${{ matrix.python-version }} test suite (3.12) (push) Waiting to run Details * origin/fix/gate-judge-scoring: fix(scoring): gate judge-weighted scores	2026-04-28 11:37:48 -07:00
Vincent Koc	d7a2e50ea3	fix(scoring): gate judge-weighted scores	2026-04-28 11:36:21 -07:00
Vincent Koc	2b9c277512	fix(scoring): gate judge-weighted scores	2026-04-28 10:54:18 -07:00