From 07eba26f987367f20467b9a5c3f5cbcee215fa35 Mon Sep 17 00:00:00 2001
From: Vincent Koc <vincentkoc@ieee.org>
Date: Tue, 28 Apr 2026 23:07:27 -0700
Subject: [PATCH] test: cover judge score gate propagation

---
 tests/test_cli.py    | 47 +++++++++++++++++++++-
 tests/test_scorer.py | 92 ++++++++++++++++++++++++++++++++------------
 tests/test_worker.py | 48 +++++++++++++++++++++++
 3 files changed, 162 insertions(+), 25 deletions(-)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index 1554399..2f7b712 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,6 +1,51 @@
-from clawbench.cli import SCENARIO_CHOICES
+from click.testing import CliRunner
+
+from clawbench.cli import SCENARIO_CHOICES, cli
 from clawbench.schemas import ScenarioDomain
 
 
 def test_cli_scenario_choices_track_schema_enum():
     assert SCENARIO_CHOICES == [scenario.value for scenario in ScenarioDomain]
+
+
+def test_run_command_forwards_judge_score_gate(monkeypatch, tmp_path):
+    captured: dict[str, object] = {}
+
+    class FakeResult:
+        submission_id = "submission-1"
+
+        def model_dump(self):
+            return {"submission_id": self.submission_id}
+
+    class FakeHarness:
+        def __init__(self, **kwargs):
+            captured.update(kwargs)
+
+        async def run(self):
+            return FakeResult()
+
+    monkeypatch.setattr("clawbench.cli.BenchmarkHarness", FakeHarness)
+
+    output = tmp_path / "result.json"
+    result = CliRunner().invoke(
+        cli,
+        [
+            "run",
+            "--model",
+            "anthropic/claude-sonnet-4-6",
+            "--judge-model",
+            "judge-model",
+            "--judge-affects-score",
+            "--runs",
+            "1",
+            "--task",
+            "t1-bugfix-discount",
+            "--output",
+            str(output),
+        ],
+    )
+
+    assert result.exit_code == 0, result.output
+    assert captured["judge_model"] == "judge-model"
+    assert captured["judge_affects_score"] is True
+    assert output.read_text(encoding="utf-8")
diff --git a/tests/test_scorer.py b/tests/test_scorer.py
index 7e76d6c..f99cdc6 100644
--- a/tests/test_scorer.py
+++ b/tests/test_scorer.py
@@ -1,8 +1,11 @@
+import pytest
+
 from clawbench.scorer import (
     classify_delivery_outcome,
     classify_failure_mode,
     combine_run_score,
     evaluate_behavior,
+    score_task_run,
 )
 from clawbench.schemas import (
     BehaviorExpectations,
@@ -22,6 +25,17 @@ from clawbench.schemas import (
 )
 
 
+def _task_with_user() -> TaskDefinition:
+    return TaskDefinition(
+        id="test-task",
+        name="Test Task",
+        tier=Tier.TIER1,
+        family=TaskFamily.CODING,
+        surface="coding",
+        user=SimulatedUser(turns=[UserTurn(message="Fix it")]),
+    )
+
+
 def test_combine_run_score_uses_normalized_weighted_average():
     assert combine_run_score(completion=1.0, trajectory=1.0, behavior=1.0) == 1.0
     assert combine_run_score(completion=0.0, trajectory=0.0, behavior=0.0) == 0.0
@@ -109,6 +123,57 @@ def test_combine_run_score_semantic_only_task_lets_judge_dominate():
     assert abs(semantic - 0.5) < 1e-4
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    ("judge_affects_score", "expected_score"),
+    [
+        (False, 1.0),
+        (True, 0.9),
+    ],
+)
+async def test_score_task_run_keeps_judge_advisory_until_gate_enabled(
+    monkeypatch,
+    tmp_path,
+    judge_affects_score: bool,
+    expected_score: float,
+):
+    async def fake_verify_completion(*args, **kwargs):
+        return CompletionResult(total_assertions=1, passed_assertions=1, score=1.0)
+
+    async def fake_judge_task_run(*args, **kwargs):
+        from clawbench.schemas import JudgeResult
+
+        return JudgeResult(enabled=True, model="judge-model", score=0.0, passed=False)
+
+    monkeypatch.setattr("clawbench.scorer.verify_completion", fake_verify_completion)
+    monkeypatch.setattr("clawbench.scorer.judge_task_run", fake_judge_task_run)
+    monkeypatch.setattr(
+        "clawbench.scorer.evaluate_trajectory",
+        lambda transcript, expectations: TrajectoryResult(score=1.0),
+    )
+    monkeypatch.setattr(
+        "clawbench.scorer.evaluate_behavior",
+        lambda expectations, transcript: BehaviorResult(score=1.0),
+    )
+
+    result = await score_task_run(
+        task=_task_with_user(),
+        transcript=Transcript(),
+        workspace=tmp_path,
+        client=object(),  # type: ignore[arg-type]
+        session_key="session",
+        agent_id="agent",
+        duration_ms=100,
+        runtime_values={},
+        judge_model="judge-model",
+        judge_affects_score=judge_affects_score,
+    )
+
+    assert result.judge_result.enabled is True
+    assert result.judge_result.score == 0.0
+    assert result.run_score == pytest.approx(expected_score, abs=1e-4)  # weighted float score
+
+
 def test_evaluate_behavior_counts_later_tool_work_as_progress():
     transcript = Transcript(
         messages=[
@@ -130,14 +195,7 @@ def test_evaluate_behavior_counts_later_tool_work_as_progress():
 
 
 def test_classify_failure_mode_flags_hallucinated_completion():
-    task = TaskDefinition(
-        id="test-task",
-        name="Test Task",
-        tier=Tier.TIER1,
-        family=TaskFamily.CODING,
-        surface="coding",
-        user=SimulatedUser(turns=[UserTurn(message="Fix it")]),
-    )
+    task = _task_with_user()
     transcript = Transcript(messages=[TranscriptMessage(role="assistant", text="All done. Tests pass now.")])
     failure_mode = classify_failure_mode(
         task=task,
@@ -152,14 +210,7 @@ def test_classify_failure_mode_flags_hallucinated_completion():
 
 
 def test_classify_failure_mode_prefers_unsafe_mutation():
-    task = TaskDefinition(
-        id="test-task",
-        name="Test Task",
-        tier=Tier.TIER1,
-        family=TaskFamily.CODING,
-        surface="coding",
-        user=SimulatedUser(turns=[UserTurn(message="Fix it")]),
-    )
+    task = _task_with_user()
     failure_mode = classify_failure_mode(
         task=task,
         transcript=Transcript(),
@@ -173,14 +224,7 @@ def test_classify_failure_mode_prefers_unsafe_mutation():
 
 
 def test_classify_delivery_outcome_supports_partial_credit():
-    task = TaskDefinition(
-        id="test-task",
-        name="Test Task",
-        tier=Tier.TIER1,
-        family=TaskFamily.CODING,
-        surface="coding",
-        user=SimulatedUser(turns=[UserTurn(message="Fix it")]),
-    )
+    task = _task_with_user()
 
     assert (
         classify_delivery_outcome(
diff --git a/tests/test_worker.py b/tests/test_worker.py
index 559ac86..faeb883 100644
--- a/tests/test_worker.py
+++ b/tests/test_worker.py
@@ -171,6 +171,54 @@ def test_materialize_lane_runtime_spaces_ports_and_copies_auth(tmp_path: Path, m
     assert (lane1.state_dir / "agents" / "main" / "agent" / "auth-profiles.json").exists()
 
 
+@pytest.mark.asyncio
+async def test_run_serial_benchmark_forwards_judge_score_gate(monkeypatch):
+    queue = JobQueue()
+    worker = EvalWorker(queue)
+    captured: dict[str, object] = {}
+
+    async def fake_ensure_gateway() -> None:
+        return None
+
+    async def fake_preflight_browser_support_for_tasks(*args, **kwargs) -> None:
+        return None
+
+    class FakeHarness:
+        def __init__(self, **kwargs):
+            captured.update(kwargs)
+
+        async def run(self):
+            return SimpleNamespace(submission_id="submission-1")
+
+    monkeypatch.setattr(worker, "_stop_gateway", lambda: None)
+    monkeypatch.setattr(worker, "_ensure_gateway", fake_ensure_gateway)
+    monkeypatch.setattr(worker, "_preflight_browser_support_for_tasks", fake_preflight_browser_support_for_tasks)
+    monkeypatch.setattr("clawbench.worker.BenchmarkHarness", FakeHarness)
+
+    job = SimpleNamespace(
+        request=SimpleNamespace(
+            model="anthropic/claude-sonnet-4-6",
+            provider="anthropic",
+            judge_model="judge-model",
+            judge_affects_score=True,
+            runs_per_task=1,
+            tier="tier1",
+            scenario=None,
+            prompt_variant="clear",
+        )
+    )
+    progress = JobProgressTracker(total_tasks=1, runs_per_task=1, requested_parallel_lanes=1)
+
+    await worker._run_serial_benchmark(
+        job,
+        [DummyTask("t1-bugfix-discount", "tier1", "coding")],
+        progress,
+    )
+
+    assert captured["judge_model"] == "judge-model"
+    assert captured["judge_affects_score"] is True
+
+
 @pytest.mark.asyncio
 async def test_ensure_gateway_closes_parent_log_handle(monkeypatch):
     worker = EvalWorker(JobQueue())