test: cover judge score gate propagation

2026-04-28 23:07:27 -07:00
3 changed files with 162 additions and 25 deletions
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@ -1,6 +1,51 @@
-from clawbench.cli import SCENARIO_CHOICES
+from click.testing import CliRunner
 from clawbench.cli import SCENARIO_CHOICES, cli
 from clawbench.schemas import ScenarioDomain
 def test_cli_scenario_choices_track_schema_enum():
    """SCENARIO_CHOICES must equal the ScenarioDomain values, in enum order."""
    enum_values = [domain.value for domain in ScenarioDomain]
    assert SCENARIO_CHOICES == enum_values
 def test_run_command_forwards_judge_score_gate(monkeypatch, tmp_path):
    """`run` must pass the judge model and the score-gate flag to the harness.

    BenchmarkHarness is replaced by a stub that records the keyword
    arguments it was constructed with, so no real benchmark is executed.
    """
    harness_kwargs: dict[str, object] = {}

    class FakeResult:
        submission_id = "submission-1"

        def model_dump(self):
            return {"submission_id": self.submission_id}

    class FakeHarness:
        def __init__(self, **kwargs):
            # Record what the CLI passed so the test can inspect it afterwards.
            harness_kwargs.update(kwargs)

        async def run(self):
            return FakeResult()

    monkeypatch.setattr("clawbench.cli.BenchmarkHarness", FakeHarness)

    result_path = tmp_path / "result.json"
    argv = [
        "run",
        "--model",
        "anthropic/claude-sonnet-4-6",
        "--judge-model",
        "judge-model",
        "--judge-affects-score",
        "--runs",
        "1",
        "--task",
        "t1-bugfix-discount",
        "--output",
        str(result_path),
    ]
    invocation = CliRunner().invoke(cli, argv)

    # Surface the CLI output in the failure message if the command errored.
    assert invocation.exit_code == 0, invocation.output
    assert harness_kwargs["judge_model"] == "judge-model"
    assert harness_kwargs["judge_affects_score"] is True
    # The command must have written a non-empty result file.
    assert result_path.read_text(encoding="utf-8")
--- a/tests/test_scorer.py
+++ b/tests/test_scorer.py
@ -1,8 +1,11 @@
 import pytest
 from clawbench.scorer import (
    classify_delivery_outcome,
    classify_failure_mode,
    combine_run_score,
    evaluate_behavior,
    score_task_run,
 )
 from clawbench.schemas import (
    BehaviorExpectations,
@ -22,6 +25,17 @@ from clawbench.schemas import (
 )
 def _task_with_user() -> TaskDefinition:
    """Build a tier-1 coding task whose simulated user sends one "Fix it" turn."""
    single_turn_user = SimulatedUser(turns=[UserTurn(message="Fix it")])
    return TaskDefinition(
        id="test-task",
        name="Test Task",
        tier=Tier.TIER1,
        family=TaskFamily.CODING,
        surface="coding",
        user=single_turn_user,
    )
 def test_combine_run_score_uses_normalized_weighted_average():
    assert combine_run_score(completion=1.0, trajectory=1.0, behavior=1.0) == 1.0
    assert combine_run_score(completion=0.0, trajectory=0.0, behavior=0.0) == 0.0
@ -109,6 +123,57 @@ def test_combine_run_score_semantic_only_task_lets_judge_dominate():
    assert abs(semantic - 0.5) < 1e-4
@pytest.mark.asyncio
@pytest.mark.parametrize(
    ("judge_affects_score", "expected_score"),
    [
        (False, 1.0),
        (True, 0.9),
    ],
 )
 async def test_score_task_run_keeps_judge_advisory_until_gate_enabled(
    monkeypatch,
    tmp_path,
    judge_affects_score: bool,
    expected_score: float,
 ):
    async def fake_verify_completion(*args, **kwargs):
        return CompletionResult(total_assertions=1, passed_assertions=1, score=1.0)
    async def fake_judge_task_run(*args, **kwargs):
        from clawbench.schemas import JudgeResult
        return JudgeResult(enabled=True, model="judge-model", score=0.0, passed=False)
    monkeypatch.setattr("clawbench.scorer.verify_completion", fake_verify_completion)
    monkeypatch.setattr("clawbench.scorer.judge_task_run", fake_judge_task_run)
    monkeypatch.setattr(
        "clawbench.scorer.evaluate_trajectory",
        lambda transcript, expectations: TrajectoryResult(score=1.0),
    )
    monkeypatch.setattr(
        "clawbench.scorer.evaluate_behavior",
        lambda expectations, transcript: BehaviorResult(score=1.0),
    )
    result = await score_task_run(
        task=_task_with_user(),
        transcript=Transcript(),
        workspace=tmp_path,
        client=object(),  # type: ignore[arg-type]
        session_key="session",
        agent_id="agent",
        duration_ms=100,
        runtime_values={},
        judge_model="judge-model",
        judge_affects_score=judge_affects_score,
    )
    assert result.judge_result.enabled is True
    assert result.judge_result.score == 0.0
    assert result.run_score == expected_score
 def test_evaluate_behavior_counts_later_tool_work_as_progress():
    transcript = Transcript(
        messages=[
@ -130,14 +195,7 @@ def test_evaluate_behavior_counts_later_tool_work_as_progress():
 def test_classify_failure_mode_flags_hallucinated_completion():
-    task = TaskDefinition(
+    task = _task_with_user()
        id="test-task",
        name="Test Task",
        tier=Tier.TIER1,
        family=TaskFamily.CODING,
        surface="coding",
        user=SimulatedUser(turns=[UserTurn(message="Fix it")]),
    )
    transcript = Transcript(messages=[TranscriptMessage(role="assistant", text="All done. Tests pass now.")])
    failure_mode = classify_failure_mode(
        task=task,
@ -152,14 +210,7 @@ def test_classify_failure_mode_flags_hallucinated_completion():
 def test_classify_failure_mode_prefers_unsafe_mutation():
-    task = TaskDefinition(
+    task = _task_with_user()
        id="test-task",
        name="Test Task",
        tier=Tier.TIER1,
        family=TaskFamily.CODING,
        surface="coding",
        user=SimulatedUser(turns=[UserTurn(message="Fix it")]),
    )
    failure_mode = classify_failure_mode(
        task=task,
        transcript=Transcript(),
@ -173,14 +224,7 @@ def test_classify_failure_mode_prefers_unsafe_mutation():
 def test_classify_delivery_outcome_supports_partial_credit():
-    task = TaskDefinition(
+    task = _task_with_user()
        id="test-task",
        name="Test Task",
        tier=Tier.TIER1,
        family=TaskFamily.CODING,
        surface="coding",
        user=SimulatedUser(turns=[UserTurn(message="Fix it")]),
    )
    assert (
        classify_delivery_outcome(
--- a/tests/test_worker.py
+++ b/tests/test_worker.py
@ -171,6 +171,54 @@ def test_materialize_lane_runtime_spaces_ports_and_copies_auth(tmp_path: Path, m
    assert (lane1.state_dir / "agents" / "main" / "agent" / "auth-profiles.json").exists()
@pytest.mark.asyncio
 async def test_run_serial_benchmark_forwards_judge_score_gate(monkeypatch):
    queue = JobQueue()
    worker = EvalWorker(queue)
    captured: dict[str, object] = {}
    async def fake_ensure_gateway() -> None:
        return None
    async def fake_preflight_browser_support_for_tasks(*args, **kwargs) -> None:
        return None
    class FakeHarness:
        def __init__(self, **kwargs):
            captured.update(kwargs)
        async def run(self):
            return SimpleNamespace(submission_id="submission-1")
    monkeypatch.setattr(worker, "_stop_gateway", lambda: None)
    monkeypatch.setattr(worker, "_ensure_gateway", fake_ensure_gateway)
    monkeypatch.setattr(worker, "_preflight_browser_support_for_tasks", fake_preflight_browser_support_for_tasks)
    monkeypatch.setattr("clawbench.worker.BenchmarkHarness", FakeHarness)
    job = SimpleNamespace(
        request=SimpleNamespace(
            model="anthropic/claude-sonnet-4-6",
            provider="anthropic",
            judge_model="judge-model",
            judge_affects_score=True,
            runs_per_task=1,
            tier="tier1",
            scenario=None,
            prompt_variant="clear",
        )
    )
    progress = JobProgressTracker(total_tasks=1, runs_per_task=1, requested_parallel_lanes=1)
    await worker._run_serial_benchmark(
        job,
        [DummyTask("t1-bugfix-discount", "tier1", "coding")],
        progress,
    )
    assert captured["judge_model"] == "judge-model"
    assert captured["judge_affects_score"] is True
@pytest.mark.asyncio
 async def test_ensure_gateway_closes_parent_log_handle(monkeypatch):
    worker = EvalWorker(JobQueue())