From 07eba26f987367f20467b9a5c3f5cbcee215fa35 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Tue, 28 Apr 2026 23:07:27 -0700 Subject: [PATCH] test: cover judge score gate propagation --- tests/test_cli.py | 47 +++++++++++++++++++++- tests/test_scorer.py | 92 ++++++++++++++++++++++++++++++++------------ tests/test_worker.py | 48 +++++++++++++++++++++++ 3 files changed, 162 insertions(+), 25 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 1554399..2f7b712 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,6 +1,51 @@ -from clawbench.cli import SCENARIO_CHOICES +from click.testing import CliRunner + +from clawbench.cli import SCENARIO_CHOICES, cli from clawbench.schemas import ScenarioDomain def test_cli_scenario_choices_track_schema_enum(): assert SCENARIO_CHOICES == [scenario.value for scenario in ScenarioDomain] + + +def test_run_command_forwards_judge_score_gate(monkeypatch, tmp_path): + captured: dict[str, object] = {} + + class FakeResult: + submission_id = "submission-1" + + def model_dump(self): + return {"submission_id": self.submission_id} + + class FakeHarness: + def __init__(self, **kwargs): + captured.update(kwargs) + + async def run(self): + return FakeResult() + + monkeypatch.setattr("clawbench.cli.BenchmarkHarness", FakeHarness) + + output = tmp_path / "result.json" + result = CliRunner().invoke( + cli, + [ + "run", + "--model", + "anthropic/claude-sonnet-4-6", + "--judge-model", + "judge-model", + "--judge-affects-score", + "--runs", + "1", + "--task", + "t1-bugfix-discount", + "--output", + str(output), + ], + ) + + assert result.exit_code == 0, result.output + assert captured["judge_model"] == "judge-model" + assert captured["judge_affects_score"] is True + assert output.read_text(encoding="utf-8") diff --git a/tests/test_scorer.py b/tests/test_scorer.py index 7e76d6c..f99cdc6 100644 --- a/tests/test_scorer.py +++ b/tests/test_scorer.py @@ -1,8 +1,11 @@ +import pytest + from clawbench.scorer import ( classify_delivery_outcome, classify_failure_mode, combine_run_score, evaluate_behavior, + score_task_run, ) from clawbench.schemas import ( BehaviorExpectations, @@ -22,6 +25,17 @@ from clawbench.schemas import ( ) +def _task_with_user() -> TaskDefinition: + return TaskDefinition( + id="test-task", + name="Test Task", + tier=Tier.TIER1, + family=TaskFamily.CODING, + surface="coding", + user=SimulatedUser(turns=[UserTurn(message="Fix it")]), + ) + + def test_combine_run_score_uses_normalized_weighted_average(): assert combine_run_score(completion=1.0, trajectory=1.0, behavior=1.0) == 1.0 assert combine_run_score(completion=0.0, trajectory=0.0, behavior=0.0) == 0.0 @@ -109,6 +123,57 @@ def test_combine_run_score_semantic_only_task_lets_judge_dominate(): assert abs(semantic - 0.5) < 1e-4 +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("judge_affects_score", "expected_score"), + [ + (False, 1.0), + (True, 0.9), + ], +) +async def test_score_task_run_keeps_judge_advisory_until_gate_enabled( + monkeypatch, + tmp_path, + judge_affects_score: bool, + expected_score: float, +): + async def fake_verify_completion(*args, **kwargs): + return CompletionResult(total_assertions=1, passed_assertions=1, score=1.0) + + async def fake_judge_task_run(*args, **kwargs): + from clawbench.schemas import JudgeResult + + return JudgeResult(enabled=True, model="judge-model", score=0.0, passed=False) + + monkeypatch.setattr("clawbench.scorer.verify_completion", fake_verify_completion) + monkeypatch.setattr("clawbench.scorer.judge_task_run", fake_judge_task_run) + monkeypatch.setattr( + "clawbench.scorer.evaluate_trajectory", + lambda transcript, expectations: TrajectoryResult(score=1.0), + ) + monkeypatch.setattr( + "clawbench.scorer.evaluate_behavior", + lambda expectations, transcript: BehaviorResult(score=1.0), + ) + + result = await score_task_run( + task=_task_with_user(), + transcript=Transcript(), + workspace=tmp_path, + client=object(), # type: ignore[arg-type] + session_key="session", + agent_id="agent", + duration_ms=100, + runtime_values={}, + judge_model="judge-model", + judge_affects_score=judge_affects_score, + ) + + assert result.judge_result.enabled is True + assert result.judge_result.score == 0.0 + assert result.run_score == expected_score + + def test_evaluate_behavior_counts_later_tool_work_as_progress(): transcript = Transcript( messages=[ @@ -130,14 +195,7 @@ def test_evaluate_behavior_counts_later_tool_work_as_progress(): def test_classify_failure_mode_flags_hallucinated_completion(): - task = TaskDefinition( - id="test-task", - name="Test Task", - tier=Tier.TIER1, - family=TaskFamily.CODING, - surface="coding", - user=SimulatedUser(turns=[UserTurn(message="Fix it")]), - ) + task = _task_with_user() transcript = Transcript(messages=[TranscriptMessage(role="assistant", text="All done. Tests pass now.")]) failure_mode = classify_failure_mode( task=task, @@ -152,14 +210,7 @@ def test_classify_failure_mode_flags_hallucinated_completion(): def test_classify_failure_mode_prefers_unsafe_mutation(): - task = TaskDefinition( - id="test-task", - name="Test Task", - tier=Tier.TIER1, - family=TaskFamily.CODING, - surface="coding", - user=SimulatedUser(turns=[UserTurn(message="Fix it")]), - ) + task = _task_with_user() failure_mode = classify_failure_mode( task=task, transcript=Transcript(), @@ -173,14 +224,7 @@ def test_classify_failure_mode_prefers_unsafe_mutation(): def test_classify_delivery_outcome_supports_partial_credit(): - task = TaskDefinition( - id="test-task", - name="Test Task", - tier=Tier.TIER1, - family=TaskFamily.CODING, - surface="coding", - user=SimulatedUser(turns=[UserTurn(message="Fix it")]), - ) + task = _task_with_user() assert ( classify_delivery_outcome( diff --git a/tests/test_worker.py b/tests/test_worker.py index 559ac86..faeb883 100644 --- a/tests/test_worker.py +++ b/tests/test_worker.py @@ -171,6 +171,54 @@ def test_materialize_lane_runtime_spaces_ports_and_copies_auth(tmp_path: Path, m assert (lane1.state_dir / "agents" / "main" / "agent" / "auth-profiles.json").exists() +@pytest.mark.asyncio +async def test_run_serial_benchmark_forwards_judge_score_gate(monkeypatch): + queue = JobQueue() + worker = EvalWorker(queue) + captured: dict[str, object] = {} + + async def fake_ensure_gateway() -> None: + return None + + async def fake_preflight_browser_support_for_tasks(*args, **kwargs) -> None: + return None + + class FakeHarness: + def __init__(self, **kwargs): + captured.update(kwargs) + + async def run(self): + return SimpleNamespace(submission_id="submission-1") + + monkeypatch.setattr(worker, "_stop_gateway", lambda: None) + monkeypatch.setattr(worker, "_ensure_gateway", fake_ensure_gateway) + monkeypatch.setattr(worker, "_preflight_browser_support_for_tasks", fake_preflight_browser_support_for_tasks) + monkeypatch.setattr("clawbench.worker.BenchmarkHarness", FakeHarness) + + job = SimpleNamespace( + request=SimpleNamespace( + model="anthropic/claude-sonnet-4-6", + provider="anthropic", + judge_model="judge-model", + judge_affects_score=True, + runs_per_task=1, + tier="tier1", + scenario=None, + prompt_variant="clear", + ) + ) + progress = JobProgressTracker(total_tasks=1, runs_per_task=1, requested_parallel_lanes=1) + + await worker._run_serial_benchmark( + job, + [DummyTask("t1-bugfix-discount", "tier1", "coding")], + progress, + ) + + assert captured["judge_model"] == "judge-model" + assert captured["judge_affects_score"] is True + + @pytest.mark.asyncio async def test_ensure_gateway_closes_parent_log_handle(monkeypatch): worker = EvalWorker(JobQueue())