Compare commits

...

1 Commits

Author SHA1 Message Date
Vincent Koc
07eba26f98
test: cover judge score gate propagation
Some checks failed
CI / Python ${{ matrix.python-version }} test suite (3.11) (push) Has been cancelled
CI / Python ${{ matrix.python-version }} test suite (3.12) (push) Has been cancelled
2026-04-28 23:07:27 -07:00
3 changed files with 162 additions and 25 deletions

View File

@ -1,6 +1,51 @@
from clawbench.cli import SCENARIO_CHOICES
from click.testing import CliRunner
from clawbench.cli import SCENARIO_CHOICES, cli
from clawbench.schemas import ScenarioDomain
def test_cli_scenario_choices_track_schema_enum():
    """SCENARIO_CHOICES must equal the ScenarioDomain values, in enum order."""
    enum_values = []
    for domain in ScenarioDomain:
        enum_values.append(domain.value)

    assert SCENARIO_CHOICES == enum_values
def test_run_command_forwards_judge_score_gate(monkeypatch, tmp_path):
    """Check that ``run`` passes the judge options through to the harness.

    ``clawbench.cli.BenchmarkHarness`` is replaced with a stand-in that records
    the keyword arguments it is constructed with and whose ``run()`` returns a
    canned result object.
    """
    harness_kwargs: dict[str, object] = {}

    class FakeResult:
        submission_id = "submission-1"

        def model_dump(self):
            return {"submission_id": self.submission_id}

    class FakeHarness:
        def __init__(self, **kwargs):
            # Record everything the CLI hands to the harness constructor.
            harness_kwargs.update(kwargs)

        async def run(self):
            return FakeResult()

    monkeypatch.setattr("clawbench.cli.BenchmarkHarness", FakeHarness)

    result_path = tmp_path / "result.json"
    argv = ["run"]
    argv += ["--model", "anthropic/claude-sonnet-4-6"]
    argv += ["--judge-model", "judge-model"]
    argv += ["--judge-affects-score"]
    argv += ["--runs", "1"]
    argv += ["--task", "t1-bugfix-discount"]
    argv += ["--output", str(result_path)]

    outcome = CliRunner().invoke(cli, argv)

    assert outcome.exit_code == 0, outcome.output
    assert harness_kwargs["judge_model"] == "judge-model"
    assert harness_kwargs["judge_affects_score"] is True
    # The command is expected to have written a non-empty output file.
    assert result_path.read_text(encoding="utf-8")

View File

@ -1,8 +1,11 @@
import pytest
from clawbench.scorer import (
classify_delivery_outcome,
classify_failure_mode,
combine_run_score,
evaluate_behavior,
score_task_run,
)
from clawbench.schemas import (
BehaviorExpectations,
@ -22,6 +25,17 @@ from clawbench.schemas import (
)
def _task_with_user() -> TaskDefinition:
    """Build a tier-1 coding task with a single simulated user turn."""
    single_turn_user = SimulatedUser(turns=[UserTurn(message="Fix it")])
    task_fields = {
        "id": "test-task",
        "name": "Test Task",
        "tier": Tier.TIER1,
        "family": TaskFamily.CODING,
        "surface": "coding",
        "user": single_turn_user,
    }
    return TaskDefinition(**task_fields)
def test_combine_run_score_uses_normalized_weighted_average():
assert combine_run_score(completion=1.0, trajectory=1.0, behavior=1.0) == 1.0
assert combine_run_score(completion=0.0, trajectory=0.0, behavior=0.0) == 0.0
@ -109,6 +123,57 @@ def test_combine_run_score_semantic_only_task_lets_judge_dominate():
assert abs(semantic - 0.5) < 1e-4
@pytest.mark.asyncio
@pytest.mark.parametrize(
    ("judge_affects_score", "expected_score"),
    [
        (False, 1.0),
        (True, 0.9),
    ],
)
async def test_score_task_run_keeps_judge_advisory_until_gate_enabled(
    monkeypatch,
    tmp_path,
    judge_affects_score: bool,
    expected_score: float,
):
    """Check that a failing judge only changes ``run_score`` when the gate is on.

    Completion, trajectory, and behavior are all stubbed to score 1.0 and the
    judge is stubbed to score 0.0. The parametrized expectation is a run score
    of 1.0 with ``judge_affects_score=False`` and 0.9 with it set to ``True``;
    the judge result itself must be reported in both cases.
    """

    async def fake_verify_completion(*args, **kwargs):
        return CompletionResult(total_assertions=1, passed_assertions=1, score=1.0)

    async def fake_judge_task_run(*args, **kwargs):
        from clawbench.schemas import JudgeResult

        return JudgeResult(enabled=True, model="judge-model", score=0.0, passed=False)

    monkeypatch.setattr("clawbench.scorer.verify_completion", fake_verify_completion)
    monkeypatch.setattr("clawbench.scorer.judge_task_run", fake_judge_task_run)
    monkeypatch.setattr(
        "clawbench.scorer.evaluate_trajectory",
        lambda transcript, expectations: TrajectoryResult(score=1.0),
    )
    monkeypatch.setattr(
        "clawbench.scorer.evaluate_behavior",
        lambda expectations, transcript: BehaviorResult(score=1.0),
    )

    result = await score_task_run(
        task=_task_with_user(),
        transcript=Transcript(),
        workspace=tmp_path,
        client=object(),  # type: ignore[arg-type]
        session_key="session",
        agent_id="agent",
        duration_ms=100,
        runtime_values={},
        judge_model="judge-model",
        judge_affects_score=judge_affects_score,
    )

    assert result.judge_result.enabled is True
    assert result.judge_result.score == 0.0
    # The run score is a computed float, so compare with a tolerance rather
    # than exact equality (same 1e-4 tolerance used by the other score tests).
    assert result.run_score == pytest.approx(expected_score, abs=1e-4)
def test_evaluate_behavior_counts_later_tool_work_as_progress():
transcript = Transcript(
messages=[
@ -130,14 +195,7 @@ def test_evaluate_behavior_counts_later_tool_work_as_progress():
def test_classify_failure_mode_flags_hallucinated_completion():
task = TaskDefinition(
id="test-task",
name="Test Task",
tier=Tier.TIER1,
family=TaskFamily.CODING,
surface="coding",
user=SimulatedUser(turns=[UserTurn(message="Fix it")]),
)
task = _task_with_user()
transcript = Transcript(messages=[TranscriptMessage(role="assistant", text="All done. Tests pass now.")])
failure_mode = classify_failure_mode(
task=task,
@ -152,14 +210,7 @@ def test_classify_failure_mode_flags_hallucinated_completion():
def test_classify_failure_mode_prefers_unsafe_mutation():
task = TaskDefinition(
id="test-task",
name="Test Task",
tier=Tier.TIER1,
family=TaskFamily.CODING,
surface="coding",
user=SimulatedUser(turns=[UserTurn(message="Fix it")]),
)
task = _task_with_user()
failure_mode = classify_failure_mode(
task=task,
transcript=Transcript(),
@ -173,14 +224,7 @@ def test_classify_failure_mode_prefers_unsafe_mutation():
def test_classify_delivery_outcome_supports_partial_credit():
task = TaskDefinition(
id="test-task",
name="Test Task",
tier=Tier.TIER1,
family=TaskFamily.CODING,
surface="coding",
user=SimulatedUser(turns=[UserTurn(message="Fix it")]),
)
task = _task_with_user()
assert (
classify_delivery_outcome(

View File

@ -171,6 +171,54 @@ def test_materialize_lane_runtime_spaces_ports_and_copies_auth(tmp_path: Path, m
assert (lane1.state_dir / "agents" / "main" / "agent" / "auth-profiles.json").exists()
@pytest.mark.asyncio
async def test_run_serial_benchmark_forwards_judge_score_gate(monkeypatch):
    """Check that the serial benchmark path passes judge options to the harness.

    The worker's gateway and browser-preflight hooks are replaced with no-ops,
    and ``clawbench.worker.BenchmarkHarness`` is replaced with a stand-in that
    records its constructor keyword arguments.
    """
    harness_kwargs: dict[str, object] = {}
    worker = EvalWorker(JobQueue())

    def fake_stop_gateway():
        return None

    async def fake_ensure_gateway() -> None:
        return None

    async def fake_preflight_browser_support_for_tasks(*args, **kwargs) -> None:
        return None

    class FakeHarness:
        def __init__(self, **kwargs):
            # Record everything the worker hands to the harness constructor.
            harness_kwargs.update(kwargs)

        async def run(self):
            return SimpleNamespace(submission_id="submission-1")

    worker_patches = {
        "_stop_gateway": fake_stop_gateway,
        "_ensure_gateway": fake_ensure_gateway,
        "_preflight_browser_support_for_tasks": fake_preflight_browser_support_for_tasks,
    }
    for attr_name, replacement in worker_patches.items():
        monkeypatch.setattr(worker, attr_name, replacement)
    monkeypatch.setattr("clawbench.worker.BenchmarkHarness", FakeHarness)

    request = SimpleNamespace(
        model="anthropic/claude-sonnet-4-6",
        provider="anthropic",
        judge_model="judge-model",
        judge_affects_score=True,
        runs_per_task=1,
        tier="tier1",
        scenario=None,
        prompt_variant="clear",
    )
    job = SimpleNamespace(request=request)
    tasks = [DummyTask("t1-bugfix-discount", "tier1", "coding")]
    tracker = JobProgressTracker(total_tasks=1, runs_per_task=1, requested_parallel_lanes=1)

    await worker._run_serial_benchmark(job, tasks, tracker)

    assert harness_kwargs["judge_model"] == "judge-model"
    assert harness_kwargs["judge_affects_score"] is True
@pytest.mark.asyncio
async def test_ensure_gateway_closes_parent_log_handle(monkeypatch):
worker = EvalWorker(JobQueue())