clawbench/tests/test_cli.py
2026-04-28 23:08:58 -07:00

52 lines
1.4 KiB
Python

from click.testing import CliRunner
from clawbench.cli import SCENARIO_CHOICES, cli
from clawbench.schemas import ScenarioDomain
def test_cli_scenario_choices_track_schema_enum():
assert SCENARIO_CHOICES == [scenario.value for scenario in ScenarioDomain]
def test_run_command_forwards_judge_score_gate(monkeypatch, tmp_path):
captured: dict[str, object] = {}
class FakeResult:
submission_id = "submission-1"
def model_dump(self):
return {"submission_id": self.submission_id}
class FakeHarness:
def __init__(self, **kwargs):
captured.update(kwargs)
async def run(self):
return FakeResult()
monkeypatch.setattr("clawbench.cli.BenchmarkHarness", FakeHarness)
output = tmp_path / "result.json"
result = CliRunner().invoke(
cli,
[
"run",
"--model",
"anthropic/claude-sonnet-4-6",
"--judge-model",
"judge-model",
"--judge-affects-score",
"--runs",
"1",
"--task",
"t1-bugfix-discount",
"--output",
str(output),
],
)
assert result.exit_code == 0, result.output
assert captured["judge_model"] == "judge-model"
assert captured["judge_affects_score"] is True
assert output.read_text(encoding="utf-8")