52 lines
1.4 KiB
Python
52 lines
1.4 KiB
Python
from click.testing import CliRunner
|
|
|
|
from clawbench.cli import SCENARIO_CHOICES, cli
|
|
from clawbench.schemas import ScenarioDomain
|
|
|
|
|
|
def test_cli_scenario_choices_track_schema_enum():
    """SCENARIO_CHOICES must equal the ScenarioDomain values, in enum order."""
    # Built by iterating the enum, so the comparison is order-sensitive.
    expected_choices = [domain.value for domain in ScenarioDomain]
    assert SCENARIO_CHOICES == expected_choices
|
|
|
|
|
|
def test_run_command_forwards_judge_score_gate(monkeypatch, tmp_path):
    """Check that ``run`` passes the judge options through to the harness.

    ``clawbench.cli.BenchmarkHarness`` is swapped for a stub that records the
    keyword arguments it is constructed with. The command is then invoked with
    ``--judge-model`` and ``--judge-affects-score``, and the test asserts that:

    * the command exits with code 0,
    * the stub received ``judge_model == "judge-model"``,
    * the stub received ``judge_affects_score is True``,
    * the file given via ``--output`` has non-empty text content afterwards.
    """
    # Keyword arguments the stub harness was constructed with.
    harness_kwargs: dict[str, object] = {}

    class StubResult:
        submission_id = "submission-1"

        def model_dump(self):
            return dict(submission_id=self.submission_id)

    class StubHarness:
        def __init__(self, **kwargs):
            harness_kwargs.update(kwargs)

        async def run(self):
            return StubResult()

    monkeypatch.setattr("clawbench.cli.BenchmarkHarness", StubHarness)

    result_path = tmp_path / "result.json"
    # Each option is kept next to its value for readability.
    argv = [
        "run",
        "--model", "anthropic/claude-sonnet-4-6",
        "--judge-model", "judge-model",
        "--judge-affects-score",
        "--runs", "1",
        "--task", "t1-bugfix-discount",
        "--output", str(result_path),
    ]
    outcome = CliRunner().invoke(cli, argv)

    # The captured CLI output is the assertion message, so failures show it.
    assert outcome.exit_code == 0, outcome.output
    assert harness_kwargs["judge_model"] == "judge-model"
    assert harness_kwargs["judge_affects_score"] is True
    assert result_path.read_text(encoding="utf-8")
|