Compare commits
1 Commits
main
...
test/exten
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
07eba26f98 |
@ -1,6 +1,51 @@
|
|||||||
from clawbench.cli import SCENARIO_CHOICES
|
from click.testing import CliRunner
|
||||||
|
|
||||||
|
from clawbench.cli import SCENARIO_CHOICES, cli
|
||||||
from clawbench.schemas import ScenarioDomain
|
from clawbench.schemas import ScenarioDomain
|
||||||
|
|
||||||
|
|
||||||
def test_cli_scenario_choices_track_schema_enum():
|
def test_cli_scenario_choices_track_schema_enum():
|
||||||
assert SCENARIO_CHOICES == [scenario.value for scenario in ScenarioDomain]
|
assert SCENARIO_CHOICES == [scenario.value for scenario in ScenarioDomain]
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_command_forwards_judge_score_gate(monkeypatch, tmp_path):
|
||||||
|
captured: dict[str, object] = {}
|
||||||
|
|
||||||
|
class FakeResult:
|
||||||
|
submission_id = "submission-1"
|
||||||
|
|
||||||
|
def model_dump(self):
|
||||||
|
return {"submission_id": self.submission_id}
|
||||||
|
|
||||||
|
class FakeHarness:
|
||||||
|
def __init__(self, **kwargs):
|
||||||
|
captured.update(kwargs)
|
||||||
|
|
||||||
|
async def run(self):
|
||||||
|
return FakeResult()
|
||||||
|
|
||||||
|
monkeypatch.setattr("clawbench.cli.BenchmarkHarness", FakeHarness)
|
||||||
|
|
||||||
|
output = tmp_path / "result.json"
|
||||||
|
result = CliRunner().invoke(
|
||||||
|
cli,
|
||||||
|
[
|
||||||
|
"run",
|
||||||
|
"--model",
|
||||||
|
"anthropic/claude-sonnet-4-6",
|
||||||
|
"--judge-model",
|
||||||
|
"judge-model",
|
||||||
|
"--judge-affects-score",
|
||||||
|
"--runs",
|
||||||
|
"1",
|
||||||
|
"--task",
|
||||||
|
"t1-bugfix-discount",
|
||||||
|
"--output",
|
||||||
|
str(output),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result.exit_code == 0, result.output
|
||||||
|
assert captured["judge_model"] == "judge-model"
|
||||||
|
assert captured["judge_affects_score"] is True
|
||||||
|
assert output.read_text(encoding="utf-8")
|
||||||
|
|||||||
@ -1,8 +1,11 @@
|
|||||||
|
import pytest
|
||||||
|
|
||||||
from clawbench.scorer import (
|
from clawbench.scorer import (
|
||||||
classify_delivery_outcome,
|
classify_delivery_outcome,
|
||||||
classify_failure_mode,
|
classify_failure_mode,
|
||||||
combine_run_score,
|
combine_run_score,
|
||||||
evaluate_behavior,
|
evaluate_behavior,
|
||||||
|
score_task_run,
|
||||||
)
|
)
|
||||||
from clawbench.schemas import (
|
from clawbench.schemas import (
|
||||||
BehaviorExpectations,
|
BehaviorExpectations,
|
||||||
@ -22,6 +25,17 @@ from clawbench.schemas import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _task_with_user() -> TaskDefinition:
|
||||||
|
return TaskDefinition(
|
||||||
|
id="test-task",
|
||||||
|
name="Test Task",
|
||||||
|
tier=Tier.TIER1,
|
||||||
|
family=TaskFamily.CODING,
|
||||||
|
surface="coding",
|
||||||
|
user=SimulatedUser(turns=[UserTurn(message="Fix it")]),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_combine_run_score_uses_normalized_weighted_average():
|
def test_combine_run_score_uses_normalized_weighted_average():
|
||||||
assert combine_run_score(completion=1.0, trajectory=1.0, behavior=1.0) == 1.0
|
assert combine_run_score(completion=1.0, trajectory=1.0, behavior=1.0) == 1.0
|
||||||
assert combine_run_score(completion=0.0, trajectory=0.0, behavior=0.0) == 0.0
|
assert combine_run_score(completion=0.0, trajectory=0.0, behavior=0.0) == 0.0
|
||||||
@ -109,6 +123,57 @@ def test_combine_run_score_semantic_only_task_lets_judge_dominate():
|
|||||||
assert abs(semantic - 0.5) < 1e-4
|
assert abs(semantic - 0.5) < 1e-4
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("judge_affects_score", "expected_score"),
|
||||||
|
[
|
||||||
|
(False, 1.0),
|
||||||
|
(True, 0.9),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
async def test_score_task_run_keeps_judge_advisory_until_gate_enabled(
|
||||||
|
monkeypatch,
|
||||||
|
tmp_path,
|
||||||
|
judge_affects_score: bool,
|
||||||
|
expected_score: float,
|
||||||
|
):
|
||||||
|
async def fake_verify_completion(*args, **kwargs):
|
||||||
|
return CompletionResult(total_assertions=1, passed_assertions=1, score=1.0)
|
||||||
|
|
||||||
|
async def fake_judge_task_run(*args, **kwargs):
|
||||||
|
from clawbench.schemas import JudgeResult
|
||||||
|
|
||||||
|
return JudgeResult(enabled=True, model="judge-model", score=0.0, passed=False)
|
||||||
|
|
||||||
|
monkeypatch.setattr("clawbench.scorer.verify_completion", fake_verify_completion)
|
||||||
|
monkeypatch.setattr("clawbench.scorer.judge_task_run", fake_judge_task_run)
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"clawbench.scorer.evaluate_trajectory",
|
||||||
|
lambda transcript, expectations: TrajectoryResult(score=1.0),
|
||||||
|
)
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"clawbench.scorer.evaluate_behavior",
|
||||||
|
lambda expectations, transcript: BehaviorResult(score=1.0),
|
||||||
|
)
|
||||||
|
|
||||||
|
result = await score_task_run(
|
||||||
|
task=_task_with_user(),
|
||||||
|
transcript=Transcript(),
|
||||||
|
workspace=tmp_path,
|
||||||
|
client=object(), # type: ignore[arg-type]
|
||||||
|
session_key="session",
|
||||||
|
agent_id="agent",
|
||||||
|
duration_ms=100,
|
||||||
|
runtime_values={},
|
||||||
|
judge_model="judge-model",
|
||||||
|
judge_affects_score=judge_affects_score,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result.judge_result.enabled is True
|
||||||
|
assert result.judge_result.score == 0.0
|
||||||
|
assert result.run_score == expected_score
|
||||||
|
|
||||||
|
|
||||||
def test_evaluate_behavior_counts_later_tool_work_as_progress():
|
def test_evaluate_behavior_counts_later_tool_work_as_progress():
|
||||||
transcript = Transcript(
|
transcript = Transcript(
|
||||||
messages=[
|
messages=[
|
||||||
@ -130,14 +195,7 @@ def test_evaluate_behavior_counts_later_tool_work_as_progress():
|
|||||||
|
|
||||||
|
|
||||||
def test_classify_failure_mode_flags_hallucinated_completion():
|
def test_classify_failure_mode_flags_hallucinated_completion():
|
||||||
task = TaskDefinition(
|
task = _task_with_user()
|
||||||
id="test-task",
|
|
||||||
name="Test Task",
|
|
||||||
tier=Tier.TIER1,
|
|
||||||
family=TaskFamily.CODING,
|
|
||||||
surface="coding",
|
|
||||||
user=SimulatedUser(turns=[UserTurn(message="Fix it")]),
|
|
||||||
)
|
|
||||||
transcript = Transcript(messages=[TranscriptMessage(role="assistant", text="All done. Tests pass now.")])
|
transcript = Transcript(messages=[TranscriptMessage(role="assistant", text="All done. Tests pass now.")])
|
||||||
failure_mode = classify_failure_mode(
|
failure_mode = classify_failure_mode(
|
||||||
task=task,
|
task=task,
|
||||||
@ -152,14 +210,7 @@ def test_classify_failure_mode_flags_hallucinated_completion():
|
|||||||
|
|
||||||
|
|
||||||
def test_classify_failure_mode_prefers_unsafe_mutation():
|
def test_classify_failure_mode_prefers_unsafe_mutation():
|
||||||
task = TaskDefinition(
|
task = _task_with_user()
|
||||||
id="test-task",
|
|
||||||
name="Test Task",
|
|
||||||
tier=Tier.TIER1,
|
|
||||||
family=TaskFamily.CODING,
|
|
||||||
surface="coding",
|
|
||||||
user=SimulatedUser(turns=[UserTurn(message="Fix it")]),
|
|
||||||
)
|
|
||||||
failure_mode = classify_failure_mode(
|
failure_mode = classify_failure_mode(
|
||||||
task=task,
|
task=task,
|
||||||
transcript=Transcript(),
|
transcript=Transcript(),
|
||||||
@ -173,14 +224,7 @@ def test_classify_failure_mode_prefers_unsafe_mutation():
|
|||||||
|
|
||||||
|
|
||||||
def test_classify_delivery_outcome_supports_partial_credit():
|
def test_classify_delivery_outcome_supports_partial_credit():
|
||||||
task = TaskDefinition(
|
task = _task_with_user()
|
||||||
id="test-task",
|
|
||||||
name="Test Task",
|
|
||||||
tier=Tier.TIER1,
|
|
||||||
family=TaskFamily.CODING,
|
|
||||||
surface="coding",
|
|
||||||
user=SimulatedUser(turns=[UserTurn(message="Fix it")]),
|
|
||||||
)
|
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
classify_delivery_outcome(
|
classify_delivery_outcome(
|
||||||
|
|||||||
@ -171,6 +171,54 @@ def test_materialize_lane_runtime_spaces_ports_and_copies_auth(tmp_path: Path, m
|
|||||||
assert (lane1.state_dir / "agents" / "main" / "agent" / "auth-profiles.json").exists()
|
assert (lane1.state_dir / "agents" / "main" / "agent" / "auth-profiles.json").exists()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_run_serial_benchmark_forwards_judge_score_gate(monkeypatch):
|
||||||
|
queue = JobQueue()
|
||||||
|
worker = EvalWorker(queue)
|
||||||
|
captured: dict[str, object] = {}
|
||||||
|
|
||||||
|
async def fake_ensure_gateway() -> None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def fake_preflight_browser_support_for_tasks(*args, **kwargs) -> None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
class FakeHarness:
|
||||||
|
def __init__(self, **kwargs):
|
||||||
|
captured.update(kwargs)
|
||||||
|
|
||||||
|
async def run(self):
|
||||||
|
return SimpleNamespace(submission_id="submission-1")
|
||||||
|
|
||||||
|
monkeypatch.setattr(worker, "_stop_gateway", lambda: None)
|
||||||
|
monkeypatch.setattr(worker, "_ensure_gateway", fake_ensure_gateway)
|
||||||
|
monkeypatch.setattr(worker, "_preflight_browser_support_for_tasks", fake_preflight_browser_support_for_tasks)
|
||||||
|
monkeypatch.setattr("clawbench.worker.BenchmarkHarness", FakeHarness)
|
||||||
|
|
||||||
|
job = SimpleNamespace(
|
||||||
|
request=SimpleNamespace(
|
||||||
|
model="anthropic/claude-sonnet-4-6",
|
||||||
|
provider="anthropic",
|
||||||
|
judge_model="judge-model",
|
||||||
|
judge_affects_score=True,
|
||||||
|
runs_per_task=1,
|
||||||
|
tier="tier1",
|
||||||
|
scenario=None,
|
||||||
|
prompt_variant="clear",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
progress = JobProgressTracker(total_tasks=1, runs_per_task=1, requested_parallel_lanes=1)
|
||||||
|
|
||||||
|
await worker._run_serial_benchmark(
|
||||||
|
job,
|
||||||
|
[DummyTask("t1-bugfix-discount", "tier1", "coding")],
|
||||||
|
progress,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert captured["judge_model"] == "judge-model"
|
||||||
|
assert captured["judge_affects_score"] is True
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_ensure_gateway_closes_parent_log_handle(monkeypatch):
|
async def test_ensure_gateway_closes_parent_log_handle(monkeypatch):
|
||||||
worker = EvalWorker(JobQueue())
|
worker = EvalWorker(JobQueue())
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user