clawbench/tests/test_harness.py

218 lines
7.5 KiB
Python

from pathlib import Path
import pytest
from clawbench.client import GatewayConfig
from clawbench.harness import BenchmarkHarness
from clawbench.schemas import CompletionResult, JudgeResult, TaskRunResult
from clawbench.tasks import load_all_tasks
class FakeGatewayClient:
def __init__(self) -> None:
self.create_agent_calls: list[tuple[str, str]] = []
async def create_agent(self, *, name: str, workspace: str) -> str:
self.create_agent_calls.append((name, workspace))
return "agent-test-123"
@pytest.mark.asyncio
async def test_run_agent_uses_staged_run_workspace(tmp_path: Path):
task = next(task for task in load_all_tasks() if task.id == "t1-bugfix-discount")
harness = BenchmarkHarness(gateway_config=GatewayConfig(), model="test-model", randomize_order=False)
workspace = tmp_path / "run-workspace"
workspace.mkdir(parents=True, exist_ok=True)
client = FakeGatewayClient()
agent_id = await harness._create_run_agent(
client, # type: ignore[arg-type]
task=task,
workspace=workspace,
run_index=2,
)
assert agent_id == "agent-test-123"
assert client.create_agent_calls == [(client.create_agent_calls[0][0], str(workspace))]
assert task.id in client.create_agent_calls[0][0]
@pytest.mark.asyncio
async def test_prepare_run_hook_executes_before_each_run(monkeypatch):
task = next(task for task in load_all_tasks() if task.id == "t1-bugfix-discount")
calls: list[tuple[str, int]] = []
async def prepare_run(current_task, run_index: int) -> None:
calls.append((current_task.id, run_index))
async def fake_run_single(self, current_task, run_index: int):
from clawbench.schemas import TaskRunResult
return TaskRunResult(
task_id=current_task.id,
tier=current_task.tier.value,
family=current_task.family.value,
run_index=run_index,
run_score=1.0,
)
monkeypatch.setattr("clawbench.harness.load_all_tasks", lambda **_: [task])
monkeypatch.setattr(BenchmarkHarness, "_run_single", fake_run_single)
harness = BenchmarkHarness(
gateway_config=GatewayConfig(),
model="test-model",
task_ids=[task.id],
runs_per_task=2,
randomize_order=False,
prepare_run=prepare_run,
)
await harness.run()
assert calls == [(task.id, 0), (task.id, 1)]
def test_aggregate_reports_advisory_judge_metrics():
task = next(task for task in load_all_tasks() if task.id == "t5-hallucination-resistant-evidence")
harness = BenchmarkHarness(
gateway_config=GatewayConfig(),
model="test-model",
judge_model="judge-model",
task_ids=[task.id],
randomize_order=False,
)
runs = [
TaskRunResult(
task_id=task.id,
tier=task.tier.value,
family=task.family.value,
run_index=0,
run_score=0.9,
completion_result=CompletionResult(total_assertions=1, passed_assertions=1, score=1.0),
judge_result=JudgeResult(enabled=True, model="judge-model", score=0.9, confidence=0.7, passed=True),
),
TaskRunResult(
task_id=task.id,
tier=task.tier.value,
family=task.family.value,
run_index=1,
run_score=0.6,
completion_result=CompletionResult(total_assertions=1, passed_assertions=1, score=1.0),
judge_result=JudgeResult(enabled=True, model="judge-model", score=0.5, confidence=0.9, passed=False),
),
]
result = harness._aggregate([task], {task.id: runs})
task_result = result.task_results[0]
assert result.judge_model == "judge-model"
assert result.overall_judge_score == pytest.approx(0.7)
assert result.overall_judge_confidence == pytest.approx(0.8)
assert result.overall_judge_pass_rate == pytest.approx(0.5)
assert result.judge_task_coverage == 1.0
assert task_result.mean_judge_score == pytest.approx(0.7)
assert task_result.mean_judge_confidence == pytest.approx(0.8)
assert task_result.judge_pass_rate == pytest.approx(0.5)
assert task_result.judged_runs == 2
def test_compose_result_from_task_stats_supports_parallel_environment_metadata():
task = next(task for task in load_all_tasks() if task.id == "t1-bugfix-discount")
harness = BenchmarkHarness(
gateway_config=GatewayConfig(),
model="test-model",
task_ids=[task.id],
randomize_order=False,
print_report=False,
quiet=True,
)
runs = [
TaskRunResult(
task_id=task.id,
tier=task.tier.value,
family=task.family.value,
run_index=0,
run_score=0.9,
completion_result=CompletionResult(total_assertions=1, passed_assertions=1, score=1.0),
),
TaskRunResult(
task_id=task.id,
tier=task.tier.value,
family=task.family.value,
run_index=1,
run_score=0.7,
completion_result=CompletionResult(total_assertions=1, passed_assertions=1, score=1.0),
),
]
base_result = harness._aggregate([task], {task.id: runs})
merged_result = harness.compose_result_from_task_stats(
base_result.task_results,
tasks=[task],
environment_extra={
"parallel_lanes": 2,
"requested_parallel_lanes": 3,
"browser_tasks_serialized": False,
},
print_report=False,
)
assert merged_result.overall_score == pytest.approx(base_result.overall_score)
assert merged_result.overall_completion == pytest.approx(base_result.overall_completion)
assert merged_result.environment["parallel_lanes"] == 2
assert merged_result.environment["requested_parallel_lanes"] == 3
assert merged_result.environment["browser_tasks_serialized"] is False
@pytest.mark.asyncio
async def test_run_records_adapter_surface(monkeypatch):
task = next(task for task in load_all_tasks() if task.id == "t1-bugfix-discount")
async def fake_run_single(self, current_task, run_index: int):
return TaskRunResult(
task_id=current_task.id,
tier=current_task.tier.value,
family=current_task.family.value,
run_index=run_index,
run_score=1.0,
completion_result=CompletionResult(total_assertions=1, passed_assertions=1, score=1.0),
)
monkeypatch.setattr("clawbench.harness.load_all_tasks", lambda **_: [task])
monkeypatch.setattr(BenchmarkHarness, "_run_single", fake_run_single)
harness = BenchmarkHarness(
gateway_config=GatewayConfig(),
model="test-model",
adapter="openclaw",
runs_per_task=1,
randomize_order=False,
print_report=False,
quiet=True,
)
result = await harness.run()
assert result.environment["adapter"] == "openclaw"
assert "hermes" in result.environment["known_adapters"]
@pytest.mark.asyncio
async def test_run_rejects_registered_but_unwired_adapter(monkeypatch):
task = next(task for task in load_all_tasks() if task.id == "t1-bugfix-discount")
monkeypatch.setattr("clawbench.harness.load_all_tasks", lambda **_: [task])
harness = BenchmarkHarness(
gateway_config=GatewayConfig(),
model="test-model",
adapter="hermes",
runs_per_task=1,
randomize_order=False,
print_report=False,
quiet=True,
)
with pytest.raises(ValueError, match="not yet wired"):
await harness.run()