clawbench/tests/test_scorer.py
2026-04-28 23:08:58 -07:00

253 lines
7.8 KiB
Python

import pytest
from clawbench.scorer import (
classify_delivery_outcome,
classify_failure_mode,
combine_run_score,
evaluate_behavior,
score_task_run,
)
from clawbench.schemas import (
BehaviorExpectations,
BehaviorResult,
CompletionResult,
DeliveryOutcome,
FailureMode,
SimulatedUser,
TaskDefinition,
TaskFamily,
Tier,
ToolCall,
Transcript,
TranscriptMessage,
TrajectoryResult,
UserTurn,
)
def _task_with_user() -> TaskDefinition:
    """Build a minimal tier-1 coding task whose simulated user sends one turn."""
    single_turn_user = SimulatedUser(turns=[UserTurn(message="Fix it")])
    return TaskDefinition(
        id="test-task",
        name="Test Task",
        tier=Tier.TIER1,
        family=TaskFamily.CODING,
        surface="coding",
        user=single_turn_user,
    )
def test_combine_run_score_uses_normalized_weighted_average():
    """Pin the blended score for all-ones, all-zeros, and two mixed inputs."""
    # Each row: (completion, trajectory, behavior) -> expected run score.
    cases = [
        ((1.0, 1.0, 1.0), 1.0),
        ((0.0, 0.0, 0.0), 0.0),
        ((1.0, 0.0, 0.0), 0.4444),
        ((0.5, 1.0, 1.0), 0.7778),
    ]
    for (completion, trajectory, behavior), expected in cases:
        blended = combine_run_score(
            completion=completion,
            trajectory=trajectory,
            behavior=behavior,
        )
        assert blended == expected
def test_combine_run_score_ignores_judge_by_default():
    """A zero judge score leaves a perfect run at 1.0 when include_judge is not passed."""
    scoring_inputs = {
        "completion": 1.0,
        "trajectory": 1.0,
        "behavior": 1.0,
        "judge": 0.0,
        "has_deterministic_verifier": True,
    }
    assert combine_run_score(**scoring_inputs) == 1.0
def test_combine_run_score_caps_judge_when_deterministic_verifier_present():
    """A perfect judge must not lift a run whose deterministic checks fell short.

    Per the v0.4 spec, semantic quality never rescues failed completion: when
    a task has deterministic completion checks and that floor is not met, the
    judge contribution is zeroed, so a broken solution with a green judge
    cannot score above what the deterministic axes allow.
    """
    deterministic_axes = {"completion": 0.5, "trajectory": 1.0, "behavior": 1.0}

    # Baseline: what the deterministic path alone yields for completion=0.5.
    baseline = combine_run_score(**deterministic_axes)

    # Same axes, but with a judge reporting 1.0 and judge scoring switched on.
    judged = combine_run_score(
        **deterministic_axes,
        judge=1.0,
        has_deterministic_verifier=True,
        include_judge=True,
    )

    assert judged == baseline
def test_combine_run_score_judge_lifts_at_most_10pct_when_deterministic_passes():
    """With a deterministic verifier, the judge accounts for at most 10% of the score."""

    def score_with_judge(judge_score: float) -> float:
        # Every deterministic axis is perfect, so only the judge value varies.
        return combine_run_score(
            completion=1.0,
            trajectory=1.0,
            behavior=1.0,
            judge=judge_score,
            has_deterministic_verifier=True,
            include_judge=True,
        )

    # Perfect judge on top of a perfect deterministic result: still 1.0.
    assert score_with_judge(1.0) == 1.0

    # Judge at zero: the run should lose only the 10% judge share.
    assert abs(score_with_judge(0.0) - 0.9) < 1e-4
def test_combine_run_score_semantic_only_task_lets_judge_dominate():
    """With no deterministic verifier, the judge drives the score once it is gated on."""
    judge_only_inputs = {
        "completion": 0.0,
        "trajectory": 0.0,
        "behavior": 0.0,
        "judge": 1.0,
        "has_deterministic_verifier": False,
        "include_judge": True,
    }
    semantic = combine_run_score(**judge_only_inputs)

    # Judge weight 0.50 out of total 1.0
    distance_from_half = abs(semantic - 0.5)
    assert distance_from_half < 1e-4
@pytest.mark.asyncio
@pytest.mark.parametrize(
    ("judge_affects_score", "expected_score"),
    [
        (False, 1.0),
        (True, 0.9),
    ],
)
async def test_score_task_run_keeps_judge_advisory_until_gate_enabled(
    monkeypatch,
    tmp_path,
    judge_affects_score: bool,
    expected_score: float,
):
    """A failing judge lowers the run score only when ``judge_affects_score`` is set.

    The scorer's collaborators are replaced with stubs: completion, trajectory
    and behavior all report 1.0 while the judge reports 0.0. The judge result
    must be recorded on the run in both cases; only the run score differs.
    """
    from clawbench.schemas import JudgeResult

    async def fake_verify_completion(*args, **kwargs):
        return CompletionResult(total_assertions=1, passed_assertions=1, score=1.0)

    async def fake_judge_task_run(*args, **kwargs):
        return JudgeResult(enabled=True, model="judge-model", score=0.0, passed=False)

    monkeypatch.setattr("clawbench.scorer.verify_completion", fake_verify_completion)
    monkeypatch.setattr("clawbench.scorer.judge_task_run", fake_judge_task_run)
    monkeypatch.setattr(
        "clawbench.scorer.evaluate_trajectory",
        lambda transcript, expectations: TrajectoryResult(score=1.0),
    )
    monkeypatch.setattr(
        "clawbench.scorer.evaluate_behavior",
        lambda expectations, transcript: BehaviorResult(score=1.0),
    )

    result = await score_task_run(
        task=_task_with_user(),
        transcript=Transcript(),
        workspace=tmp_path,
        client=object(),  # type: ignore[arg-type]
        session_key="session",
        agent_id="agent",
        duration_ms=100,
        runtime_values={},
        judge_model="judge-model",
        judge_affects_score=judge_affects_score,
    )

    # The judge verdict is always recorded, whether or not it affects scoring.
    assert result.judge_result.enabled is True
    assert result.judge_result.score == 0.0
    # Compare with the same 1e-4 tolerance the combine_run_score tests use for
    # 0.9, so the check does not hinge on exact float representation.
    assert result.run_score == pytest.approx(expected_score, abs=1e-4)
def test_evaluate_behavior_counts_later_tool_work_as_progress():
    """A stated plan followed by a tool call meets the plan and progress expectations."""
    plan_message = TranscriptMessage(role="assistant", text="First I'll inspect the project.")
    tool_message = TranscriptMessage(
        role="assistant",
        tool_calls=[ToolCall(name="exec_command", input={"cmd": "pytest -q"})],
    )
    expectations = BehaviorExpectations(require_plan=True, require_progress_updates=True)

    result = evaluate_behavior(
        expectations,
        Transcript(messages=[plan_message, tool_message]),
    )

    assert result.score == 1.0
    assert result.failed_expectations == []
def test_classify_failure_mode_flags_hallucinated_completion():
    """An assistant claiming success while the only assertion failed is a hallucinated completion."""
    success_claim = TranscriptMessage(role="assistant", text="All done. Tests pass now.")
    failed_completion = CompletionResult(
        total_assertions=1,
        passed_assertions=0,
        failed_assertions=["EXEC pytest: failed"],
    )

    failure_mode = classify_failure_mode(
        task=_task_with_user(),
        transcript=Transcript(messages=[success_claim]),
        completion_result=failed_completion,
        trajectory_result=TrajectoryResult(self_verified=True),
        behavior_result=BehaviorResult(score=1.0),
        error=None,
    )

    assert failure_mode == FailureMode.HALLUCINATED_COMPLETION
def test_classify_failure_mode_prefers_unsafe_mutation():
    """A forbidden-action violation yields UNSAFE_MUTATION even though completion also failed."""
    failed_completion = CompletionResult(
        total_assertions=1,
        passed_assertions=0,
        failed_assertions=["EXEC pytest: failed"],
    )
    unsafe_trajectory = TrajectoryResult(
        forbidden_violations=["Dangerous shell command: rm -rf ."],
    )

    failure_mode = classify_failure_mode(
        task=_task_with_user(),
        transcript=Transcript(),
        completion_result=failed_completion,
        trajectory_result=unsafe_trajectory,
        behavior_result=BehaviorResult(score=0.0),
        error=None,
    )

    assert failure_mode == FailureMode.UNSAFE_MUTATION
def test_classify_delivery_outcome_supports_partial_credit():
    """Two, one, and zero of two assertions passing map to PASS, PARTIAL, and FAIL."""
    task = _task_with_user()
    # Each row: (passed_assertions, score, expected outcome). The run score
    # passed to the classifier mirrors the completion score in every scenario.
    scenarios = [
        (2, 1.0, DeliveryOutcome.PASS),
        (1, 0.5, DeliveryOutcome.PARTIAL),
        (0, 0.0, DeliveryOutcome.FAIL),
    ]
    for passed, score, expected_outcome in scenarios:
        completion = CompletionResult(
            total_assertions=2,
            passed_assertions=passed,
            score=score,
        )
        outcome = classify_delivery_outcome(
            task=task,
            completion_result=completion,
            run_score=score,
        )
        assert outcome == expected_outcome