# clawbench/tests/test_judge.py
#
# 192 lines
# 6.0 KiB
# Python

from pathlib import Path
import pytest
from clawbench.judge import build_judge_prompt, parse_judge_response
from clawbench.schemas import (
CompletionResult,
JudgeExpectations,
SimulatedUser,
TaskDefinition,
TaskFamily,
Tier,
ToolCall,
Transcript,
TranscriptMessage,
UserTurn,
)
def _make_task(judge: JudgeExpectations) -> TaskDefinition:
    """Return a fixed tier-5 adversarial coding task that carries ``judge``.

    Everything except the judge expectations is constant, so each test only
    has to vary the ``JudgeExpectations`` it cares about.
    """
    opening_turn = UserTurn(message="Do the thing")
    simulated_user = SimulatedUser(turns=[opening_turn])
    return TaskDefinition(
        id="judge-task",
        name="Judge Task",
        tier=Tier.TIER5,
        family=TaskFamily.ADVERSARIAL,
        surface="coding",
        user=simulated_user,
        judge=judge,
    )
def test_build_judge_prompt_includes_artifacts_completion_feedback_and_transcript(tmp_path: Path):
    """The judge prompt shows the artifact, completion feedback, and tool usage.

    An artifact file is written into the workspace, a two-message transcript
    with one ``read`` tool call is supplied, and the completion result records
    one failed assertion out of two. Each of those must surface in the prompt.
    """
    artifact = tmp_path / "answer.txt"
    artifact.write_text("Support window: 18 months\n", encoding="utf-8")

    expectations = JudgeExpectations(
        rubric="Check that the answer is grounded and auditable.",
        artifact_paths=["answer.txt"],
        include_transcript=True,
        include_completion_feedback=True,
    )
    user_message = TranscriptMessage(role="user", text="Use the local docs only.")
    assistant_message = TranscriptMessage(
        role="assistant",
        text="First I'll inspect the docs, then I'll write the answer.",
        tool_calls=[ToolCall(name="read_file", family="read", success=True)],
    )
    partial_completion = CompletionResult(
        total_assertions=2,
        passed_assertions=1,
        failed_assertions=["EXEC python3 verify_answer.py: wrong quote"],
        score=0.5,
    )

    prompt = build_judge_prompt(
        task=_make_task(expectations),
        judge=expectations,
        transcript=Transcript(messages=[user_message, assistant_message]),
        workspace=tmp_path,
        completion_result=partial_completion,
    )

    # No threshold is set on the expectations above, so the 0.70 shown in the
    # prompt presumably comes from the schema default -- verify in schemas.
    expected_fragments = (
        "Judge threshold: 0.70",
        "=== answer.txt ===",
        "Support window: 18 months",
        "completion assertions: 1/2",
        "wrong quote",
        "tool families: read x1",
    )
    for fragment in expected_fragments:
        assert fragment in prompt
def test_build_judge_prompt_rejects_artifact_paths_outside_workspace(tmp_path: Path):
    """Artifact paths that escape the workspace are flagged and never read.

    The workspace is a subdirectory of ``tmp_path`` and the decoy file sits
    next to it, so ``../outside-judge.txt`` still points outside the workspace
    while everything the test creates stays inside its own ``tmp_path``.
    Writing the decoy to ``tmp_path.parent`` instead would drop it into the
    temp directory shared by the whole pytest session, where it outlives the
    test and can collide with other tests.
    """
    workspace = tmp_path / "workspace"
    workspace.mkdir()
    outside = tmp_path / "outside-judge.txt"
    outside.write_text("do not leak", encoding="utf-8")

    judge = JudgeExpectations(
        rubric="Check that the answer is grounded and auditable.",
        artifact_paths=["../outside-judge.txt"],
    )
    task = _make_task(judge)

    prompt = build_judge_prompt(
        task=task,
        judge=judge,
        transcript=Transcript(),
        workspace=workspace,
        completion_result=CompletionResult(score=1.0),
    )

    # The traversal must be reported, and the decoy's contents must not appear.
    assert "invalid path" in prompt
    assert "do not leak" not in prompt
def test_parse_judge_response_accepts_wrapped_json_and_computes_pass():
    """A JSON object preceded by free text is parsed and judged against the threshold."""
    # Adjacent literals concatenate into the same single string: a prose
    # lead-in line followed by the JSON payload.
    wrapped_reply = (
        'Score summary:\n'
        '{"score": 0.82, "confidence": 0.66, "reason": "Strong evidence.", "rubric_hits": ["grounded"], "rubric_misses": []}'
    )

    verdict = parse_judge_response(wrapped_reply, passing_threshold=0.8)

    assert verdict.enabled is True
    assert verdict.score == 0.82
    assert verdict.confidence == 0.66
    # 0.82 is above the 0.8 threshold passed in.
    assert verdict.passed is True
    assert verdict.rubric_hits == ["grounded"]
    assert verdict.error is None
def test_parse_judge_response_reports_invalid_json():
    """A reply with no parseable content yields an error result that does not pass."""
    garbage_reply = "not json at all"

    verdict = parse_judge_response(garbage_reply, passing_threshold=0.7)

    assert verdict.enabled is True
    assert verdict.error == "Judge response did not contain valid JSON."
    assert verdict.passed is False
def test_parse_judge_response_falls_back_to_labeled_text():
    """A ``label: value`` reply with bulleted lists is parsed when no JSON is present."""
    # The literal is kept flush-left so its contents are exactly the lines
    # below, with no leading whitespace added by the function's indentation.
    labeled_reply = """
score: 0.84
confidence: 0.72
reason: Grounded in the artifact and easy to audit.
rubric_hits:
- cites the right file
- keeps the answer precise
rubric_misses:
- could quote a shorter line
"""

    verdict = parse_judge_response(labeled_reply, passing_threshold=0.8)

    assert verdict.score == 0.84
    assert verdict.confidence == 0.72
    # 0.84 is above the 0.8 threshold passed in.
    assert verdict.passed is True
    assert verdict.rubric_hits == ["cites the right file", "keeps the answer precise"]
@pytest.mark.asyncio
async def test_judge_task_run_uses_gateway_session_and_parses_response(tmp_path: Path):
    """``judge_task_run`` drives a gateway session and returns the parsed verdict.

    A stub gateway checks the arguments it receives (model, label prefix,
    session key, artifact text in the prompt), replies with a canned JSON
    verdict, and records which sessions are deleted.
    """
    from clawbench.judge import judge_task_run

    session_id = "judge-session-1"
    canned_reply = '{"score": 0.91, "confidence": 0.75, "reason": "Grounded.", "rubric_hits": ["exact"], "rubric_misses": []}'

    class StubGateway:
        """Minimal stand-in exposing the four client methods defined here."""

        def __init__(self) -> None:
            self.deleted: list[str] = []

        async def create_session(self, *, model: str, label: str) -> str:
            assert model == "judge-model"
            assert label.startswith("clawbench-judge-")
            return session_id

        async def subscribe(self, session_key: str) -> None:
            assert session_key == session_id

        async def send_and_wait(self, session_key: str, message: str):
            assert session_key == session_id
            # The artifact body must have been embedded in the judge prompt.
            assert "Support window: 18 months" in message
            reply = TranscriptMessage(role="assistant", text=canned_reply)
            return Transcript(messages=[reply])

        async def delete_session(self, session_key: str) -> None:
            self.deleted.append(session_key)

    artifact = tmp_path / "answer.txt"
    artifact.write_text("Support window: 18 months\n", encoding="utf-8")
    expectations = JudgeExpectations(
        rubric="Check whether the answer is grounded.",
        artifact_paths=["answer.txt"],
    )
    task = _make_task(expectations)
    run_transcript = Transcript(
        messages=[TranscriptMessage(role="assistant", text="I checked the docs.")]
    )
    gateway = StubGateway()

    verdict = await judge_task_run(
        task=task,
        transcript=run_transcript,
        workspace=tmp_path,
        client=gateway,  # type: ignore[arg-type]
        judge_model="judge-model",
        completion_result=CompletionResult(total_assertions=1, passed_assertions=1, score=1.0),
    )

    assert verdict.enabled is True
    assert verdict.model == "judge-model"
    assert verdict.score == 0.91
    assert verdict.passed is True
    # The judge session must be deleted once the run finishes.
    assert gateway.deleted == [session_id]