clawbench/tests/test_dynamics_cli.py

76 lines
2.3 KiB
Python

from pathlib import Path
from click.testing import CliRunner
from clawbench.cli import cli
from clawbench.dynamics_archive import safe_model_name
from clawbench.schemas import TaskRunResult, TokenUsage, ToolCall, Transcript, TranscriptMessage
def _msg(role: str, text: str = "", family: str | None = None, ts: int = 0) -> TranscriptMessage:
    """Build a transcript message for test fixtures.

    Args:
        role: Role recorded on the message (e.g. "user" or "assistant").
        text: Message text; empty by default.
        family: Tool family for a single attached tool call. When ``None``
            the message carries no tool calls.
        ts: Value stored as the message's ``timestamp_ms``.

    Returns:
        A ``TranscriptMessage`` with a fixed token usage of 10 input,
        5 output and 15 total tokens.
    """
    if family is None:
        calls = []
    else:
        # One successful call named after its family; only the "edit"
        # family is flagged as mutating.
        calls = [
            ToolCall(
                name=f"tool_{family}",
                family=family,
                success=True,
                error="",
                mutating=(family == "edit"),
            )
        ]

    usage = TokenUsage(input_tokens=10, output_tokens=5, total_tokens=15)
    return TranscriptMessage(
        role=role,
        text=text,
        tool_calls=calls,
        timestamp_ms=ts,
        usage=usage,
    )
def _run(task_id: str, run_index: int = 0) -> TaskRunResult:
    """Build a canned task run result: one user prompt plus three tool steps.

    Args:
        task_id: Task identifier, used in the user prompt and on the result.
        run_index: Index stored on the result; defaults to 0.

    Returns:
        A ``TaskRunResult`` with a run score of 0.8, a duration of 3000 ms
        and token usage taken from the transcript's ``total_usage``.
    """
    # (text, tool family, timestamp in ms) for each assistant step, in order.
    assistant_steps = (
        ("inspect", "read", 1000),
        ("edit", "edit", 2000),
        ("verify", "execute", 3000),
    )

    messages = [_msg("user", f"Solve {task_id}")]
    for step_text, step_family, step_ts in assistant_steps:
        messages.append(_msg("assistant", step_text, family=step_family, ts=step_ts))

    conversation = Transcript(messages=messages)
    return TaskRunResult(
        task_id=task_id,
        run_index=run_index,
        transcript=conversation,
        run_score=0.8,
        duration_ms=3000,
        token_usage=conversation.total_usage,
    )
def test_dynamics_report_cli_supports_no_plots(tmp_path: Path):
    """Check that ``dynamics-report --no-plots`` writes JSON but no PNG files.

    A single serialized run is placed at
    ``<archive>/<safe model name>/<task id>/run0.json`` and the CLI is
    invoked against that archive with ``--no-plots``.
    """
    model_name = "ollama/gpt-oss:20b"
    task_id = "t1-demo-task"

    # Seed the archive with exactly one cached run.
    task_dir = tmp_path / safe_model_name(model_name) / task_id
    task_dir.mkdir(parents=True)
    cached_run = _run(task_id)
    run_file = task_dir / "run0.json"
    run_file.write_text(cached_run.model_dump_json(indent=2), encoding="utf-8")

    cli_runner = CliRunner()
    report_dir = tmp_path / "out"
    cli_args = [
        "dynamics-report",
        "--archive-dir",
        str(tmp_path),
        "--model",
        model_name,
        "--output-dir",
        str(report_dir),
        "--no-plots",
    ]
    result = cli_runner.invoke(cli, cli_args)

    # Include the CLI output in the failure message to ease debugging.
    assert result.exit_code == 0, result.output
    assert "Loaded 1 cached runs across 1 tasks" in result.output
    assert "Saved 0 plots" in result.output

    # The JSON report must exist, and no plot images may be written.
    assert (report_dir / "dynamics.json").exists()
    assert list(report_dir.glob("*.png")) == []