clawbench/tests/test_environment.py

122 lines
3.6 KiB
Python

from pathlib import Path
import pytest
from clawbench.environment import run_execution_check, verify_completion
from clawbench.schemas import CompletionSpec, ExecutionCheck, FileState, MemoryState, ToolCall, Transcript, TranscriptMessage
class MemoryFallbackClient:
    """Stub client whose memory RPC is unavailable but whose agent files are readable.

    ``_rpc`` fails for ``memory.search`` with a ``RuntimeError`` and treats any
    other RPC as a test failure; ``get_agent_file`` serves canned content for
    ``MEMORY.md`` and empty content for every other file name.
    """

    async def _rpc(self, method: str, params=None):  # noqa: ANN001
        """Raise for every call: ``RuntimeError`` for ``memory.search``, else ``AssertionError``."""
        # Any RPC other than the memory search is unexpected in these tests.
        if method != "memory.search":
            raise AssertionError(f"Unexpected RPC: {method} {params}")
        raise RuntimeError("unknown method: memory.search")

    async def get_agent_file(self, agent_id: str, name: str):  # noqa: ARG002
        """Return a ``{"file": {"content": ...}}`` payload for the requested file name."""
        # Only MEMORY.md carries content; every other file is reported as empty.
        content = "beta rollout regions: us, eu; retry budget: 3\n" if name == "MEMORY.md" else ""
        return {"file": {"content": content}}
@pytest.mark.asyncio
async def test_memory_completion_falls_back_to_agent_memory_files(tmp_path: Path):
    """Expect a full score when the memory expectation is met via agent memory files.

    ``MemoryFallbackClient`` raises on the ``memory.search`` RPC and serves the
    expected text through ``get_agent_file`` for ``MEMORY.md``.
    """
    expected_memory = MemoryState(
        key_pattern="beta rollout regions",
        value_contains=["us", "eu", "3"],
    )
    spec = CompletionSpec(memory=[expected_memory])
    fallback_client = MemoryFallbackClient()

    outcome = await verify_completion(
        spec,
        workspace=tmp_path,
        client=fallback_client,  # type: ignore[arg-type]
        session_key="session-test",
        agent_id="agent-test",
        runtime_values={},
    )

    assert outcome.score == 1.0
@pytest.mark.asyncio
async def test_file_completion_rejects_paths_outside_workspace(tmp_path: Path):
    """A file expectation whose path climbs out of the workspace must fail.

    The expected path ``../outside.txt`` points at a real file one level above
    the workspace, so the check can only fail because the path is rejected,
    not because the file is missing.
    """
    # Nest the workspace inside tmp_path so the "outside" file still lives in
    # this test's private directory; writing to tmp_path.parent would leak a
    # file into the directory shared by every test in the session.
    workspace = tmp_path / "workspace"
    workspace.mkdir()
    outside = tmp_path / "outside.txt"
    outside.write_text("secret", encoding="utf-8")

    completion = CompletionSpec(files=[FileState(path="../outside.txt")])
    result = await verify_completion(
        completion,
        workspace=workspace,
        client=MemoryFallbackClient(),  # type: ignore[arg-type]
        session_key="session-test",
        runtime_values={},
    )

    assert result.score == 0.0
    # Guard the index below so a regression reports a clear failure, not an IndexError.
    assert result.failed_assertions, "expected a failed assertion for the escaping path"
    assert "escapes workspace" in result.failed_assertions[0]
@pytest.mark.asyncio
async def test_execution_check_rejects_expected_file_outside_workspace(tmp_path: Path):
    """An ``expected_stdout_file`` that climbs out of the workspace must fail the check."""
    unsafe_check = ExecutionCheck(
        name="unsafe-expected",
        command="printf secret",
        expected_stdout_file="../outside.txt",
    )

    outcome = await run_execution_check(
        unsafe_check,
        workspace=tmp_path,
        runtime_values={},
    )

    # The check must be reported as failed, with a reason naming the escape.
    assert outcome.passed is False
    assert "escapes workspace" in outcome.reason
@pytest.mark.asyncio
async def test_memory_completion_falls_back_to_transcript_when_memory_rpc_is_unavailable(tmp_path: Path):
    """Expect a full score when a transcript records a write of the expected memory text.

    The transcript holds one successful assistant ``write`` tool call targeting
    ``memory/notes.md`` whose content matches the memory expectation.

    NOTE(review): ``MemoryFallbackClient.get_agent_file`` also serves matching
    ``MEMORY.md`` content, so this test may pass without the transcript being
    consulted at all -- confirm against ``verify_completion``.
    """
    expected_memory = MemoryState(
        key_pattern="beta rollout regions",
        value_contains=["us", "eu", "3"],
    )
    spec = CompletionSpec(memory=[expected_memory])

    # A single successful write into the memory directory, as the agent would record it.
    memory_write = ToolCall(
        name="write",
        family="edit",
        input={
            "path": "memory/notes.md",
            "content": "beta rollout regions: us, eu; retry budget: 3\n",
        },
        success=True,
    )
    assistant_turn = TranscriptMessage(role="assistant", tool_calls=[memory_write])
    recorded = Transcript(messages=[assistant_turn])

    outcome = await verify_completion(
        spec,
        workspace=tmp_path,
        client=MemoryFallbackClient(),  # type: ignore[arg-type]
        session_key="session-test",
        agent_id="agent-test",
        runtime_values={},
        transcript=recorded,
    )

    assert outcome.score == 1.0