Compare commits
1 Commit
main
...
test/envir
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
314de6a8e3 |
@@ -3,7 +3,18 @@ from pathlib import Path
|
||||
import pytest
|
||||
|
||||
from clawbench.environment import run_execution_check, verify_completion
|
||||
from clawbench.schemas import CompletionSpec, ExecutionCheck, FileState, MemoryState, ToolCall, Transcript, TranscriptMessage
|
||||
from clawbench.schemas import (
|
||||
CompletionSpec,
|
||||
CronState,
|
||||
ExecutionCheck,
|
||||
FileState,
|
||||
GatewayAssertion,
|
||||
MemoryState,
|
||||
SessionState,
|
||||
ToolCall,
|
||||
Transcript,
|
||||
TranscriptMessage,
|
||||
)
|
||||
|
||||
|
||||
class MemoryFallbackClient:
|
||||
@@ -22,6 +33,30 @@ class MemoryFallbackClient:
|
||||
return {"file": {"content": ""}}
|
||||
|
||||
|
||||
class CompletionClient:
    """Stub gateway client that answers a fixed set of RPC methods.

    Canned payloads are returned for ``sessions.resolve``, ``cron.list`` and
    ``tools.inventory``. Any other method raises ``AssertionError`` so that an
    unexpected call fails the test immediately.
    """

    async def _rpc(self, method: str, params=None):  # noqa: ANN001
        """Return the canned response for *method*.

        Args:
            method: Name of the RPC method being invoked.
            params: Optional call parameters; only echoed in the error
                message when the method is not recognised.

        Returns:
            A dict with a single ``"payload"`` key holding the canned data.

        Raises:
            AssertionError: If *method* is not one of the stubbed methods.
        """
        if method == "sessions.resolve":
            return {"payload": {"model": "anthropic/claude-sonnet-4-6"}}
        if method == "cron.list":
            return {"payload": {"jobs": [{"description": "nightly cleanup"}]}}
        if method == "tools.inventory":
            return {
                "payload": {
                    "groups": [
                        {
                            "tools": [
                                {
                                    "id": "browser",
                                    "status": "available",
                                }
                            ]
                        }
                    ]
                }
            }
        raise AssertionError(f"Unexpected RPC: {method} {params}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_memory_completion_falls_back_to_agent_memory_files(tmp_path: Path):
|
||||
completion = CompletionSpec(
|
||||
@@ -45,6 +80,50 @@ async def test_memory_completion_falls_back_to_agent_memory_files(tmp_path: Path
|
||||
assert result.score == 1.0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_verify_completion_scores_mixed_successful_assertions(tmp_path: Path):
    """A spec mixing file, session, cron and gateway checks scores 1.0.

    The workspace holds a ``report.txt`` that satisfies every file
    constraint, and ``CompletionClient`` supplies canned gateway responses
    for the session, cron and tool-inventory lookups.
    """
    report = tmp_path / "report.txt"
    report.write_text("status: green\nowner: benchmark\n", encoding="utf-8")
    completion = CompletionSpec(
        files=[
            FileState(
                path="report.txt",
                content_contains=["green"],
                content_not_contains=["red"],
                content_matches=r"owner:\s+benchmark",
                min_size_bytes=10,
            )
        ],
        session=SessionState(model_should_be="claude-sonnet"),
        cron=[CronState(description_contains="cleanup")],
        gateway_assertions=[
            GatewayAssertion(
                method="tools.inventory",
                assert_path="$.groups[0].tools[0].id",
                assert_equals="browser",
            ),
            GatewayAssertion(
                method="tools.inventory",
                assert_path="$.groups[0].tools[0].status",
                assert_contains="avail",
            ),
        ],
    )

    result = await verify_completion(
        completion,
        workspace=tmp_path,
        client=CompletionClient(),  # type: ignore[arg-type]
        session_key="session-test",
        runtime_values={},
    )

    # Presumably one assertion each for the file, session and cron specs plus
    # the two gateway assertions -- confirm against verify_completion.
    assert result.total_assertions == 5
    assert result.passed_assertions == 5
    assert result.failed_assertions == []
    assert result.score == 1.0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_file_completion_rejects_paths_outside_workspace(tmp_path: Path):
|
||||
outside = tmp_path.parent / "outside.txt"
|
||||
@@ -63,6 +142,45 @@ async def test_file_completion_rejects_paths_outside_workspace(tmp_path: Path):
|
||||
assert "escapes workspace" in result.failed_assertions[0]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_execution_check_supports_cwd_env_and_expected_json_file(tmp_path: Path):
    """An execution check honours ``cwd``, ``env`` and ``expected_json_file``.

    The command prints a JSON object built from the ``CHECK_STATUS``
    environment variable; the check is expected to pass against the
    ``expected.json`` file written into the workspace.
    """
    expected = tmp_path / "expected.json"
    expected.write_text('{"status": "ok"}', encoding="utf-8")
    workdir = tmp_path / "subdir"
    workdir.mkdir()

    result = await run_execution_check(
        ExecutionCheck(
            name="json-check",
            command='python -c "import json, os; print(json.dumps({\'status\': os.environ[\'CHECK_STATUS\']}))"',
            cwd="subdir",
            env={"CHECK_STATUS": "ok"},
            expected_json_file="expected.json",
        ),
        workspace=tmp_path,
        runtime_values={},
    )

    assert result.passed is True
    assert result.reason == "OK"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_execution_check_rejects_cwd_outside_workspace(tmp_path: Path):
    """An execution check whose ``cwd`` is ``../outside`` must fail.

    The failure reason is expected to mention that the path escapes the
    workspace.
    """
    result = await run_execution_check(
        ExecutionCheck(
            name="unsafe-cwd",
            command="true",
            cwd="../outside",
        ),
        workspace=tmp_path,
        runtime_values={},
    )

    assert result.passed is False
    assert "escapes workspace" in result.reason
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_execution_check_rejects_expected_file_outside_workspace(tmp_path: Path):
|
||||
result = await run_execution_check(
|
||||
|
||||
Loading…
Reference in New Issue
Block a user