Compare commits

...

1 Commits

Author SHA1 Message Date
Vincent Koc
3946e63c7d
fix(runtime): harden benchmark cache and task paths
Some checks failed
CI / Python ${{ matrix.python-version }} test suite (3.11) (push) Has been cancelled
CI / Python ${{ matrix.python-version }} test suite (3.12) (push) Has been cancelled
2026-04-28 22:37:07 -07:00
14 changed files with 252 additions and 30 deletions

View File

@ -9,10 +9,11 @@ Runs the repository test suite automatically on:
- manual dispatch from the Actions tab
It uses Python 3.11 and 3.12, installs the package with
`pip install -e .`, runs `python -m pytest -q`, then builds a wheel and
checks that runtime data such as `tasks-public/`, `profiles/`, and
`baselines/` are included. Runs under the `openclaw` organization use the
Blacksmith Ubuntu runner; forks fall back to GitHub-hosted `ubuntu-latest`.
`pip install -e .[dev]`, runs full Ruff lint plus `python -m pytest -q`,
then builds a wheel and checks that runtime data such as `tasks-public/`,
`tasks-domain/`, `profiles/`, and `baselines/` are included. Runs under the
`openclaw` organization use the Blacksmith Ubuntu runner; forks fall back to
GitHub-hosted `ubuntu-latest`.
## `ci-check-testbox.yml` — Blacksmith Testbox warmup

View File

@ -37,7 +37,7 @@ jobs:
python -m pip install -e .[dev]
- name: Run static lint
run: python -m ruff check clawbench app.py scripts tests --select F,E9
run: python -m ruff check clawbench app.py scripts tests
- name: Run test suite
run: python -m pytest -q
@ -54,6 +54,7 @@ jobs:
names = set(archive.namelist())
required = [
"tasks-public/MANIFEST.yaml",
"tasks-domain/MANIFEST.yaml",
"profiles/example_research_stack.yaml",
"baselines/BASELINE_SOURCES.md",
]

View File

@ -504,6 +504,8 @@ clawbench/
│ ├── tier1/ ... tier5/ # 19 task YAMLs with verification specs
│ └── assets/ # 19 asset packs (verifiers + fixtures)
├── tasks-domain/ # Planned domain coverage scaffold
├── tasks/ # PRIVATE 40-task dev pool (gitignored)
├── scripts/ # Reproducibility + analysis pipeline

View File

@ -11,6 +11,7 @@ from pathlib import Path
from typing import Any
from clawbench.client import GatewayClient
from clawbench.paths import resolve_workspace_path
from clawbench.render import render_template, render_value
from clawbench.schemas import (
CompletionResult,
@ -109,7 +110,20 @@ async def run_execution_check(
runtime_values: dict[str, Any],
) -> ExecutionCheckResult:
rendered_command = render_template(spec.command, runtime_values)
rendered_cwd = workspace / render_template(spec.cwd, runtime_values)
try:
rendered_cwd = resolve_workspace_path(
workspace,
render_template(spec.cwd, runtime_values),
field=f"execution check cwd for {spec.name}",
)
except ValueError as exc:
return ExecutionCheckResult(
name=spec.name,
command=rendered_command,
exit_code=-1,
passed=False,
reason=str(exc),
)
rendered_env = render_value(spec.env, runtime_values)
import os
import sys
@ -219,7 +233,14 @@ def _evaluate_execution_result(
return False, "stdout did not match expected text"
if spec.expected_stdout_file:
expected_path = workspace / render_template(spec.expected_stdout_file, runtime_values)
try:
expected_path = resolve_workspace_path(
workspace,
render_template(spec.expected_stdout_file, runtime_values),
field=f"expected_stdout_file for {spec.name}",
)
except ValueError as exc:
return False, str(exc)
if stdout.strip() != expected_path.read_text(encoding="utf-8").strip():
return False, f"stdout did not match {spec.expected_stdout_file}"
@ -232,7 +253,14 @@ def _evaluate_execution_result(
return False, "stdout JSON did not match expected JSON"
if spec.expected_json_file:
expected_path = workspace / render_template(spec.expected_json_file, runtime_values)
try:
expected_path = resolve_workspace_path(
workspace,
render_template(spec.expected_json_file, runtime_values),
field=f"expected_json_file for {spec.name}",
)
except ValueError as exc:
return False, str(exc)
try:
parsed = json.loads(stdout)
except json.JSONDecodeError as exc:
@ -245,7 +273,14 @@ def _evaluate_execution_result(
def _verify_file(spec: FileState, workspace: Path, runtime_values: dict[str, Any]) -> tuple[bool, str]:
path = workspace / render_template(spec.path, runtime_values)
try:
path = resolve_workspace_path(
workspace,
render_template(spec.path, runtime_values),
field=f"completion file {spec.path}",
)
except ValueError as exc:
return False, str(exc)
exists = path.exists() and path.is_file()
if not spec.exists:

View File

@ -5,6 +5,7 @@ from __future__ import annotations
import asyncio
import datetime
import hashlib
import json
import logging
import os
import shutil
@ -42,6 +43,7 @@ console = Console()
KNOWN_ADAPTERS = ("openclaw", "hermes", "codex", "claude-code")
EXECUTABLE_ADAPTERS = {"openclaw"}
RUN_CACHE_SCHEMA_VERSION = 2
class _NullCtx:
@ -278,8 +280,7 @@ class BenchmarkHarness:
cache_dir_env = os.environ.get("CLAWBENCH_RUN_CACHE_DIR", "/data/run_cache")
cache_path: Path | None = None
if cache_dir_env:
safe_model = self.model.replace("/", "_").replace(":", "_")
cache_path = Path(cache_dir_env) / safe_model / task.id / f"run{run_index}.json"
cache_path = self._run_cache_path(Path(cache_dir_env), task, run_index)
if cache_path.exists():
try:
cached = TaskRunResult.model_validate_json(cache_path.read_text(encoding="utf-8"))
@ -536,6 +537,27 @@ class BenchmarkHarness:
target.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(item, target)
def _run_cache_path(self, cache_root: Path, task: TaskDefinition, run_index: int) -> Path:
    """Return the cache file location for one run of ``task``.

    Layout: ``<cache_root>/<model>/v<schema>-<scope>/<task id>/run<index>.json``.
    ``<scope>`` is the first 16 hex digits of a SHA-256 taken over the cache
    schema version, model, adapter, prompt variant, judge model, benchmark
    version and the task definition fingerprint, so changing any of those
    yields a different directory.
    """
    scope_inputs = {
        "schema": RUN_CACHE_SCHEMA_VERSION,
        "model": self.model,
        "adapter": self.adapter,
        "prompt_variant": self.prompt_variant,
        "judge_model": self.judge_model,
        "benchmark_version": __version__,
        "task_fingerprint": _task_definition_fingerprint(task),
    }
    # Canonical JSON (sorted keys, compact separators) keeps the digest
    # independent of dict ordering.
    canonical = json.dumps(scope_inputs, sort_keys=True, separators=(",", ":"), default=str)
    scope_digest = hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16]

    model_dir = _safe_cache_component(self.model)
    scope_dir = f"v{RUN_CACHE_SCHEMA_VERSION}-{scope_digest}"
    task_dir = _safe_cache_component(task.id)
    return cache_root / model_dir / scope_dir / task_dir / f"run{run_index}.json"
async def _assert_browser_support(self, client: GatewayClient, session_key: str) -> None:
inventory = await client.get_effective_tools(session_key)
tool_ids = {
@ -929,5 +951,17 @@ def _count_values(values) -> dict[str, int]:
return counts
def _safe_cache_component(value: str) -> str:
cleaned = "".join(char if char.isalnum() or char in "._-" else "_" for char in value.strip())
return cleaned.strip("._-") or "unknown"
def _task_definition_fingerprint(task: TaskDefinition) -> str:
payload = task.model_dump(mode="json")
return hashlib.sha256(
json.dumps(payload, sort_keys=True, separators=(",", ":"), default=str).encode("utf-8")
).hexdigest()
def _now_ms() -> int:
return int(time.monotonic() * 1000)

View File

@ -11,6 +11,7 @@ from pathlib import Path
from typing import Any
from clawbench.client import GatewayClient
from clawbench.paths import resolve_workspace_path
from clawbench.session_labels import unique_session_label
from clawbench.schemas import (
CompletionResult,
@ -51,7 +52,6 @@ async def judge_task_run(
)
await client.subscribe(session_key)
judge_transcript = await client.send_and_wait(session_key, prompt)
# Temporary debug: log first 800 chars of raw judge response when parsing fails
raw_text = judge_transcript.assistant_text
parsed = parse_judge_response(
raw_text,
@ -59,9 +59,10 @@ async def judge_task_run(
)
if parsed.error:
logger.warning(
"Judge parse failed for %s. Raw response (first 800 chars):\n%s",
"Judge parse failed for %s: %s (response length=%d)",
task.id,
raw_text[:800] if raw_text else "(empty)",
parsed.error,
len(raw_text or ""),
)
parsed.enabled = True
parsed.model = judge_model
@ -185,14 +186,22 @@ def _render_artifacts(*, artifact_paths: list[str], workspace: Path, max_chars:
remaining = max_chars
blocks: list[str] = []
for rel_path in artifact_paths:
target = workspace / rel_path
if not target.exists():
block = f"=== {rel_path} ===\n(missing)"
elif target.is_dir():
block = f"=== {rel_path} ===\n(directory)"
try:
target = resolve_workspace_path(
workspace,
rel_path,
field=f"judge artifact {rel_path}",
)
except ValueError as exc:
block = f"=== {rel_path} ===\n(invalid path: {exc})"
else:
content = target.read_text(encoding="utf-8", errors="replace")
block = f"=== {rel_path} ===\n{_truncate_text(content, max(0, remaining - len(rel_path) - 20))}"
if not target.exists():
block = f"=== {rel_path} ===\n(missing)"
elif target.is_dir():
block = f"=== {rel_path} ===\n(directory)"
else:
content = target.read_text(encoding="utf-8", errors="replace")
block = f"=== {rel_path} ===\n{_truncate_text(content, max(0, remaining - len(rel_path) - 20))}"
if remaining <= 0:
break

16
clawbench/paths.py Normal file
View File

@ -0,0 +1,16 @@
"""Path helpers for task-owned workspace references."""
from __future__ import annotations
from pathlib import Path
def resolve_workspace_path(workspace: Path, path: str, *, field: str = "path") -> Path:
    """Resolve a task-declared path and reject workspace escapes.

    Args:
        workspace: Workspace root that ``path`` is interpreted against.
        path: Task-declared path. An absolute value replaces the workspace
            prefix when joined, so it is rejected unless it still resolves
            inside the workspace.
        field: Label for the offending setting, used in error messages.

    Returns:
        The resolved path (symlinks followed), located at or below the
        resolved workspace root.

    Raises:
        ValueError: If the path resolves outside the workspace, or if it
            cannot be resolved at all (for example a symlink loop).
    """
    try:
        root = workspace.resolve()
        candidate = (workspace / path).resolve()
    except (OSError, RuntimeError) as exc:
        # Before Python 3.13, Path.resolve() raises RuntimeError on a symlink
        # loop. Callers only handle ValueError, so surface resolution
        # failures the same way instead of crashing the check.
        raise ValueError(f"{field} cannot be resolved inside workspace: {path}") from exc
    try:
        candidate.relative_to(root)
    except ValueError as exc:
        raise ValueError(f"{field} escapes workspace: {path}") from exc
    return candidate

View File

@ -15,6 +15,7 @@ from typing import Any
import httpx
from clawbench.paths import resolve_workspace_path
from clawbench.render import render_template, render_value
from clawbench.schemas import BackgroundService
@ -80,7 +81,11 @@ async def start_background_services(
service_env.setdefault("PYTHONUNBUFFERED", "1")
command = render_template(spec.command, values)
cwd = workspace / render_template(spec.cwd, values)
cwd = resolve_workspace_path(
workspace,
render_template(spec.cwd, values),
field=f"background service cwd for {spec.name}",
)
log_dir = workspace / ".clawbench-services"
log_dir.mkdir(parents=True, exist_ok=True)
log_path = log_dir / f"{spec.name}.log"
@ -120,11 +125,13 @@ async def _wait_for_service_ready(
) -> None:
spec = service.spec
deadline = time.monotonic() + spec.startup_timeout_seconds
ready_file = (
workspace / render_template(spec.ready_file, runtime_values)
if spec.ready_file
else None
)
ready_file = None
if spec.ready_file:
ready_file = resolve_workspace_path(
workspace,
render_template(spec.ready_file, runtime_values),
field=f"background service ready_file for {spec.name}",
)
ready_url = None
if service.base_url and spec.ready_path:
ready_url = f"{service.base_url.rstrip('/')}/{spec.ready_path.lstrip('/')}"

View File

@ -47,7 +47,7 @@ build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["clawbench"]
force-include = { "tasks-public" = "tasks-public", "profiles" = "profiles", "baselines" = "baselines", "CLAWBENCH_V0_4_SPEC.md" = "CLAWBENCH_V0_4_SPEC.md", "PARTNER_TRACE_SPEC.md" = "PARTNER_TRACE_SPEC.md" }
force-include = { "tasks-public" = "tasks-public", "tasks-domain" = "tasks-domain", "profiles" = "profiles", "baselines" = "baselines", "CLAWBENCH_V0_4_SPEC.md" = "CLAWBENCH_V0_4_SPEC.md", "PARTNER_TRACE_SPEC.md" = "PARTNER_TRACE_SPEC.md" }
[tool.pytest.ini_options]
asyncio_mode = "auto"

View File

@ -2,8 +2,8 @@ from pathlib import Path
import pytest
from clawbench.environment import verify_completion
from clawbench.schemas import CompletionSpec, MemoryState, ToolCall, Transcript, TranscriptMessage
from clawbench.environment import run_execution_check, verify_completion
from clawbench.schemas import CompletionSpec, ExecutionCheck, FileState, MemoryState, ToolCall, Transcript, TranscriptMessage
class MemoryFallbackClient:
@ -45,6 +45,40 @@ async def test_memory_completion_falls_back_to_agent_memory_files(tmp_path: Path
assert result.score == 1.0
@pytest.mark.asyncio
async def test_file_completion_rejects_paths_outside_workspace(tmp_path: Path):
    """A completion file path that climbs out of the workspace must fail the check."""
    secret_file = tmp_path.parent / "outside.txt"
    secret_file.write_text("secret", encoding="utf-8")
    spec = CompletionSpec(files=[FileState(path="../outside.txt")])

    outcome = await verify_completion(
        spec,
        workspace=tmp_path,
        client=MemoryFallbackClient(),  # type: ignore[arg-type]
        session_key="session-test",
        runtime_values={},
    )

    assert outcome.score == 0.0
    assert "escapes workspace" in outcome.failed_assertions[0]
@pytest.mark.asyncio
async def test_execution_check_rejects_expected_file_outside_workspace(tmp_path: Path):
    """An ``expected_stdout_file`` pointing outside the workspace must fail the check."""
    unsafe_check = ExecutionCheck(
        name="unsafe-expected",
        command="printf secret",
        expected_stdout_file="../outside.txt",
    )

    outcome = await run_execution_check(
        unsafe_check,
        workspace=tmp_path,
        runtime_values={},
    )

    assert outcome.passed is False
    assert "escapes workspace" in outcome.reason
@pytest.mark.asyncio
async def test_memory_completion_falls_back_to_transcript_when_memory_rpc_is_unavailable(tmp_path: Path):
completion = CompletionSpec(

View File

@ -165,6 +165,49 @@ def test_compose_result_from_task_stats_supports_parallel_environment_metadata()
assert merged_result.environment["browser_tasks_serialized"] is False
def test_run_cache_path_includes_scoring_inputs(tmp_path: Path):
    """Cache paths are stable for identical settings and differ per judge or prompt variant."""
    task = next(candidate for candidate in load_all_tasks() if candidate.id == "t1-bugfix-discount")

    def build_harness(*, prompt_variant: str, judge_model: str) -> BenchmarkHarness:
        # Everything except the two scoring inputs under test is held constant.
        return BenchmarkHarness(
            gateway_config=GatewayConfig(),
            model="test/model",
            task_ids=[task.id],
            prompt_variant=prompt_variant,
            judge_model=judge_model,
            randomize_order=False,
        )

    base = build_harness(prompt_variant="clear", judge_model="judge-a")
    same = build_harness(prompt_variant="clear", judge_model="judge-a")
    different_judge = build_harness(prompt_variant="clear", judge_model="judge-b")
    different_prompt = build_harness(prompt_variant="ambiguous", judge_model="judge-a")

    base_path = base._run_cache_path(tmp_path, task, 0)
    same_path = same._run_cache_path(tmp_path, task, 0)
    judge_path = different_judge._run_cache_path(tmp_path, task, 0)
    prompt_path = different_prompt._run_cache_path(tmp_path, task, 0)

    assert "v2-" in str(base_path)
    assert base_path == same_path
    assert base_path != judge_path
    assert base_path != prompt_path
@pytest.mark.asyncio
async def test_run_records_adapter_surface(monkeypatch):
task = next(task for task in load_all_tasks() if task.id == "t1-bugfix-discount")

View File

@ -71,6 +71,27 @@ def test_build_judge_prompt_includes_artifacts_completion_feedback_and_transcrip
assert "tool families: read x1" in prompt
def test_build_judge_prompt_rejects_artifact_paths_outside_workspace(tmp_path: Path):
    """An artifact path outside the workspace is reported as invalid and its content is not included."""
    leaked_file = tmp_path.parent / "outside-judge.txt"
    leaked_file.write_text("do not leak", encoding="utf-8")
    expectations = JudgeExpectations(
        rubric="Check that the answer is grounded and auditable.",
        artifact_paths=["../outside-judge.txt"],
    )

    rendered = build_judge_prompt(
        task=_make_task(expectations),
        judge=expectations,
        transcript=Transcript(),
        workspace=tmp_path,
        completion_result=CompletionResult(score=1.0),
    )

    assert "invalid path" in rendered
    assert "do not leak" not in rendered
def test_parse_judge_response_accepts_wrapped_json_and_computes_pass():
result = parse_judge_response(
'Score summary:\n{"score": 0.82, "confidence": 0.66, "reason": "Strong evidence.", "rubric_hits": ["grounded"], "rubric_misses": []}',

View File

@ -7,5 +7,6 @@ def test_wheel_includes_runtime_data_directories():
force_include = pyproject["tool"]["hatch"]["build"]["targets"]["wheel"]["force-include"]
assert force_include["tasks-public"] == "tasks-public"
assert force_include["tasks-domain"] == "tasks-domain"
assert force_include["profiles"] == "profiles"
assert force_include["baselines"] == "baselines"

View File

@ -35,3 +35,21 @@ async def test_background_service_waits_for_ready_file(tmp_path: Path):
finally:
await stop_background_services(services)
@pytest.mark.asyncio
async def test_background_service_rejects_cwd_outside_workspace(tmp_path: Path):
    """Starting a service whose ``cwd`` is the workspace's parent raises ``ValueError``."""
    values = build_runtime_values(workspace=tmp_path, repo_root=Path.cwd())
    escaping_service = BackgroundService(
        name="bad_service",
        command="true",
        cwd="..",
        ready_path=None,
    )

    with pytest.raises(ValueError, match="escapes workspace"):
        await start_background_services(
            [escaping_service],
            workspace=tmp_path,
            repo_root=Path.cwd(),
            runtime_values=values,
        )