fix(runtime): harden benchmark cache and task paths

2026-04-28 22:37:07 -07:00
14 changed files with 252 additions and 30 deletions
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@ -9,10 +9,11 @@ Runs the repository test suite automatically on:
 - manual dispatch from the Actions tab

 It uses Python 3.11 and 3.12, installs the package with
-`pip install -e .`, runs `python -m pytest -q`, then builds a wheel and
-checks that runtime data such as `tasks-public/`, `profiles/`, and
-`baselines/` are included. Runs under the `openclaw` organization use the
-Blacksmith Ubuntu runner; forks fall back to GitHub-hosted `ubuntu-latest`.
+`pip install -e .[dev]`, runs full Ruff lint plus `python -m pytest -q`,
+then builds a wheel and checks that runtime data such as `tasks-public/`,
+`tasks-domain/`, `profiles/`, and `baselines/` are included. Runs under the
+`openclaw` organization use the Blacksmith Ubuntu runner; forks fall back to
+GitHub-hosted `ubuntu-latest`.

 ## `ci-check-testbox.yml` — Blacksmith Testbox warmup

--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -37,7 +37,7 @@ jobs:
          python -m pip install -e .[dev]

      - name: Run static lint
-        run: python -m ruff check clawbench app.py scripts tests --select F,E9
+        run: python -m ruff check clawbench app.py scripts tests

      - name: Run test suite
        run: python -m pytest -q
@ -54,6 +54,7 @@ jobs:
              names = set(archive.namelist())
          required = [
              "tasks-public/MANIFEST.yaml",
+              "tasks-domain/MANIFEST.yaml",
              "profiles/example_research_stack.yaml",
              "baselines/BASELINE_SOURCES.md",
          ]
--- a/README.md
+++ b/README.md
@ -504,6 +504,8 @@ clawbench/
 │   ├── tier1/ ... tier5/           # 19 task YAMLs with verification specs
 │   └── assets/                     # 19 asset packs (verifiers + fixtures)
 │
+├── tasks-domain/                   # Planned domain coverage scaffold
+│
 ├── tasks/                          # PRIVATE 40-task dev pool (gitignored)
 │
 ├── scripts/                        # Reproducibility + analysis pipeline
--- a/clawbench/environment.py
+++ b/clawbench/environment.py
@ -11,6 +11,7 @@ from pathlib import Path
 from typing import Any

 from clawbench.client import GatewayClient
+from clawbench.paths import resolve_workspace_path
 from clawbench.render import render_template, render_value
 from clawbench.schemas import (
    CompletionResult,
@ -109,7 +110,20 @@ async def run_execution_check(
    runtime_values: dict[str, Any],
 ) -> ExecutionCheckResult:
    rendered_command = render_template(spec.command, runtime_values)
-    rendered_cwd = workspace / render_template(spec.cwd, runtime_values)
+    try:
+        rendered_cwd = resolve_workspace_path(
+            workspace,
+            render_template(spec.cwd, runtime_values),
+            field=f"execution check cwd for {spec.name}",
+        )
+    except ValueError as exc:
+        return ExecutionCheckResult(
+            name=spec.name,
+            command=rendered_command,
+            exit_code=-1,
+            passed=False,
+            reason=str(exc),
+        )
    rendered_env = render_value(spec.env, runtime_values)
    import os
    import sys
@ -219,7 +233,14 @@ def _evaluate_execution_result(
            return False, "stdout did not match expected text"

    if spec.expected_stdout_file:
-        expected_path = workspace / render_template(spec.expected_stdout_file, runtime_values)
+        try:
+            expected_path = resolve_workspace_path(
+                workspace,
+                render_template(spec.expected_stdout_file, runtime_values),
+                field=f"expected_stdout_file for {spec.name}",
+            )
+        except ValueError as exc:
+            return False, str(exc)
        if stdout.strip() != expected_path.read_text(encoding="utf-8").strip():
            return False, f"stdout did not match {spec.expected_stdout_file}"

@ -232,7 +253,14 @@ def _evaluate_execution_result(
            return False, "stdout JSON did not match expected JSON"

    if spec.expected_json_file:
-        expected_path = workspace / render_template(spec.expected_json_file, runtime_values)
+        try:
+            expected_path = resolve_workspace_path(
+                workspace,
+                render_template(spec.expected_json_file, runtime_values),
+                field=f"expected_json_file for {spec.name}",
+            )
+        except ValueError as exc:
+            return False, str(exc)
        try:
            parsed = json.loads(stdout)
        except json.JSONDecodeError as exc:
@ -245,7 +273,14 @@ def _evaluate_execution_result(


 def _verify_file(spec: FileState, workspace: Path, runtime_values: dict[str, Any]) -> tuple[bool, str]:
-    path = workspace / render_template(spec.path, runtime_values)
+    try:
+        path = resolve_workspace_path(
+            workspace,
+            render_template(spec.path, runtime_values),
+            field=f"completion file {spec.path}",
+        )
+    except ValueError as exc:
+        return False, str(exc)
    exists = path.exists() and path.is_file()

    if not spec.exists:
--- a/clawbench/harness.py
+++ b/clawbench/harness.py
@ -5,6 +5,7 @@ from __future__ import annotations
 import asyncio
 import datetime
 import hashlib
+import json
 import logging
 import os
 import shutil
@ -42,6 +43,7 @@ console = Console()

 KNOWN_ADAPTERS = ("openclaw", "hermes", "codex", "claude-code")
 EXECUTABLE_ADAPTERS = {"openclaw"}
+RUN_CACHE_SCHEMA_VERSION = 2


 class _NullCtx:
@ -278,8 +280,7 @@ class BenchmarkHarness:
        cache_dir_env = os.environ.get("CLAWBENCH_RUN_CACHE_DIR", "/data/run_cache")
        cache_path: Path | None = None
        if cache_dir_env:
-            safe_model = self.model.replace("/", "_").replace(":", "_")
-            cache_path = Path(cache_dir_env) / safe_model / task.id / f"run{run_index}.json"
+            cache_path = self._run_cache_path(Path(cache_dir_env), task, run_index)
            if cache_path.exists():
                try:
                    cached = TaskRunResult.model_validate_json(cache_path.read_text(encoding="utf-8"))
@ -536,6 +537,27 @@ class BenchmarkHarness:
                target.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(item, target)

+    def _run_cache_path(self, cache_root: Path, task: TaskDefinition, run_index: int) -> Path:
+        """Return the cache file location for one run of ``task``.
+
+        The path contains a scope directory derived from a SHA-256 digest over
+        the cache schema version, model, adapter, prompt variant, judge model,
+        benchmark version, and the task definition fingerprint, so changing any
+        of those inputs selects a different cache location.
+
+        Layout: ``<cache_root>/<model>/v<schema>-<digest16>/<task id>/run<N>.json``.
+        """
+        scoring_inputs = {
+            "schema": RUN_CACHE_SCHEMA_VERSION,
+            "model": self.model,
+            "adapter": self.adapter,
+            "prompt_variant": self.prompt_variant,
+            "judge_model": self.judge_model,
+            "benchmark_version": __version__,
+            "task_fingerprint": _task_definition_fingerprint(task),
+        }
+        # Canonical JSON (sorted keys, no whitespace) keeps the digest stable
+        # regardless of dict insertion order.
+        canonical = json.dumps(scoring_inputs, sort_keys=True, separators=(",", ":"), default=str)
+        digest = hashlib.sha256(canonical.encode("utf-8")).hexdigest()
+        model_dir = _safe_cache_component(self.model)
+        scope_dir = f"v{RUN_CACHE_SCHEMA_VERSION}-{digest[:16]}"
+        task_dir = _safe_cache_component(task.id)
+        return cache_root / model_dir / scope_dir / task_dir / f"run{run_index}.json"
+
    async def _assert_browser_support(self, client: GatewayClient, session_key: str) -> None:
        inventory = await client.get_effective_tools(session_key)
        tool_ids = {
@ -929,5 +951,17 @@ def _count_values(values) -> dict[str, int]:
    return counts


+def _safe_cache_component(value: str) -> str:
+    """Turn ``value`` into a token usable as a single cache directory name.
+
+    Surrounding whitespace is dropped, every character that is neither
+    alphanumeric nor one of ``._-`` becomes ``_``, and leading/trailing
+    ``._-`` characters are trimmed. An empty result becomes ``"unknown"``.
+    """
+    punctuation = "._-"
+    mapped = (ch if (ch.isalnum() or ch in punctuation) else "_" for ch in value.strip())
+    slug = "".join(mapped).strip(punctuation)
+    if not slug:
+        return "unknown"
+    return slug
+
+
+def _task_definition_fingerprint(task: TaskDefinition) -> str:
+    """Return the SHA-256 hex digest of the task's JSON-mode dump.
+
+    The dump is serialised with sorted keys and compact separators so that
+    equal dumps always hash to the same value.
+    """
+    canonical = json.dumps(
+        task.model_dump(mode="json"),
+        sort_keys=True,
+        separators=(",", ":"),
+        default=str,
+    )
+    hasher = hashlib.sha256()
+    hasher.update(canonical.encode("utf-8"))
+    return hasher.hexdigest()
+
+
 def _now_ms() -> int:
    return int(time.monotonic() * 1000)
--- a/clawbench/judge.py
+++ b/clawbench/judge.py
@ -11,6 +11,7 @@ from pathlib import Path
 from typing import Any

 from clawbench.client import GatewayClient
+from clawbench.paths import resolve_workspace_path
 from clawbench.session_labels import unique_session_label
 from clawbench.schemas import (
    CompletionResult,
@ -51,7 +52,6 @@ async def judge_task_run(
        )
        await client.subscribe(session_key)
        judge_transcript = await client.send_and_wait(session_key, prompt)
-        # Temporary debug: log first 800 chars of raw judge response when parsing fails
        raw_text = judge_transcript.assistant_text
        parsed = parse_judge_response(
            raw_text,
@ -59,9 +59,10 @@ async def judge_task_run(
        )
        if parsed.error:
            logger.warning(
-                "Judge parse failed for %s. Raw response (first 800 chars):\n%s",
+                "Judge parse failed for %s: %s (response length=%d)",
                task.id,
-                raw_text[:800] if raw_text else "(empty)",
+                parsed.error,
+                len(raw_text or ""),
            )
        parsed.enabled = True
        parsed.model = judge_model
@ -185,14 +186,22 @@ def _render_artifacts(*, artifact_paths: list[str], workspace: Path, max_chars:
    remaining = max_chars
    blocks: list[str] = []
    for rel_path in artifact_paths:
-        target = workspace / rel_path
-        if not target.exists():
-            block = f"=== {rel_path} ===\n(missing)"
-        elif target.is_dir():
-            block = f"=== {rel_path} ===\n(directory)"
+        try:
+            target = resolve_workspace_path(
+                workspace,
+                rel_path,
+                field=f"judge artifact {rel_path}",
+            )
+        except ValueError as exc:
+            block = f"=== {rel_path} ===\n(invalid path: {exc})"
        else:
-            content = target.read_text(encoding="utf-8", errors="replace")
-            block = f"=== {rel_path} ===\n{_truncate_text(content, max(0, remaining - len(rel_path) - 20))}"
+            if not target.exists():
+                block = f"=== {rel_path} ===\n(missing)"
+            elif target.is_dir():
+                block = f"=== {rel_path} ===\n(directory)"
+            else:
+                content = target.read_text(encoding="utf-8", errors="replace")
+                block = f"=== {rel_path} ===\n{_truncate_text(content, max(0, remaining - len(rel_path) - 20))}"

        if remaining <= 0:
            break
--- a/clawbench/paths.py
+++ b/clawbench/paths.py
@ -0,0 +1,16 @@
+"""Path helpers for task-owned workspace references."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+
+def resolve_workspace_path(workspace: Path, path: str, *, field: str = "path") -> Path:
+    """Resolve a task-declared path and reject workspace escapes.
+
+    Args:
+        workspace: Directory that task-declared paths must stay inside.
+        path: Task-declared path; it is joined onto ``workspace`` and resolved.
+        field: Label for the offending setting, used in error messages.
+
+    Returns:
+        The resolved path, equal to or located below the resolved workspace.
+
+    Raises:
+        ValueError: If the path cannot be resolved, or resolves to a location
+            outside the workspace.
+    """
+    try:
+        root = workspace.resolve()
+        candidate = (workspace / path).resolve()
+    except (OSError, RuntimeError, ValueError) as exc:
+        # Path.resolve() signals symlink loops with RuntimeError before
+        # Python 3.13. Callers only handle ValueError, so normalise resolution
+        # failures instead of letting them crash the surrounding check.
+        raise ValueError(f"{field} could not be resolved: {path}") from exc
+    try:
+        candidate.relative_to(root)
+    except ValueError as exc:
+        raise ValueError(f"{field} escapes workspace: {path}") from exc
+    return candidate
--- a/clawbench/services.py
+++ b/clawbench/services.py
@ -15,6 +15,7 @@ from typing import Any

 import httpx

+from clawbench.paths import resolve_workspace_path
 from clawbench.render import render_template, render_value
 from clawbench.schemas import BackgroundService

@ -80,7 +81,11 @@ async def start_background_services(
        service_env.setdefault("PYTHONUNBUFFERED", "1")

        command = render_template(spec.command, values)
-        cwd = workspace / render_template(spec.cwd, values)
+        cwd = resolve_workspace_path(
+            workspace,
+            render_template(spec.cwd, values),
+            field=f"background service cwd for {spec.name}",
+        )
        log_dir = workspace / ".clawbench-services"
        log_dir.mkdir(parents=True, exist_ok=True)
        log_path = log_dir / f"{spec.name}.log"
@ -120,11 +125,13 @@ async def _wait_for_service_ready(
 ) -> None:
    spec = service.spec
    deadline = time.monotonic() + spec.startup_timeout_seconds
-    ready_file = (
-        workspace / render_template(spec.ready_file, runtime_values)
-        if spec.ready_file
-        else None
-    )
+    ready_file = None
+    if spec.ready_file:
+        ready_file = resolve_workspace_path(
+            workspace,
+            render_template(spec.ready_file, runtime_values),
+            field=f"background service ready_file for {spec.name}",
+        )
    ready_url = None
    if service.base_url and spec.ready_path:
        ready_url = f"{service.base_url.rstrip('/')}/{spec.ready_path.lstrip('/')}"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -47,7 +47,7 @@ build-backend = "hatchling.build"

 [tool.hatch.build.targets.wheel]
 packages = ["clawbench"]
-force-include = { "tasks-public" = "tasks-public", "profiles" = "profiles", "baselines" = "baselines", "CLAWBENCH_V0_4_SPEC.md" = "CLAWBENCH_V0_4_SPEC.md", "PARTNER_TRACE_SPEC.md" = "PARTNER_TRACE_SPEC.md" }
+force-include = { "tasks-public" = "tasks-public", "tasks-domain" = "tasks-domain", "profiles" = "profiles", "baselines" = "baselines", "CLAWBENCH_V0_4_SPEC.md" = "CLAWBENCH_V0_4_SPEC.md", "PARTNER_TRACE_SPEC.md" = "PARTNER_TRACE_SPEC.md" }

 [tool.pytest.ini_options]
 asyncio_mode = "auto"
--- a/tests/test_environment.py
+++ b/tests/test_environment.py
@ -2,8 +2,8 @@ from pathlib import Path

 import pytest

-from clawbench.environment import verify_completion
-from clawbench.schemas import CompletionSpec, MemoryState, ToolCall, Transcript, TranscriptMessage
+from clawbench.environment import run_execution_check, verify_completion
+from clawbench.schemas import CompletionSpec, ExecutionCheck, FileState, MemoryState, ToolCall, Transcript, TranscriptMessage


 class MemoryFallbackClient:
@ -45,6 +45,40 @@ async def test_memory_completion_falls_back_to_agent_memory_files(tmp_path: Path
    assert result.score == 1.0


+@pytest.mark.asyncio
+async def test_file_completion_rejects_paths_outside_workspace(tmp_path: Path):
+    """A completion file path that climbs out of the workspace must fail verification."""
+    # Use a workspace subdirectory so the "outside" file still lives under this
+    # test's own tmp_path instead of the parent directory other tests share.
+    workspace = tmp_path / "workspace"
+    workspace.mkdir()
+    outside = tmp_path / "outside.txt"
+    outside.write_text("secret", encoding="utf-8")
+    completion = CompletionSpec(files=[FileState(path="../outside.txt")])
+
+    result = await verify_completion(
+        completion,
+        workspace=workspace,
+        client=MemoryFallbackClient(),  # type: ignore[arg-type]
+        session_key="session-test",
+        runtime_values={},
+    )
+
+    assert result.score == 0.0
+    assert "escapes workspace" in result.failed_assertions[0]
+
+
+@pytest.mark.asyncio
+async def test_execution_check_rejects_expected_file_outside_workspace(tmp_path: Path):
+    """An expected_stdout_file that climbs out of the workspace must fail the check."""
+    # Create the outside file here, with content equal to the command's stdout,
+    # so the test does not depend on a file left behind by another test and the
+    # only reason for the check to fail is the workspace-escape guard.
+    workspace = tmp_path / "workspace"
+    workspace.mkdir()
+    (tmp_path / "outside.txt").write_text("secret", encoding="utf-8")
+
+    result = await run_execution_check(
+        ExecutionCheck(
+            name="unsafe-expected",
+            command="printf secret",
+            expected_stdout_file="../outside.txt",
+        ),
+        workspace=workspace,
+        runtime_values={},
+    )
+
+    assert result.passed is False
+    assert "escapes workspace" in result.reason
+
+
@pytest.mark.asyncio
 async def test_memory_completion_falls_back_to_transcript_when_memory_rpc_is_unavailable(tmp_path: Path):
    completion = CompletionSpec(
--- a/tests/test_harness.py
+++ b/tests/test_harness.py
@ -165,6 +165,49 @@ def test_compose_result_from_task_stats_supports_parallel_environment_metadata()
    assert merged_result.environment["browser_tasks_serialized"] is False


+def test_run_cache_path_includes_scoring_inputs(tmp_path: Path):
+    """Cache paths match for identical settings and differ per judge model or prompt variant."""
+    task = next(candidate for candidate in load_all_tasks() if candidate.id == "t1-bugfix-discount")
+
+    def build_harness(*, prompt_variant: str, judge_model: str) -> BenchmarkHarness:
+        # Everything except the two scoring inputs under test is held constant.
+        return BenchmarkHarness(
+            gateway_config=GatewayConfig(),
+            model="test/model",
+            task_ids=[task.id],
+            prompt_variant=prompt_variant,
+            judge_model=judge_model,
+            randomize_order=False,
+        )
+
+    base = build_harness(prompt_variant="clear", judge_model="judge-a")
+    same = build_harness(prompt_variant="clear", judge_model="judge-a")
+    different_judge = build_harness(prompt_variant="clear", judge_model="judge-b")
+    different_prompt = build_harness(prompt_variant="ambiguous", judge_model="judge-a")
+
+    base_path = base._run_cache_path(tmp_path, task, 0)
+    same_path = same._run_cache_path(tmp_path, task, 0)
+    judge_path = different_judge._run_cache_path(tmp_path, task, 0)
+    prompt_path = different_prompt._run_cache_path(tmp_path, task, 0)
+
+    assert "v2-" in str(base_path)
+    assert base_path == same_path
+    assert base_path != judge_path
+    assert base_path != prompt_path
+
+
@pytest.mark.asyncio
 async def test_run_records_adapter_surface(monkeypatch):
    task = next(task for task in load_all_tasks() if task.id == "t1-bugfix-discount")
--- a/tests/test_judge.py
+++ b/tests/test_judge.py
@ -71,6 +71,27 @@ def test_build_judge_prompt_includes_artifacts_completion_feedback_and_transcrip
    assert "tool families: read x1" in prompt


+def test_build_judge_prompt_rejects_artifact_paths_outside_workspace(tmp_path: Path):
+    """Artifacts declared outside the workspace are reported as invalid and never read."""
+    # Use a workspace subdirectory so the "outside" file still lives under this
+    # test's own tmp_path instead of the parent directory other tests share.
+    workspace = tmp_path / "workspace"
+    workspace.mkdir()
+    outside = tmp_path / "outside-judge.txt"
+    outside.write_text("do not leak", encoding="utf-8")
+    judge = JudgeExpectations(
+        rubric="Check that the answer is grounded and auditable.",
+        artifact_paths=["../outside-judge.txt"],
+    )
+    task = _make_task(judge)
+
+    prompt = build_judge_prompt(
+        task=task,
+        judge=judge,
+        transcript=Transcript(),
+        workspace=workspace,
+        completion_result=CompletionResult(score=1.0),
+    )
+
+    assert "invalid path" in prompt
+    assert "do not leak" not in prompt
+
+
 def test_parse_judge_response_accepts_wrapped_json_and_computes_pass():
    result = parse_judge_response(
        'Score summary:\n{"score": 0.82, "confidence": 0.66, "reason": "Strong evidence.", "rubric_hits": ["grounded"], "rubric_misses": []}',
--- a/tests/test_packaging.py
+++ b/tests/test_packaging.py
@ -7,5 +7,6 @@ def test_wheel_includes_runtime_data_directories():
    force_include = pyproject["tool"]["hatch"]["build"]["targets"]["wheel"]["force-include"]

    assert force_include["tasks-public"] == "tasks-public"
+    assert force_include["tasks-domain"] == "tasks-domain"
    assert force_include["profiles"] == "profiles"
    assert force_include["baselines"] == "baselines"
--- a/tests/test_services.py
+++ b/tests/test_services.py
@ -35,3 +35,21 @@ async def test_background_service_waits_for_ready_file(tmp_path: Path):
    finally:
        await stop_background_services(services)

+
+@pytest.mark.asyncio
+async def test_background_service_rejects_cwd_outside_workspace(tmp_path: Path):
+    """Starting a service whose cwd is ``..`` raises a workspace-escape ValueError."""
+    repo_root = Path.cwd()
+    values = build_runtime_values(workspace=tmp_path, repo_root=repo_root)
+    escaping_service = BackgroundService(
+        name="bad_service",
+        command="true",
+        cwd="..",
+        ready_path=None,
+    )
+
+    with pytest.raises(ValueError, match="escapes workspace"):
+        await start_background_services(
+            [escaping_service],
+            workspace=tmp_path,
+            repo_root=repo_root,
+            runtime_values=values,
+        )