Compare commits
1 Commits
main
...
fix/clawbe
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3946e63c7d |
9
.github/workflows/README.md
vendored
9
.github/workflows/README.md
vendored
@ -9,10 +9,11 @@ Runs the repository test suite automatically on:
|
||||
- manual dispatch from the Actions tab
|
||||
|
||||
It uses Python 3.11 and 3.12, installs the package with
|
||||
`pip install -e .`, runs `python -m pytest -q`, then builds a wheel and
|
||||
checks that runtime data such as `tasks-public/`, `profiles/`, and
|
||||
`baselines/` are included. Runs under the `openclaw` organization use the
|
||||
Blacksmith Ubuntu runner; forks fall back to GitHub-hosted `ubuntu-latest`.
|
||||
`pip install -e .[dev]`, runs full Ruff lint plus `python -m pytest -q`,
|
||||
then builds a wheel and checks that runtime data such as `tasks-public/`,
|
||||
`tasks-domain/`, `profiles/`, and `baselines/` are included. Runs under the
|
||||
`openclaw` organization use the Blacksmith Ubuntu runner; forks fall back to
|
||||
GitHub-hosted `ubuntu-latest`.
|
||||
|
||||
## `ci-check-testbox.yml` — Blacksmith Testbox warmup
|
||||
|
||||
|
||||
3
.github/workflows/ci.yml
vendored
3
.github/workflows/ci.yml
vendored
@ -37,7 +37,7 @@ jobs:
|
||||
python -m pip install -e .[dev]
|
||||
|
||||
- name: Run static lint
|
||||
run: python -m ruff check clawbench app.py scripts tests --select F,E9
|
||||
run: python -m ruff check clawbench app.py scripts tests
|
||||
|
||||
- name: Run test suite
|
||||
run: python -m pytest -q
|
||||
@ -54,6 +54,7 @@ jobs:
|
||||
names = set(archive.namelist())
|
||||
required = [
|
||||
"tasks-public/MANIFEST.yaml",
|
||||
"tasks-domain/MANIFEST.yaml",
|
||||
"profiles/example_research_stack.yaml",
|
||||
"baselines/BASELINE_SOURCES.md",
|
||||
]
|
||||
|
||||
@ -504,6 +504,8 @@ clawbench/
|
||||
│ ├── tier1/ ... tier5/ # 19 task YAMLs with verification specs
|
||||
│ └── assets/ # 19 asset packs (verifiers + fixtures)
|
||||
│
|
||||
├── tasks-domain/ # Planned domain coverage scaffold
|
||||
│
|
||||
├── tasks/ # PRIVATE 40-task dev pool (gitignored)
|
||||
│
|
||||
├── scripts/ # Reproducibility + analysis pipeline
|
||||
|
||||
@ -11,6 +11,7 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from clawbench.client import GatewayClient
|
||||
from clawbench.paths import resolve_workspace_path
|
||||
from clawbench.render import render_template, render_value
|
||||
from clawbench.schemas import (
|
||||
CompletionResult,
|
||||
@ -109,7 +110,20 @@ async def run_execution_check(
|
||||
runtime_values: dict[str, Any],
|
||||
) -> ExecutionCheckResult:
|
||||
rendered_command = render_template(spec.command, runtime_values)
|
||||
rendered_cwd = workspace / render_template(spec.cwd, runtime_values)
|
||||
try:
|
||||
rendered_cwd = resolve_workspace_path(
|
||||
workspace,
|
||||
render_template(spec.cwd, runtime_values),
|
||||
field=f"execution check cwd for {spec.name}",
|
||||
)
|
||||
except ValueError as exc:
|
||||
return ExecutionCheckResult(
|
||||
name=spec.name,
|
||||
command=rendered_command,
|
||||
exit_code=-1,
|
||||
passed=False,
|
||||
reason=str(exc),
|
||||
)
|
||||
rendered_env = render_value(spec.env, runtime_values)
|
||||
import os
|
||||
import sys
|
||||
@ -219,7 +233,14 @@ def _evaluate_execution_result(
|
||||
return False, "stdout did not match expected text"
|
||||
|
||||
if spec.expected_stdout_file:
|
||||
expected_path = workspace / render_template(spec.expected_stdout_file, runtime_values)
|
||||
try:
|
||||
expected_path = resolve_workspace_path(
|
||||
workspace,
|
||||
render_template(spec.expected_stdout_file, runtime_values),
|
||||
field=f"expected_stdout_file for {spec.name}",
|
||||
)
|
||||
except ValueError as exc:
|
||||
return False, str(exc)
|
||||
if stdout.strip() != expected_path.read_text(encoding="utf-8").strip():
|
||||
return False, f"stdout did not match {spec.expected_stdout_file}"
|
||||
|
||||
@ -232,7 +253,14 @@ def _evaluate_execution_result(
|
||||
return False, "stdout JSON did not match expected JSON"
|
||||
|
||||
if spec.expected_json_file:
|
||||
expected_path = workspace / render_template(spec.expected_json_file, runtime_values)
|
||||
try:
|
||||
expected_path = resolve_workspace_path(
|
||||
workspace,
|
||||
render_template(spec.expected_json_file, runtime_values),
|
||||
field=f"expected_json_file for {spec.name}",
|
||||
)
|
||||
except ValueError as exc:
|
||||
return False, str(exc)
|
||||
try:
|
||||
parsed = json.loads(stdout)
|
||||
except json.JSONDecodeError as exc:
|
||||
@ -245,7 +273,14 @@ def _evaluate_execution_result(
|
||||
|
||||
|
||||
def _verify_file(spec: FileState, workspace: Path, runtime_values: dict[str, Any]) -> tuple[bool, str]:
|
||||
path = workspace / render_template(spec.path, runtime_values)
|
||||
try:
|
||||
path = resolve_workspace_path(
|
||||
workspace,
|
||||
render_template(spec.path, runtime_values),
|
||||
field=f"completion file {spec.path}",
|
||||
)
|
||||
except ValueError as exc:
|
||||
return False, str(exc)
|
||||
exists = path.exists() and path.is_file()
|
||||
|
||||
if not spec.exists:
|
||||
|
||||
@ -5,6 +5,7 @@ from __future__ import annotations
|
||||
import asyncio
|
||||
import datetime
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
@ -42,6 +43,7 @@ console = Console()
|
||||
|
||||
KNOWN_ADAPTERS = ("openclaw", "hermes", "codex", "claude-code")
|
||||
EXECUTABLE_ADAPTERS = {"openclaw"}
|
||||
RUN_CACHE_SCHEMA_VERSION = 2
|
||||
|
||||
|
||||
class _NullCtx:
|
||||
@ -278,8 +280,7 @@ class BenchmarkHarness:
|
||||
cache_dir_env = os.environ.get("CLAWBENCH_RUN_CACHE_DIR", "/data/run_cache")
|
||||
cache_path: Path | None = None
|
||||
if cache_dir_env:
|
||||
safe_model = self.model.replace("/", "_").replace(":", "_")
|
||||
cache_path = Path(cache_dir_env) / safe_model / task.id / f"run{run_index}.json"
|
||||
cache_path = self._run_cache_path(Path(cache_dir_env), task, run_index)
|
||||
if cache_path.exists():
|
||||
try:
|
||||
cached = TaskRunResult.model_validate_json(cache_path.read_text(encoding="utf-8"))
|
||||
@ -536,6 +537,27 @@ class BenchmarkHarness:
|
||||
target.parent.mkdir(parents=True, exist_ok=True)
|
||||
shutil.copy2(item, target)
|
||||
|
||||
def _run_cache_path(self, cache_root: Path, task: TaskDefinition, run_index: int) -> Path:
    """Build the on-disk location of the cached result for one task run.

    The path is scoped by a short hash of everything recorded in
    ``identity`` below (cache schema version, model, adapter, prompt
    variant, judge model, benchmark version and the task definition
    fingerprint), so a change to any of those values maps to a different
    directory instead of reusing a result produced under other settings.

    Args:
        cache_root: Directory under which all cached runs are stored.
        task: Task whose result is being cached.
        run_index: Index of the run; becomes the ``run{run_index}.json``
            file name.

    Returns:
        ``cache_root / <model> / v<schema>-<scope> / <task id> / run<N>.json``
        where ``<model>`` and ``<task id>`` are sanitised path components.
    """
    identity = {
        "schema": RUN_CACHE_SCHEMA_VERSION,
        "model": self.model,
        "adapter": self.adapter,
        "prompt_variant": self.prompt_variant,
        "judge_model": self.judge_model,
        "benchmark_version": __version__,
        "task_fingerprint": _task_definition_fingerprint(task),
    }
    # Canonical JSON (sorted keys, fixed separators) keeps the digest
    # independent of dict ordering; only the first 16 hex chars are kept
    # to keep the directory name short.
    canonical = json.dumps(identity, sort_keys=True, separators=(",", ":"), default=str)
    scope = hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16]
    return (
        cache_root
        / _safe_cache_component(self.model)
        / f"v{RUN_CACHE_SCHEMA_VERSION}-{scope}"
        / _safe_cache_component(task.id)
        / f"run{run_index}.json"
    )
|
||||
|
||||
async def _assert_browser_support(self, client: GatewayClient, session_key: str) -> None:
|
||||
inventory = await client.get_effective_tools(session_key)
|
||||
tool_ids = {
|
||||
@ -929,5 +951,17 @@ def _count_values(values) -> dict[str, int]:
|
||||
return counts
|
||||
|
||||
|
||||
def _safe_cache_component(value: str) -> str:
    """Turn an arbitrary string into a single safe path component.

    Surrounding whitespace is removed, then every character that is neither
    alphanumeric nor one of ``.``, ``_``, ``-`` is replaced with ``_``.
    Leading and trailing ``.``, ``_`` and ``-`` are stripped afterwards, so
    values such as ``..`` cannot survive; if nothing is left the literal
    ``"unknown"`` is returned.

    Args:
        value: Raw text (for example a model name or task id).

    Returns:
        A non-empty string containing only alphanumerics, ``.``, ``_``, ``-``.
    """
    cleaned = "".join(
        char if char.isalnum() or char in "._-" else "_"
        for char in value.strip()
    )
    return cleaned.strip("._-") or "unknown"
|
||||
|
||||
|
||||
def _task_definition_fingerprint(task: TaskDefinition) -> str:
    """Return a SHA-256 hex digest of the task's JSON-mode dump.

    The dump is serialised with sorted keys and fixed separators so that
    the digest does not depend on key ordering.

    Args:
        task: Task definition; only its ``model_dump(mode="json")`` output
            is used.

    Returns:
        The full 64-character hexadecimal SHA-256 digest.
    """
    payload = task.model_dump(mode="json")
    canonical = json.dumps(payload, sort_keys=True, separators=(",", ":"), default=str)
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def _now_ms() -> int:
    """Return the current ``time.monotonic()`` reading in whole milliseconds.

    This is a monotonic clock value, not an epoch timestamp: use it only to
    measure elapsed time between two calls.
    """
    return int(time.monotonic() * 1000)
|
||||
|
||||
@ -11,6 +11,7 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from clawbench.client import GatewayClient
|
||||
from clawbench.paths import resolve_workspace_path
|
||||
from clawbench.session_labels import unique_session_label
|
||||
from clawbench.schemas import (
|
||||
CompletionResult,
|
||||
@ -51,7 +52,6 @@ async def judge_task_run(
|
||||
)
|
||||
await client.subscribe(session_key)
|
||||
judge_transcript = await client.send_and_wait(session_key, prompt)
|
||||
# Temporary debug: log first 800 chars of raw judge response when parsing fails
|
||||
raw_text = judge_transcript.assistant_text
|
||||
parsed = parse_judge_response(
|
||||
raw_text,
|
||||
@ -59,9 +59,10 @@ async def judge_task_run(
|
||||
)
|
||||
if parsed.error:
|
||||
logger.warning(
|
||||
"Judge parse failed for %s. Raw response (first 800 chars):\n%s",
|
||||
"Judge parse failed for %s: %s (response length=%d)",
|
||||
task.id,
|
||||
raw_text[:800] if raw_text else "(empty)",
|
||||
parsed.error,
|
||||
len(raw_text or ""),
|
||||
)
|
||||
parsed.enabled = True
|
||||
parsed.model = judge_model
|
||||
@ -185,14 +186,22 @@ def _render_artifacts(*, artifact_paths: list[str], workspace: Path, max_chars:
|
||||
remaining = max_chars
|
||||
blocks: list[str] = []
|
||||
for rel_path in artifact_paths:
|
||||
target = workspace / rel_path
|
||||
if not target.exists():
|
||||
block = f"=== {rel_path} ===\n(missing)"
|
||||
elif target.is_dir():
|
||||
block = f"=== {rel_path} ===\n(directory)"
|
||||
try:
|
||||
target = resolve_workspace_path(
|
||||
workspace,
|
||||
rel_path,
|
||||
field=f"judge artifact {rel_path}",
|
||||
)
|
||||
except ValueError as exc:
|
||||
block = f"=== {rel_path} ===\n(invalid path: {exc})"
|
||||
else:
|
||||
content = target.read_text(encoding="utf-8", errors="replace")
|
||||
block = f"=== {rel_path} ===\n{_truncate_text(content, max(0, remaining - len(rel_path) - 20))}"
|
||||
if not target.exists():
|
||||
block = f"=== {rel_path} ===\n(missing)"
|
||||
elif target.is_dir():
|
||||
block = f"=== {rel_path} ===\n(directory)"
|
||||
else:
|
||||
content = target.read_text(encoding="utf-8", errors="replace")
|
||||
block = f"=== {rel_path} ===\n{_truncate_text(content, max(0, remaining - len(rel_path) - 20))}"
|
||||
|
||||
if remaining <= 0:
|
||||
break
|
||||
|
||||
16
clawbench/paths.py
Normal file
16
clawbench/paths.py
Normal file
@ -0,0 +1,16 @@
|
||||
"""Path helpers for task-owned workspace references."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def resolve_workspace_path(workspace: Path, path: str, *, field: str = "path") -> Path:
    """Resolve a task-declared path and reject workspace escapes.

    Args:
        workspace: Directory that task-declared paths are interpreted
            against.
        path: Path taken from a task definition. It is joined onto
            ``workspace`` before resolution, so an absolute ``path``
            replaces the workspace prefix entirely and is then rejected
            unless it still points inside the workspace.
        field: Human-readable label for the value being checked; used as
            the prefix of the error message.

    Returns:
        The fully resolved path, which is the resolved workspace itself or
        a location beneath it.

    Raises:
        ValueError: If the resolved path lies outside the resolved
            workspace.
    """
    root = workspace.resolve()
    # Resolve before comparing so ``..`` segments and symlinks cannot be
    # used to point outside the workspace.
    candidate = (workspace / path).resolve()
    try:
        candidate.relative_to(root)
    except ValueError as exc:
        raise ValueError(f"{field} escapes workspace: {path}") from exc
    return candidate
|
||||
@ -15,6 +15,7 @@ from typing import Any
|
||||
|
||||
import httpx
|
||||
|
||||
from clawbench.paths import resolve_workspace_path
|
||||
from clawbench.render import render_template, render_value
|
||||
from clawbench.schemas import BackgroundService
|
||||
|
||||
@ -80,7 +81,11 @@ async def start_background_services(
|
||||
service_env.setdefault("PYTHONUNBUFFERED", "1")
|
||||
|
||||
command = render_template(spec.command, values)
|
||||
cwd = workspace / render_template(spec.cwd, values)
|
||||
cwd = resolve_workspace_path(
|
||||
workspace,
|
||||
render_template(spec.cwd, values),
|
||||
field=f"background service cwd for {spec.name}",
|
||||
)
|
||||
log_dir = workspace / ".clawbench-services"
|
||||
log_dir.mkdir(parents=True, exist_ok=True)
|
||||
log_path = log_dir / f"{spec.name}.log"
|
||||
@ -120,11 +125,13 @@ async def _wait_for_service_ready(
|
||||
) -> None:
|
||||
spec = service.spec
|
||||
deadline = time.monotonic() + spec.startup_timeout_seconds
|
||||
ready_file = (
|
||||
workspace / render_template(spec.ready_file, runtime_values)
|
||||
if spec.ready_file
|
||||
else None
|
||||
)
|
||||
ready_file = None
|
||||
if spec.ready_file:
|
||||
ready_file = resolve_workspace_path(
|
||||
workspace,
|
||||
render_template(spec.ready_file, runtime_values),
|
||||
field=f"background service ready_file for {spec.name}",
|
||||
)
|
||||
ready_url = None
|
||||
if service.base_url and spec.ready_path:
|
||||
ready_url = f"{service.base_url.rstrip('/')}/{spec.ready_path.lstrip('/')}"
|
||||
|
||||
@ -47,7 +47,7 @@ build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["clawbench"]
|
||||
force-include = { "tasks-public" = "tasks-public", "profiles" = "profiles", "baselines" = "baselines", "CLAWBENCH_V0_4_SPEC.md" = "CLAWBENCH_V0_4_SPEC.md", "PARTNER_TRACE_SPEC.md" = "PARTNER_TRACE_SPEC.md" }
|
||||
force-include = { "tasks-public" = "tasks-public", "tasks-domain" = "tasks-domain", "profiles" = "profiles", "baselines" = "baselines", "CLAWBENCH_V0_4_SPEC.md" = "CLAWBENCH_V0_4_SPEC.md", "PARTNER_TRACE_SPEC.md" = "PARTNER_TRACE_SPEC.md" }
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
asyncio_mode = "auto"
|
||||
|
||||
@ -2,8 +2,8 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from clawbench.environment import verify_completion
|
||||
from clawbench.schemas import CompletionSpec, MemoryState, ToolCall, Transcript, TranscriptMessage
|
||||
from clawbench.environment import run_execution_check, verify_completion
|
||||
from clawbench.schemas import CompletionSpec, ExecutionCheck, FileState, MemoryState, ToolCall, Transcript, TranscriptMessage
|
||||
|
||||
|
||||
class MemoryFallbackClient:
|
||||
@ -45,6 +45,40 @@ async def test_memory_completion_falls_back_to_agent_memory_files(tmp_path: Path
|
||||
assert result.score == 1.0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_file_completion_rejects_paths_outside_workspace(tmp_path: Path):
    """A completion file path that climbs out of the workspace scores zero."""
    # Nest the workspace inside tmp_path so the "outside" file stays in this
    # test's private directory instead of the temp root shared by all tests.
    workspace = tmp_path / "workspace"
    workspace.mkdir()
    outside = tmp_path / "outside.txt"
    outside.write_text("secret", encoding="utf-8")
    completion = CompletionSpec(files=[FileState(path="../outside.txt")])

    result = await verify_completion(
        completion,
        workspace=workspace,
        client=MemoryFallbackClient(),  # type: ignore[arg-type]
        session_key="session-test",
        runtime_values={},
    )

    assert result.score == 0.0
    assert "escapes workspace" in result.failed_assertions[0]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_execution_check_rejects_expected_file_outside_workspace(tmp_path: Path):
    """An expected-stdout file outside the workspace fails the check."""
    # Create the outside file here, with content matching the command's
    # output, so the test does not rely on a file left behind by another
    # test: without the path guard the check would pass and this test fail.
    workspace = tmp_path / "workspace"
    workspace.mkdir()
    (tmp_path / "outside.txt").write_text("secret", encoding="utf-8")

    result = await run_execution_check(
        ExecutionCheck(
            name="unsafe-expected",
            command="printf secret",
            expected_stdout_file="../outside.txt",
        ),
        workspace=workspace,
        runtime_values={},
    )

    assert result.passed is False
    assert "escapes workspace" in result.reason
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_memory_completion_falls_back_to_transcript_when_memory_rpc_is_unavailable(tmp_path: Path):
|
||||
completion = CompletionSpec(
|
||||
|
||||
@ -165,6 +165,49 @@ def test_compose_result_from_task_stats_supports_parallel_environment_metadata()
|
||||
assert merged_result.environment["browser_tasks_serialized"] is False
|
||||
|
||||
|
||||
def test_run_cache_path_includes_scoring_inputs(tmp_path: Path):
    """The cache path is stable for equal settings and changes with them.

    Two harnesses with identical settings must map to the same path, while
    changing the judge model or the prompt variant must map elsewhere.
    """
    task = next(task for task in load_all_tasks() if task.id == "t1-bugfix-discount")

    def make_harness(*, prompt_variant: str = "clear", judge_model: str = "judge-a") -> BenchmarkHarness:
        # Only the two settings under test vary; everything else is fixed.
        return BenchmarkHarness(
            gateway_config=GatewayConfig(),
            model="test/model",
            task_ids=[task.id],
            prompt_variant=prompt_variant,
            judge_model=judge_model,
            randomize_order=False,
        )

    base_path = make_harness()._run_cache_path(tmp_path, task, 0)

    # Check the versioned scope component itself rather than searching the
    # whole path string, which also contains the temp directory name.
    assert base_path.relative_to(tmp_path).parts[1].startswith("v2-")
    assert base_path == make_harness()._run_cache_path(tmp_path, task, 0)
    assert base_path != make_harness(judge_model="judge-b")._run_cache_path(tmp_path, task, 0)
    assert base_path != make_harness(prompt_variant="ambiguous")._run_cache_path(tmp_path, task, 0)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_run_records_adapter_surface(monkeypatch):
|
||||
task = next(task for task in load_all_tasks() if task.id == "t1-bugfix-discount")
|
||||
|
||||
@ -71,6 +71,27 @@ def test_build_judge_prompt_includes_artifacts_completion_feedback_and_transcrip
|
||||
assert "tool families: read x1" in prompt
|
||||
|
||||
|
||||
def test_build_judge_prompt_rejects_artifact_paths_outside_workspace(tmp_path: Path):
    """Artifact paths escaping the workspace are reported, never inlined."""
    # Nest the workspace inside tmp_path so the "outside" file stays in this
    # test's private directory instead of the temp root shared by all tests.
    workspace = tmp_path / "workspace"
    workspace.mkdir()
    outside = tmp_path / "outside-judge.txt"
    outside.write_text("do not leak", encoding="utf-8")
    judge = JudgeExpectations(
        rubric="Check that the answer is grounded and auditable.",
        artifact_paths=["../outside-judge.txt"],
    )
    task = _make_task(judge)

    prompt = build_judge_prompt(
        task=task,
        judge=judge,
        transcript=Transcript(),
        workspace=workspace,
        completion_result=CompletionResult(score=1.0),
    )

    assert "invalid path" in prompt
    assert "do not leak" not in prompt
|
||||
|
||||
|
||||
def test_parse_judge_response_accepts_wrapped_json_and_computes_pass():
|
||||
result = parse_judge_response(
|
||||
'Score summary:\n{"score": 0.82, "confidence": 0.66, "reason": "Strong evidence.", "rubric_hits": ["grounded"], "rubric_misses": []}',
|
||||
|
||||
@ -7,5 +7,6 @@ def test_wheel_includes_runtime_data_directories():
|
||||
force_include = pyproject["tool"]["hatch"]["build"]["targets"]["wheel"]["force-include"]
|
||||
|
||||
assert force_include["tasks-public"] == "tasks-public"
|
||||
assert force_include["tasks-domain"] == "tasks-domain"
|
||||
assert force_include["profiles"] == "profiles"
|
||||
assert force_include["baselines"] == "baselines"
|
||||
|
||||
@ -35,3 +35,21 @@ async def test_background_service_waits_for_ready_file(tmp_path: Path):
|
||||
finally:
|
||||
await stop_background_services(services)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_background_service_rejects_cwd_outside_workspace(tmp_path: Path):
    """Starting a service whose cwd leaves the workspace raises ValueError."""
    # Nest the workspace so that ".." resolves to this test's own tmp_path
    # rather than the temp root shared by all tests.
    workspace = tmp_path / "workspace"
    workspace.mkdir()
    repo_root = Path.cwd()
    runtime_values = build_runtime_values(workspace=workspace, repo_root=repo_root)
    service = BackgroundService(
        name="bad_service",
        command="true",
        cwd="..",
        ready_path=None,
    )

    with pytest.raises(ValueError, match="escapes workspace"):
        await start_background_services(
            [service],
            workspace=workspace,
            repo_root=repo_root,
            runtime_values=runtime_values,
        )
|
||||
|
||||
Loading…
Reference in New Issue
Block a user