fix(worker): harden runtime result writes

2026-04-29 13:16:40 -07:00 · 2026-04-29 13:16:40 -07:00 · 82bcfc1891
commit 82bcfc1891
parent ea17c715b3
3 changed files with 339 additions and 0 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -39,6 +39,9 @@ jobs:
      - name: Run static lint
        run: python -m ruff check clawbench app.py scripts tests

+      - name: Run runtime contract smoke tests
+        run: python -m pytest -q tests/test_runtime_contracts.py
+
      - name: Run test suite
        run: python -m pytest -q

--- a/clawbench/worker.py
+++ b/clawbench/worker.py
@ -225,6 +225,7 @@ class EvalWorker:
                job.job_id,
                progress.mark_status("Uploading results", clear_active=True),
            )
+            RESULTS_DIR.mkdir(parents=True, exist_ok=True)
            result_path = RESULTS_DIR / f"{result.submission_id}.json"
            result_path.write_text(json.dumps(result.model_dump(), indent=2), encoding="utf-8")

--- a/tests/test_runtime_contracts.py
+++ b/tests/test_runtime_contracts.py
@ -0,0 +1,335 @@
+from __future__ import annotations
+
+import datetime
+import importlib
+import json
+import sys
+import threading
+from pathlib import Path
+
+import pytest
+
+from clawbench.client import GatewayConfig
+from clawbench.harness import BenchmarkHarness
+from clawbench.queue import Job, JobQueue, JobStatus, SubmissionRequest
+import clawbench.queue as queue_module
+from clawbench.schemas import (
+    CompletionSpec,
+    ExecutionCheck,
+    SimulatedUser,
+    TaskDefinition,
+    TaskFamily,
+    Tier,
+    ToolCall,
+    TrajectoryExpectations,
+    Transcript,
+    TranscriptMessage,
+    UserTurn,
+)
+from clawbench.worker import EvalWorker
+
+
+def _runtime_task() -> TaskDefinition:
+    """Build the single-turn tools task shared by the runtime contract tests.
+
+    The task asks for ``answer.txt`` to be created, verifies its contents with
+    one execution check, and requires read, edit, and execute tool families
+    in the trajectory.
+    """
+    opening_turn = UserTurn(message="create answer.txt with runtime ok, then verify it")
+    simulated_user = SimulatedUser(max_turns=1, turns=[opening_turn])
+
+    # The check asserts the exact file contents, trailing newline included.
+    artifact_check = ExecutionCheck(
+        name="answer artifact",
+        command=(
+            "{python_exe} -c "
+            "\"from pathlib import Path; "
+            "assert Path('answer.txt').read_text(encoding='utf-8') == 'runtime ok\\n'\""
+        ),
+    )
+    completion_spec = CompletionSpec(execution_checks=[artifact_check])
+
+    trajectory_spec = TrajectoryExpectations(
+        required_families=["read", "edit", "execute"],
+        min_distinct_families=3,
+        require_read_before_mutation=True,
+        require_self_verification=True,
+    )
+
+    return TaskDefinition(
+        id="runtime-contract-smoke",
+        name="Runtime Contract Smoke",
+        tier=Tier.TIER1,
+        family=TaskFamily.TOOLS,
+        surface="tools",
+        user=simulated_user,
+        completion=completion_spec,
+        trajectory=trajectory_spec,
+    )
+
+
+class _GatewayState:
+    """Mutable bookkeeping record used by the fake gateway clients in this module."""
+
+    def __init__(self) -> None:
+        # Ids passed to the fake's delete calls, in call order.
+        self.deleted_agents: list[str] = []
+        self.deleted_sessions: list[str] = []
+        # session key -> agent id, and agent id -> workspace path.
+        self.session_agents: dict[str, str] = {}
+        self.agent_workspaces: dict[str, Path] = {}
+
+
+class _SuccessfulGatewayClient:
+    """In-memory gateway client double whose turns always succeed.
+
+    Agents and sessions are tracked in the class-level ``state`` record, which
+    is shared by every instance (and subclass) of this fake.
+    """
+
+    state = _GatewayState()
+
+    def __init__(self, config: GatewayConfig | None = None) -> None:
+        self.config = config if config else GatewayConfig()
+
+    async def __aenter__(self) -> _SuccessfulGatewayClient:
+        return self
+
+    async def __aexit__(self, *exc: object) -> None:
+        return None
+
+    async def create_agent(self, *, name: str, workspace: str) -> str:
+        """Register ``workspace`` under a freshly numbered agent id and return that id."""
+        known_agents = self.state.agent_workspaces
+        next_number = len(known_agents) + 1
+        agent_id = f"agent-{next_number}"
+        known_agents[agent_id] = Path(workspace)
+        return agent_id
+
+    async def create_session(self, *, model: str, agent_id: str, label: str) -> str:  # noqa: ARG002
+        """Bind a freshly numbered session key to ``agent_id`` and return the key."""
+        known_sessions = self.state.session_agents
+        next_number = len(known_sessions) + 1
+        session_key = f"session-{next_number}"
+        known_sessions[session_key] = agent_id
+        return session_key
+
+    async def subscribe(self, session_key: str) -> None:  # noqa: ARG002
+        return None
+
+    async def send_and_wait(self, session_key: str, message: str, *, timeout: float) -> Transcript:  # noqa: ARG002
+        """Write ``answer.txt`` into the session's workspace and return a canned transcript."""
+        owning_agent = self.state.session_agents[session_key]
+        answer_file = self.state.agent_workspaces[owning_agent] / "answer.txt"
+        answer_file.write_text("runtime ok\n", encoding="utf-8")
+
+        # One call each for reading, writing, and running a command, in that order.
+        scripted_calls = [
+            ToolCall(
+                name="read_file",
+                input={"path": "answer.txt"},
+                output="missing",
+                success=True,
+            ),
+            ToolCall(
+                name="write_file",
+                input={"path": "answer.txt"},
+                output="wrote answer.txt",
+                success=True,
+            ),
+            ToolCall(
+                name="shell",
+                input={"command": "python -m pytest -q"},
+                output="1 passed",
+                success=True,
+            ),
+        ]
+        working_turn = TranscriptMessage(
+            role="assistant",
+            text="i'll inspect, write the answer, then verify it.",
+            tool_calls=scripted_calls,
+        )
+        closing_turn = TranscriptMessage(role="assistant", text="done, verified.")
+        return Transcript(messages=[working_turn, closing_turn])
+
+    async def delete_session(self, session_key: str) -> None:
+        """Record that ``session_key`` was deleted."""
+        removed_sessions = self.state.deleted_sessions
+        removed_sessions.append(session_key)
+
+    async def delete_agent(self, agent_id: str, *, delete_files: bool = False) -> None:  # noqa: ARG002
+        """Record that ``agent_id`` was deleted; no files are touched."""
+        removed_agents = self.state.deleted_agents
+        removed_agents.append(agent_id)
+
+
+class _DisconnectingGatewayClient(_SuccessfulGatewayClient):
+    """Gateway client double whose turn delivery always fails with a connection error."""
+
+    async def send_and_wait(self, session_key: str, message: str, *, timeout: float) -> Transcript:  # noqa: ARG002
+        dropped = ConnectionError("gateway connection dropped")
+        raise dropped
+
+
+@pytest.mark.asyncio
+async def test_queue_worker_harness_scorer_happy_path_writes_result(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """Submit a job, process it against a fake gateway, and check the result JSON on disk.
+
+    The job is submitted and claimed through ``JobQueue``, processed by
+    ``EvalWorker._process_job``, and the test then reads
+    ``<result_id>.json`` from the patched results directory.
+    """
+    queue_dir = tmp_path / "queue"
+    results_dir = tmp_path / "results"
+    state_dir = tmp_path / "state"
+    # Point queue storage, result output, and the two runtime env vars at tmp_path,
+    # blank the hub token, and swap the harness's gateway client for the fake.
+    monkeypatch.setattr(queue_module, "LOCAL_QUEUE_DIR", queue_dir)
+    monkeypatch.setattr(queue_module, "HF_TOKEN", "")
+    monkeypatch.setattr("clawbench.worker.RESULTS_DIR", results_dir)
+    monkeypatch.setenv("OPENCLAW_STATE_DIR", str(state_dir))
+    monkeypatch.setenv("CLAWBENCH_RUN_CACHE_DIR", str(tmp_path / "run-cache"))
+    monkeypatch.setattr("clawbench.harness.GatewayClient", _SuccessfulGatewayClient)
+
+    # Async no-op stand-ins for the upload and worker hooks patched further down.
+    async def fake_upload_result(result) -> None:  # noqa: ANN001
+        return None
+
+    async def fake_ensure_gateway() -> None:
+        return None
+
+    async def fake_preflight_browser_support_for_tasks(*args, **kwargs) -> None:  # noqa: ANN002, ANN003
+        return None
+
+    # Submit one job and claim it, so exactly that job is handed to the worker.
+    task = _runtime_task()
+    queue = JobQueue()
+    job = await queue.submit(
+        SubmissionRequest(
+            model="test/model",
+            provider="test",
+            runs_per_task=1,
+            max_parallel_lanes=1,
+        )
+    )
+    claimed = await queue.claim_pending()
+    assert [claimed_job.job_id for claimed_job in claimed] == [job.job_id]
+
+    # Replace task loading, gateway start/stop, browser preflight, and upload with local stand-ins.
+    worker = EvalWorker(queue)
+    monkeypatch.setattr(worker, "_load_job_tasks", lambda current_job: [task])
+    monkeypatch.setattr("clawbench.harness.load_all_tasks", lambda **kwargs: [task])
+    monkeypatch.setattr(worker, "_ensure_gateway", fake_ensure_gateway)
+    monkeypatch.setattr(worker, "_preflight_browser_support_for_tasks", fake_preflight_browser_support_for_tasks)
+    monkeypatch.setattr(worker, "_stop_gateway", lambda: None)
+    monkeypatch.setattr(worker, "_stop_parallel_gateways", lambda: None)
+    monkeypatch.setattr("clawbench.upload.upload_result", fake_upload_result)
+
+    await worker._process_job(claimed[0])
+
+    # The job must be marked finished and its result file must exist under results_dir.
+    finished = await queue.get_status(job.job_id)
+    assert finished is not None
+    assert finished.status == JobStatus.FINISHED
+    assert finished.result_id is not None
+    assert finished.progress_message == "Finished"
+    result_path = results_dir / f"{finished.result_id}.json"
+    result = json.loads(result_path.read_text(encoding="utf-8"))
+    assert result["model"] == "test/model"
+    assert result["overall_completion"] == 1.0
+    assert result["overall_pass_hat_k"] == 1.0
+    assert result["task_results"][0]["task_id"] == "runtime-contract-smoke"
+
+
+@pytest.mark.asyncio
+async def test_harness_turn_disconnect_becomes_failed_run(tmp_path: Path, monkeypatch: pytest.MonkeyPatch):
+    """A connection error raised while sending a turn must yield a failed run result."""
+    # Keep the runtime directories under tmp_path.
+    for env_name, subdir in (
+        ("OPENCLAW_STATE_DIR", "state"),
+        ("CLAWBENCH_RUN_CACHE_DIR", "run-cache"),
+    ):
+        monkeypatch.setenv(env_name, str(tmp_path / subdir))
+    # This gateway fake raises ConnectionError from send_and_wait.
+    monkeypatch.setattr("clawbench.harness.GatewayClient", _DisconnectingGatewayClient)
+
+    harness_options = {
+        "gateway_config": GatewayConfig(),
+        "model": "test/model",
+        "randomize_order": False,
+        "print_report": False,
+        "quiet": True,
+    }
+    run = await BenchmarkHarness(**harness_options)._run_single(_runtime_task(), 0)
+
+    assert run.run_score == 0.0
+    assert run.delivery_outcome.value == "fail"
+    failure = run.failure_mode
+    assert failure is not None
+    assert failure.value == "environment_unavailable"
+    error_text = run.error or ""
+    assert "gateway connection dropped" in error_text
+
+
+@pytest.mark.asyncio
+async def test_harness_scorer_exception_becomes_failed_run(tmp_path: Path, monkeypatch: pytest.MonkeyPatch):
+    """An exception raised by the scorer must be reported as a failed run carrying the error text."""
+
+    async def fail_score_task_run(**kwargs):  # noqa: ANN003
+        raise RuntimeError("scorer exploded")
+
+    state_dir = tmp_path / "state"
+    cache_dir = tmp_path / "run-cache"
+    monkeypatch.setenv("OPENCLAW_STATE_DIR", str(state_dir))
+    monkeypatch.setenv("CLAWBENCH_RUN_CACHE_DIR", str(cache_dir))
+    # The gateway fake succeeds; only the scoring function is replaced with one that raises.
+    monkeypatch.setattr("clawbench.harness.GatewayClient", _SuccessfulGatewayClient)
+    monkeypatch.setattr("clawbench.harness.score_task_run", fail_score_task_run)
+
+    bench = BenchmarkHarness(
+        gateway_config=GatewayConfig(),
+        model="test/model",
+        randomize_order=False,
+        print_report=False,
+        quiet=True,
+    )
+    task = _runtime_task()
+    outcome = await bench._run_single(task, 0)
+
+    assert outcome.run_score == 0.0
+    assert outcome.delivery_outcome.value == "fail"
+    failure = outcome.failure_mode
+    assert failure is not None
+    assert failure.value == "state_regression"
+    assert outcome.error == "scorer exploded"
+
+
+@pytest.mark.asyncio
+async def test_stale_evaluating_job_can_be_reclaimed_and_claimed_again(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """A job stuck in EVALUATING past the stale threshold is requeued and can be claimed again.
+
+    The queue is seeded with one job whose start and last-progress timestamps
+    are an hour old, then reclaimed with a 300 second threshold and claimed.
+    """
+    # Construct the queue against throwaway storage with no hub token, matching the
+    # other tests in this module, so construction cannot depend on a real local
+    # queue directory or hub credentials.
+    monkeypatch.setattr(queue_module, "LOCAL_QUEUE_DIR", tmp_path / "queue")
+    monkeypatch.setattr(queue_module, "HF_TOKEN", "")
+    queue = JobQueue()
+    stale_started_at = (
+        datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(hours=1)
+    ).isoformat()
+    # Replace the queue's contents with a single in-flight job on its first attempt.
+    queue._jobs = {
+        "job-1": Job(
+            job_id="job-1",
+            status=JobStatus.EVALUATING,
+            started_at=stale_started_at,
+            last_progress_at=stale_started_at,
+            current_task_id="runtime-contract-smoke",
+            current_run_index=1,
+            current_run_total=1,
+            attempt_count=1,
+            request=SubmissionRequest(model="test/model"),
+        )
+    }
+    # Persistence and hub sync are stubbed out; only in-memory state is checked.
+    monkeypatch.setattr(queue, "_save_local", lambda: None)
+
+    async def fake_sync_to_hub() -> None:
+        return None
+
+    monkeypatch.setattr(queue, "_sync_to_hub", fake_sync_to_hub)
+
+    reclaimed = await queue.reclaim_stale_jobs(stale_after_seconds=300)
+    claimed = await queue.claim_pending()
+
+    assert [job.job_id for job in reclaimed] == ["job-1"]
+    assert [job.job_id for job in claimed] == ["job-1"]
+    job = queue._jobs["job-1"]
+    # After reclaim + claim the job is evaluating again on its second attempt,
+    # with the per-run progress fields cleared.
+    assert job.status == JobStatus.EVALUATING
+    assert job.attempt_count == 2
+    assert job.stale_requeues == 1
+    assert job.current_task_id is None
+    assert job.current_run_index is None
+    assert job.progress_message == "Queued for evaluation"
+
+
+def test_leaderboard_skips_malformed_local_result_file(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """The leaderboard loader must ignore an unparsable result file and keep the valid one.
+
+    Two files are written to a temporary results directory, one with invalid
+    JSON and one well-formed; the returned frame must contain only the
+    well-formed entry.
+    """
+    # Stand-in for threading.Thread that accepts any arguments and starts nothing.
+    class NoopThread:
+        def __init__(self, *args, **kwargs) -> None:  # noqa: ANN002, ANN003
+            return None
+
+        def start(self) -> None:
+            return None
+
+    # These patches are applied before app is imported.
+    # NOTE(review): presumably app starts a thread and builds a queue at import time — confirm against app.py.
+    monkeypatch.setattr(threading, "Thread", NoopThread)
+    monkeypatch.setattr(queue_module, "LOCAL_QUEUE_DIR", tmp_path / "queue")
+    monkeypatch.setattr(queue_module, "HF_TOKEN", "")
+    # Drop any cached module so the import below executes app's module body again.
+    # NOTE(review): the freshly imported module stays in sys.modules after this test;
+    # consider restoring the previous entry on teardown.
+    sys.modules.pop("app", None)
+    app = importlib.import_module("app")
+
+    # One unparsable file and one well-formed result file.
+    results_dir = tmp_path / "results"
+    results_dir.mkdir()
+    (results_dir / "bad.json").write_text("{not json", encoding="utf-8")
+    (results_dir / "good.json").write_text(
+        json.dumps(
+            {
+                "model": "test/model",
+                "timestamp": "2026-04-29T00:00:00+00:00",
+                "overall_score": 0.91,
+                "overall_completion": 1.0,
+                "overall_trajectory": 0.8,
+                "overall_behavior": 1.0,
+                "overall_pass_hat_k": 1.0,
+                "environment": {"prompt_variant": "clear", "scenario": "all"},
+                "task_results": [{"task_id": "runtime-contract-smoke"}],
+            }
+        ),
+        encoding="utf-8",
+    )
+    # Read results from the temporary directory and report no dataset-hosted results.
+    monkeypatch.setattr(app, "RESULTS_DIR", results_dir)
+    monkeypatch.setattr(app, "dataset_has_submission_results", lambda api, repo: False)
+
+    frame = app._load_leaderboard_uncached()
+
+    # Only the well-formed file contributes a row.
+    assert list(frame["Model"]) == ["test/model"]
+    assert list(frame["Score"]) == [0.91]