Compare commits

...

3 Commits

Author SHA1 Message Date
scoootscooob
abf3500f69 fix(harness): keep gateway RPC sockets alive
Some checks failed
CI / Python ${{ matrix.python-version }} test suite (3.11) (push) Has been cancelled
CI / Python ${{ matrix.python-version }} test suite (3.12) (push) Has been cancelled
2026-05-02 14:51:52 -07:00
scoootscooob
cebd1c8026 chore(repo): clean public benchmark surface
Some checks are pending
CI / Python ${{ matrix.python-version }} test suite (3.11) (push) Waiting to run
CI / Python ${{ matrix.python-version }} test suite (3.12) (push) Waiting to run
2026-05-02 12:18:58 -07:00
scoootscooob
7eb854710f feat(eval): stabilize full-suite adapter runs 2026-05-02 10:24:03 -07:00
67 changed files with 8906 additions and 2595 deletions

19
.dockerignore Normal file
View File

@ -0,0 +1,19 @@
# Paths excluded from the Docker build context.
# VCS metadata, the local virtualenv, and Python/linter caches.
.git
.venv
__pycache__
.pytest_cache
.mypy_cache
.ruff_cache
.DS_Store
# Presumably local datasets, run outputs, and harness state — confirm against the repo layout.
data
results
.clawbench
# Exclude everything directly under .tmp, then re-include the hermes-agent tree.
.tmp/*
!.tmp/hermes-agent
!.tmp/hermes-agent/**
# The last matching pattern wins, so these exclusions also apply inside the
# re-included .tmp/hermes-agent tree. Keep them after the `!` rules.
**/node_modules
**/__pycache__
**/.pytest_cache

View File

@ -104,11 +104,24 @@ Each task will declare:
- `family`
- `surface`
- `capabilities`
- `category`
- `domain`
- `functionality`
- `trace_distribution`
- `tool_surface`
- `risk_tags`
- `pool`
- `variant_group`
- `official`
- `semantic_judge`
The added dimensions are flat, orthogonal leaderboard axes. They are not
sublevels of tier or scenario, and they must not encode a specific agent
product. The result schema aggregates scores by each axis so OpenClaw,
Hermes, plugin-backed runs, and other third-party harnesses can compare
the same verifier set by task mix without rewarding a harness-specific
setup.
Recommended capability tags:
- `bugfix`

View File

@ -1,8 +1,8 @@
# ClawBench HF Docker Space
# Layer the benchmark harness on top of a pinned OpenClaw image.
# Layer the benchmark harness on top of the official OpenClaw image.
ARG OPENCLAW_IMAGE=ghcr.io/openclaw/openclaw@sha256:2e32f4f2e4f653f12d5dc6e5c93cc71e60f49d1dfaf061b18e53c3e61a38fb48
FROM ${OPENCLAW_IMAGE}
ARG BASE=ghcr.io/openclaw/openclaw:latest
FROM ${BASE}
USER root
@ -13,8 +13,10 @@ RUN apt-get update && \
RUN ln -s /app /openclaw
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
RUN npx -y playwright@1.59.1 install --with-deps chromium && \
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright \
NODE_PATH=/usr/local/lib/node_modules
RUN npm install -g playwright@1.59.1 && \
playwright install --with-deps chromium && \
CHROME_PATH="$(find /ms-playwright -path '*/chrome' -type f | sort | head -n 1)" && \
test -x "$CHROME_PATH" && \
ln -sf "$CHROME_PATH" /usr/bin/chromium
@ -38,7 +40,7 @@ RUN mkdir -p \
/home/node/.openclaw/agents/dev \
/home/node/.openclaw/agents/main/agent && \
chown -R node:node /data /home/node/.openclaw && \
chmod -R 775 /data /home/node/.openclaw
chmod -R 777 /data /home/node/.openclaw
USER node

View File

@ -0,0 +1,53 @@
# ClawBench HF Docker Space with OpenClaw 2026.4.26 agent-create race hotfix.
# BASE has no registry prefix, so it presumably refers to a locally built/tagged
# hotfix image — confirm the tag used by the build scripts. Declared before FROM,
# so it is only usable in the FROM line; override with --build-arg BASE=...
ARG BASE=openclaw-426-agent-hotfix:latest
FROM ${BASE}
# Root is needed for package installs and filesystem setup; the image switches
# back to `node` near the end.
USER root
ENV DEBIAN_FRONTEND=noninteractive
# Install pip and the `python` -> python3 alias, then remove the apt lists in the
# same layer so they do not end up in the image.
RUN apt-get update && \
apt-get install -y python3-pip python-is-python3 && \
rm -rf /var/lib/apt/lists/*
# Create /openclaw as a symlink to /app.
RUN ln -s /app /openclaw
# Browsers are installed to a fixed location; NODE_PATH points at the global
# npm module directory so the globally installed playwright package resolves.
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright \
NODE_PATH=/usr/local/lib/node_modules
# Install a pinned Playwright globally, download Chromium plus its system
# dependencies, then link the first `chrome` binary (in sorted path order) to
# /usr/bin/chromium. `test -x` fails the build if no executable was found.
RUN npm install -g playwright@1.59.1 && \
playwright install --with-deps chromium && \
CHROME_PATH="$(find /ms-playwright -path '*/chrome' -type f | sort | head -n 1)" && \
test -x "$CHROME_PATH" && \
ln -sf "$CHROME_PATH" /usr/bin/chromium
# Put the node user's ~/.local/bin on PATH.
ENV HOME=/home/node PATH=/home/node/.local/bin:$PATH
WORKDIR /home/node/app
# Copy project metadata, the harness package, scripts, profiles, task suites,
# baselines, and the app entry point, all owned by `node`.
COPY --chown=node:node pyproject.toml README.md CLAWBENCH_V0_4_SPEC.md PARTNER_TRACE_SPEC.md ./
COPY --chown=node:node clawbench/ clawbench/
COPY --chown=node:node scripts/ scripts/
COPY --chown=node:node profiles/ profiles/
COPY --chown=node:node tasks/ tasks/
COPY --chown=node:node tasks-public/ tasks-public/
COPY --chown=node:node tasks-domain/ tasks-domain/
COPY --chown=node:node baselines/ baselines/
COPY --chown=node:node app.py .
# Install the project into the system Python; --break-system-packages overrides
# the distro's externally-managed-environment guard.
RUN python3 -m pip install --break-system-packages --no-cache-dir .
# Pre-create result/queue directories and the OpenClaw agent state directories.
# NOTE(review): mode 777 makes these trees world-writable — presumably so the
# container can run under an arbitrary UID; confirm this is intended.
RUN mkdir -p \
/data/results \
/data/queue \
/home/node/.openclaw/agents/dev \
/home/node/.openclaw/agents/main/agent && \
chown -R node:node /data /home/node/.openclaw && \
chmod -R 777 /data /home/node/.openclaw
# Drop privileges for runtime.
USER node
ENV GATEWAY_PORT=18789
ENV OPENCLAW_HOME=/home/node
ENV OPENCLAW_STATE_DIR=/home/node/.openclaw
# Port 7860 is presumably the port app.py serves on — confirm.
EXPOSE 7860
CMD ["python", "app.py"]

113
Dockerfile.gbrain Normal file
View File

@ -0,0 +1,113 @@
# ClawBench + latest upstream GBrain for OpenClaw harness comparisons.
#
# Secrets are not baked into this image. Runtime API keys are read from the
# mounted OpenClaw config/env by scripts/setup_gbrain_runtime.sh.
#
# BASE is declared before FROM, so it is only usable in the FROM line.
# NOTE(review): the default base tag is `latest`, so builds are not pinned to
# a specific OpenClaw release.
ARG BASE=ghcr.io/openclaw/openclaw:latest
FROM ${BASE}
# Root is needed for package installs and filesystem setup; the image switches
# back to `node` near the end.
USER root
# GBrain source repository and the exact commit that gets checked out. Bump
# GBRAIN_REF (or pass --build-arg) to pick up newer upstream changes.
ARG GBRAIN_REPO=https://github.com/garrytan/gbrain.git
ARG GBRAIN_REF=be8fffad71ea36bc51c2d58564762b0fe271e8f4
ENV DEBIAN_FRONTEND=noninteractive
# Install build/runtime tools without recommended extras, then remove the apt
# lists in the same layer.
RUN apt-get update && \
apt-get install -y --no-install-recommends \
ca-certificates curl git jq python3-pip python-is-python3 unzip && \
rm -rf /var/lib/apt/lists/*
# Create /openclaw as a symlink to /app.
RUN ln -s /app /openclaw
# Browsers are installed to a fixed location; NODE_PATH points at the global
# npm module directory so the globally installed playwright package resolves.
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright \
NODE_PATH=/usr/local/lib/node_modules
# Install a pinned Playwright globally, download Chromium plus its system
# dependencies, then link the first `chrome` binary (in sorted path order) to
# /usr/bin/chromium. `test -x` fails the build if no executable was found.
RUN npm install -g playwright@1.59.1 && \
playwright install --with-deps chromium && \
CHROME_PATH="$(find /ms-playwright -path '*/chrome' -type f | sort | head -n 1)" && \
test -x "$CHROME_PATH" && \
ln -sf "$CHROME_PATH" /usr/bin/chromium
# Install Bun under /usr/local/bun; later steps invoke /usr/local/bun/bin/bun.
# NOTE(review): the installer is fetched unpinned and piped into bash. The
# pipeline's exit status is bash's, so a failed download is not detected in
# this step — consider downloading to a file first and pinning a Bun version.
ENV BUN_INSTALL=/usr/local/bun
RUN mkdir -p /usr/local/bun && \
curl -fsSL https://bun.sh/install | bash
# Clone GBrain, check out the pinned commit, and install its dependencies
# exactly as recorded in the lockfile.
RUN git clone "${GBRAIN_REPO}" /opt/gbrain && \
cd /opt/gbrain && \
git checkout "${GBRAIN_REF}" && \
/usr/local/bun/bin/bun install --frozen-lockfile
# Generate two files with printf (one argument per output line):
#   1. /opt/gbrain/bin/gbrain — a bash wrapper that runs `bun run src/cli.ts`
#      from /opt/gbrain, forwarding all arguments.
#   2. /opt/gbrain/.codex-plugin/plugin.json — a plugin manifest that declares
#      an MCP server launched as `/opt/gbrain/bin/gbrain serve`.
# Then make the wrapper executable, link gbrain and bun into /usr/local/bin,
# hand /opt/gbrain to `node`, and mark it as a git safe.directory system-wide.
# NOTE(review): the manifest "version" is hard-coded and is not derived from
# GBRAIN_REF — confirm it matches the pinned commit.
RUN mkdir -p /opt/gbrain/.codex-plugin /opt/gbrain/bin && \
printf '%s\n' \
'#!/usr/bin/env bash' \
'set -euo pipefail' \
'cd /opt/gbrain' \
'exec /usr/local/bun/bin/bun run src/cli.ts "$@"' \
> /opt/gbrain/bin/gbrain && \
printf '%s\n' \
'{' \
' "id": "gbrain",' \
' "name": "gbrain",' \
' "description": "Personal knowledge brain with PGLite-backed CLI, skills, and MCP server",' \
' "version": "0.22.6",' \
' "skills": "skills",' \
' "mcpServers": {' \
' "gbrain": {' \
' "command": "/opt/gbrain/bin/gbrain",' \
' "args": ["serve"],' \
' "cwd": "/opt/gbrain",' \
' "connectionTimeoutMs": 120000,' \
' "env": {' \
' "PATH": "/opt/gbrain/bin:/usr/local/bun/bin:/usr/local/bin:/usr/bin:/bin"' \
' }' \
' }' \
' },' \
' "configSchema": {' \
' "type": "object",' \
' "additionalProperties": true,' \
' "properties": {' \
' "database_url": {"type": "string"},' \
' "openai_api_key": {"type": "string"}' \
' }' \
' }' \
'}' \
> /opt/gbrain/.codex-plugin/plugin.json && \
chmod +x /opt/gbrain/bin/gbrain && \
ln -sf /opt/gbrain/bin/gbrain /usr/local/bin/gbrain && \
ln -sf /usr/local/bun/bin/bun /usr/local/bin/bun && \
chown -R node:node /opt/gbrain && \
git config --system --add safe.directory /opt/gbrain
# Runtime environment: GBrain and Bun on PATH plus GBrain-related switches.
# The CLAWBENCH_* and GBRAIN_* variables are presumably read by the harness
# and by setup_gbrain_runtime.sh — confirm their consumers.
ENV PATH=/opt/gbrain/bin:/usr/local/bun/bin:/home/node/.local/bin:$PATH \
HOME=/home/node \
CLAWBENCH_ENABLE_GBRAIN=1 \
CLAWBENCH_LANE_PREPARE_CMD=/home/node/app/scripts/setup_gbrain_runtime.sh \
GBRAIN_ALLOW_SHELL_JOBS=1
WORKDIR /home/node/app
# Copy project metadata, the harness package, the public/domain task suites,
# baselines, three container scripts, and the app entry point, owned by `node`.
COPY --chown=node:node pyproject.toml README.md ./
COPY --chown=node:node clawbench/ clawbench/
COPY --chown=node:node tasks-public/ tasks-public/
COPY --chown=node:node tasks-domain/ tasks-domain/
COPY --chown=node:node baselines/ baselines/
COPY --chown=node:node scripts/container_adapter_eval.sh scripts/container_lane_eval.sh scripts/setup_gbrain_runtime.sh scripts/
COPY --chown=node:node app.py .
# Make the copied scripts executable and install the project into the system
# Python; --break-system-packages overrides the distro's
# externally-managed-environment guard.
RUN chmod +x scripts/container_adapter_eval.sh scripts/container_lane_eval.sh scripts/setup_gbrain_runtime.sh && \
python3 -m pip install --break-system-packages --no-cache-dir .
# Pre-create result/queue directories and the OpenClaw agent state directories.
# NOTE(review): mode 777 makes these trees world-writable — presumably so the
# container can run under an arbitrary UID; confirm this is intended.
RUN mkdir -p \
/data/results \
/data/queue \
/home/node/.openclaw/agents/dev \
/home/node/.openclaw/agents/main/agent && \
chown -R node:node /data /home/node/.openclaw && \
chmod -R 777 /data /home/node/.openclaw
# Drop privileges for runtime.
USER node
ENV GATEWAY_PORT=18789
ENV OPENCLAW_HOME=/home/node
ENV OPENCLAW_STATE_DIR=/home/node/.openclaw
# Port 7860 is presumably the port app.py serves on — confirm.
EXPOSE 7860
CMD ["python", "app.py"]

View File

@ -16,8 +16,10 @@ RUN apt-get update && \
RUN ln -s /app /openclaw
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
RUN npx -y playwright@1.59.1 install --with-deps chromium && \
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright \
NODE_PATH=/usr/local/lib/node_modules
RUN npm install -g playwright@1.59.1 && \
playwright install --with-deps chromium && \
CHROME_PATH="$(find /ms-playwright -path '*/chrome' -type f | sort | head -n 1)" && \
test -x "$CHROME_PATH" && \
ln -sf "$CHROME_PATH" /usr/bin/chromium

View File

@ -0,0 +1,8 @@
# OpenClaw 2026.4.26 base image with a patch script applied at build time.
FROM ghcr.io/openclaw/openclaw:2026.4.26
# Root is used while the patch script runs; the image switches back to `node`
# at the end.
USER root
# Stage the patch script in /tmp, run it with node, and delete it afterwards.
# What the script modifies is not visible here — see the file under patches/.
COPY patches/patch_openclaw_426_agent_create_queue.mjs /tmp/patch_openclaw_426_agent_create_queue.mjs
RUN node /tmp/patch_openclaw_426_agent_create_queue.mjs && \
rm /tmp/patch_openclaw_426_agent_create_queue.mjs
USER node

View File

@ -35,6 +35,7 @@ Each trace record should have this top-level structure:
"plugins": [],
"skills": [],
"prompts": {},
"task_metadata": {},
"transcript": {
"messages": []
},
@ -58,6 +59,7 @@ These fields should always be present:
- `config`: effective runtime configuration for the run
- `plugins`: plugins or tool bundles available to the agent, even if empty
- `prompts.user`: the user task or user-visible request
- `task_metadata`: benchmark task axes, when the trace corresponds to a ClawBench task
- `transcript.messages`: ordered message list for the run
## Strongly Recommended Fields
@ -75,7 +77,28 @@ These materially improve trace quality and downstream usefulness:
## Metadata We Want
### 1. Harness
### 1. Task Metadata
When a trace maps to a benchmark task, include the same flat task axes
used by ClawBench result aggregation. These axes are intentionally
orthogonal and harness-neutral; do not nest them under an agent product
or plugin stack.
Recommended fields:
```json
{
"task_id": "t4-browser-research-and-code",
"category": "software_engineering",
"domain": "devtools",
"functionality": ["browser_research", "api_contract_extraction", "code_repair"],
"trace_distribution": ["browser_heavy", "read_heavy", "edit_heavy", "execute_heavy"],
"tool_surface": ["browser", "filesystem", "shell", "local_service"],
"risk_tags": ["code_regression", "hallucination"]
}
```
### 2. Harness
Use `harness` to describe the execution framework itself.
@ -95,7 +118,7 @@ Recommended fields:
}
```
### 2. Model
### 3. Model
Use `model` to identify the model under test.
@ -111,7 +134,7 @@ Recommended fields:
}
```
### 3. Config
### 4. Config
Use `config` for the effective runtime settings that could change behavior.
@ -134,7 +157,7 @@ Recommended fields:
If a field is unavailable, omit it rather than inventing a value.
### 4. Plugins
### 5. Plugins
Use `plugins` for tools, plugin bundles, MCP servers, extensions, or other agent capabilities exposed by the harness.
@ -162,7 +185,7 @@ Recommended entry shape:
}
```
### 5. Skills
### 6. Skills
Use `skills` for reusable instruction bundles, templates, internal playbooks, or any named capability layer available to the agent.
@ -186,7 +209,7 @@ Recommended entry shape:
}
```
### 6. Prompts
### 7. Prompts
Use `prompts` for the prompt stack that shaped agent behavior.
@ -217,7 +240,7 @@ Example:
}
```
### 7. Transcript
### 8. Transcript
`transcript.messages` is the core behavioral record.

632
README.md
View File

@ -13,569 +13,197 @@ license: mit
# ClawBench
**Rigorous agent evaluation. Signal-curated tasks. Dynamical-systems diagnostics.**
**Trace-scored agent evaluation for OpenClaw.**
[![Python 3.11+](https://img.shields.io/badge/python-3.11+-3776AB.svg?style=flat-square)](https://www.python.org/downloads/)
[![License: MIT](https://img.shields.io/badge/license-MIT-green.svg?style=flat-square)](LICENSE)
[![Core v1: 19 tasks](https://img.shields.io/badge/Core%20v1-19%20tasks-blue.svg?style=flat-square)](tasks-public/)
[![Diagnostics](https://img.shields.io/badge/diagnostics-dynamical-blueviolet.svg?style=flat-square)](#3-dynamical-systems-diagnostics-how-agents-fail-not-just-whether)
[![HF Dataset](https://img.shields.io/badge/HF-dataset-yellow.svg?style=flat-square)](https://huggingface.co/datasets/openclaw/clawbench-results)
</div>
---
## What's new in Core v1 (2026-04-20)
## What This Repo Contains
A reproducibility-first public release of the benchmark, informed by a full 8-model, 1,080-run sweep audit and five new methodology layers that most agent benchmarks simply don't have:
ClawBench evaluates AI agents by running real local tasks, capturing the
execution trace, and scoring both the final state and the process used to get
there.
| Innovation | What it means | Why it matters |
|---|---|---|
| **Signal-curated task set** | 19 tasks selected from 40-task dev pool by greedy SNR-preserving elimination | Drops tasks where seed noise exceeds capability signal (21 such tasks exist in the raw 40) |
| **Variance decomposition** | Measures and reports seed-noise vs capability-signal ratio per task | **47% of 40-task variance is seed noise** — we quantify it; most benchmarks hide it |
| **Dynamical-systems diagnostics** | Per-run regime classification (trapped / limit-cycle / diffusive / mixed) | Reveals *how* agents fail, not just whether. Inspired by Markov-kernel / attractor-basin framework |
| **Constraint Index C(q)** | Principled task-weighting via participation ratio + entropy + Bayes prediction | Distinguishes "everyone converges" from "everyone diverges" tasks — enables honest weighted ranking |
| **Reproducibility-first infrastructure** | Per-container state isolation, judge-infra rejudge pipeline, documented OpenRouter-routing caveats | Eliminates the cascading-failure / silent-judge-error patterns that bias most agent benchmarks |
The public repository contains:
All of it lives in `scripts/` and `tasks-public/` — auditable code, not opaque numbers.
- `tasks-public/`: Core v1, a 19-task public reproducibility suite.
- `clawbench/`: the benchmark harness, adapters, canonical task conversion,
scoring, statistics, and diagnostics.
- `profiles/`: example model/profile definitions.
- `scripts/`: reusable analysis and container runner utilities.
- `tests/`: unit and integration coverage for the public harness.
---
The private holdout is intentionally not included:
## The problem with every agent benchmark
- private task YAML files,
- private task assets and verifier scripts,
- private expected outputs,
- private run traces, logs, and per-task reports.
You run a benchmark. Model A scores 73%. Model B scores 71%. You pick Model A.
Internal hidden-suite runs can restore a private `tasks/` directory locally.
The public code is designed to run without that directory by falling back to
`tasks-public/`.
Then Model A deletes your test fixtures, hallucinates that it ran `pytest` (it didn't), and confidently reports "all tests pass" while your CI is on fire. Model B would have taken 10 seconds longer but actually verified its work.
## Core v1
**The benchmark told you Model A was better. Your users would disagree.**
Core v1 is a signal-curated 19-task public release selected from the internal
development pool. It preserves tier and family coverage while avoiding tasks
whose public release would leak holdout material or add mostly run-to-run
noise.
Beyond that, most benchmarks don't tell you:
- Whether the gap is signal or noise
- Which tasks actually discriminate models and which are coin-flips
- How the agent *dynamically* fails — attractor, limit-cycle, goal drift
- Whether re-running gives the same ranking (spoiler: on most benchmarks, no)
- What's driving your score — the model, the plugin stack, or the harness version
| Dimension | Breakdown |
|---|---|
| Tasks | 19 |
| Runs per official comparison | 3 per task |
| Total runs per model | 57 |
| Tiers | T1=2, T2=6, T3=5, T4=5, T5=1 |
| Families | tools=8, coding=2, repo=3, browser=2, multi_tool=3, adversarial=1 |
ClawBench addresses all of this. Below is how.
---
## What makes ClawBench different
### 1. We score from execution traces, not just final output
Every agent run produces a full execution trace: every tool call, every file read, every `pytest` invocation, every retry after failure. Most benchmarks throw this away and check the final state. ClawBench scores *from the trace itself*.
| Axis | Weight | What it measures | Where it comes from |
|------|--------|-----------------|-------------------|
| **Completion** | 40% | Did the work actually get done? | Deterministic verifiers: `pytest`, exit codes, file equality, DOM assertions, memory state |
| **Trajectory** | 30% | Did the agent work well? | Trace analysis: read-before-write ratio, self-verification, recovery after failure, tool-family fit |
| **Behavior** | 20% | Was the agent safe and communicative? | Pattern detection: planning, progress updates, destructive command avoidance |
| **Judge** | 10% | Is the semantic quality good? | LLM evaluation (gated — only contributes when deterministic completion is already near-perfect) |
**The key invariant**: the LLM judge can never rescue a failed deterministic check. If `pytest` fails, the judge score is zeroed. This is enforced in code and tested. You can't game ClawBench by producing output that *looks* correct to an LLM but doesn't actually work.
### 2. We measure reliability AND quantify noise
A model that scores 90% on one run and 20% on the next is not a 55% model. It's an unreliable model. Users experience the worst run, not the average.
ClawBench runs every task 3 times and reports:
- **pass^k** — did ALL runs pass? (not just "did any run pass?")
- **Taguchi Signal-to-Noise** — asymmetrically penalizes the worst runs, because that's what matters in production
- **Bootstrap confidence intervals** — 10,000 resamples per task, so you know when a score difference is real vs. noise
- **Worst-of-n** — the score that actually determines user trust
- **13 failure modes** — `hallucinated_completion`, `tool_misuse`, `verification_skipped`, `state_regression`, `graceful_refusal`, and 8 more (not just "pass/fail")
Beyond per-run reliability, we decompose **benchmark-wide variance** into seed-noise vs capability signal:
```
SNR(task) = capability_variance(across models) / mean_seed_variance(per model)
```
Findings from the v4-19-full sweep audit:
- **Only 52.7% of run_score variance is real capability signal**; 47.3% is seed noise
- **2 tasks have SNR ≥ 5** (reliably discriminate models)
- **21 tasks have SNR < 1** (seed noise ≥ capability signal; rankings on these tasks are essentially random)
Core v1 drops the noisy tasks and reports variance decomposition alongside rankings. This is the level of rigor most benchmarks don't attempt.
### 3. Dynamical-systems diagnostics: how agents fail, not just whether
Inspired by *"When LLMs Are Dreaming, Where Do They Go?"* — we treat each agent run as a stochastic trajectory in semantic state space and extract signal that flat `run_score` averages away.
Current code-path formulas:
```text
Per assistant step t:
x_t = [tool_family_proportions(6), error_flag, normalized_tokens, normalized_text_len, progress]
drift_t = cosine_distance(x_0, x_t)
step_t = cosine_distance(x_{t-1}, x_t)
Task-level Constraint Index:
PR(q) = tr(Σ_q)^2 / tr(Σ_q^2)
H(q) = -Σ_i p_i log2 p_i, p_i = λ_i / Σ_j λ_j, λ = eigvals(Σ_q)
BOPS(q) = mean_m mean_{i<j} cos(v_{q,m,i}, v_{q,m,j})
C(q) = -z(PR(q)) - z(H(q)) + z(BOPS(q))
Per-run constraint index used inside the regime classifier:
PR_run = 1 / Σ_i p_i^2
constraint_index_run = 1 - (PR_run - 1) / (d - 1)
Variance decomposition:
seed_var(q) = mean_m Var(run_score_{q,m,*})
cap_var(q) = Var_m Mean(run_score_{q,m,*})
SNR(q) = cap_var(q) / (seed_var(q) + 1e-9)
capability_fraction = mean_q cap_var(q) / (mean_q cap_var(q) + mean_q seed_var(q))
Survival:
T_F = first assistant turn with empty text and no tool calls,
else final assistant turn if run_score < 0.7 and delivery_outcome in {fail, partial}
S(t) = P(T_F > t)
h(t) = P(T_F = t | T_F >= t)
```
Implemented regime classifier in `clawbench/dynamics.py`:
```text
trapped if H_tools < 0.5 or (error_rate > 0.6 and std(drift) < 0.05)
convergent if std(drift_last_quartile) < 0.1 and mean(step_last_quartile) < 0.15 and error_rate < 0.2
diffusive if H_tools > 1.5 and error_rate < 0.15 and constraint_index_run < 0.8
chaotic if H_tools > 2.0 and var(step[1:]) > 0.02
limit_cycle if max autocorr(centered step[1:], lags 2..5) > 0.3
unknown otherwise, or <3 assistant turns
```
The task-level `C(q)` uses a normalized bag-of-words response vector built from the full assistant trajectory text plus tool-call names and compacted inputs, not just the last assistant turn.
From the v4-19 sweep data:
- **Gemini 3.1 Pro** exhibits `trapped` regime on 42/120 runs — commits early, doesn't iterate
- **GPT 5.4** has the most `limit_cycle` runs (20) — tool-use loops, productive or stuck
- **Kimi K2.5** dies at median turn 3 (worst survival); **GPT 5.4** survives to turn 8 at 60% rate (best)
All scripts under `scripts/` run on cached per-run JSONs with plain numpy-based tooling; no torch or sentence-transformers required.
### 4. We ablate configurations, not just models
On realistic tasks, **swapping the plugin configuration produces score swings 10x larger than swapping the model**. The same Claude Sonnet can beat Claude Opus when wrapped in better tooling.
If the configuration drives 10x more variance than the model, the benchmark should measure it. ClawBench's Configuration Diagnostic:
1. **Fingerprint** your plugin configuration into a typed feature vector (hooks, tools, capabilities, slots)
2. **Predict** your score before you spend a dollar on compute (k-NN over historical submissions)
3. **Run** the benchmark and detect surprises (actual vs. predicted deltas)
4. **Explain** which plugins are actually driving your score (fANOVA factor importance)
5. **Recommend** specific, evidence-backed configuration changes with estimated impact
No other benchmark can do this — no other benchmark has access to typed plugin manifests. OpenClaw's plugin-native architecture makes the configuration transparent, not a black box.
### 5. Reproducibility-first infrastructure
The v4-19-full sweep exposed multiple failure modes that silently bias numbers in other benchmarks:
- **Shared state dir contamination** — accumulated `agents/` cruft across sequential sweeps caused `RPC agents.create timed out` cascades. Fixed via per-container `OPENCLAW_STATE_DIR` isolation (`scripts/container_sweep_single.sh`).
- **Gateway judge failures** — the in-process judge returned "Gateway is restarting" / empty scores on infrastructure hiccups. Fixed via direct-API rejudge pipeline (`scripts/rejudge_all.py`).
- **OpenRouter provider routing** — slug `z-ai/glm-5.1` canonically routes to different backing models over time. GLM 5.1 scored 0.79 at 14:00 PST, became untestable by 17:00 PST when OpenRouter repointed the slug to a reasoning-enabled variant with insufficient token budget. Numbers measured against OpenRouter-hosted models are explicitly flagged.
- **Platform version drift** — OpenClaw 4.9 → 4.15-beta.1 shifted scores by +0.13 to +0.29 across all models. When comparing two model runs, build both against the same OpenClaw release.
All of these are documented in code + commit messages. The state-isolation patch + rejudge pipeline + provider caveats turn a flaky harness into one whose drift sources are at least visible.
---
## How trace-based scoring works
Traditional benchmarks check the output: "does `output.json` match `expected.json`?" ClawBench checks the output *and* the process that produced it.
### The execution trace
Every tool call the agent makes is recorded with:
- **Family classification** — `read`, `edit`, `search`, `execute`, `browser`, `memory`, `delegate`, `cron`, `plan`
- **Mutation flag** — did this call change state?
- **Success/failure** — and if failed, the error
- **Output** — what the tool returned
- **Timing** — when it happened, how long it took
### What we grade from the trace
**Read-before-write ratio**: Before editing a file, did the agent read it first? Agents that blind-patch without reading produce correct output ~40% of the time but break things the other 60%. The trace catches this.
**Self-verification**: After making changes, did the agent run tests? A model that edits code and immediately says "done" without running `pytest` might get lucky once. It won't get lucky 3 times in a row. The trajectory score penalizes skipping verification.
**Recovery patterns**: When a tool call fails, does the agent retry intelligently or loop on the same broken command? The trace reveals whether the agent actually *reasoned* about the failure.
**Safety violations**: Did the agent run `rm -rf`, `git reset --hard`, `sudo`, or other destructive commands when not appropriate? These get caught and penalized, even if the final output looks fine.
### Why this matters for users
A user doesn't see a pass/fail. They see an agent that reads their code carefully, makes targeted changes, runs the tests, fixes what broke, and communicates what it did. Or they see an agent that blindly rewrites files and claims success. **Both might produce the same final output.** Only trace-based scoring tells them apart.
---
## The 13 failure modes
When an agent fails, "fail" is not useful information. ClawBench classifies every failure into one of 13 deterministic modes:
| Mode | What happened | Example |
|------|--------------|---------|
| `hallucinated_completion` | Agent fabricated work it didn't do | "Tests pass!" (no tests were run) |
| `tool_misuse` | Wrong tool or wrong arguments | Using `edit` on a file that doesn't exist |
| `verification_skipped` | Never ran verification after changes | Edited code, skipped `pytest` |
| `state_regression` | Environment changed unexpectedly | Background service crashed mid-run |
| `graceful_refusal` | Correctly refused an impossible task | "This encryption cannot be reversed" |
| `browser_navigation_failure` | Failed to reach the target page | Form server URL unreachable |
| `memory_miss` | Failed to read/write required memory | Forgot to store context for continuation |
| `repeated_error_loop` | Stuck retrying the same failure | Same command failed 5 times |
| `delegation_failed` | Sub-agent spawning failed | Agent-to-agent handoff broken |
| `unsafe_mutation` | Dangerous command executed | `rm -rf` on production directory |
| `environment_unavailable` | Service not ready or timed out | Database not started yet |
| `timeout` | Exceeded wall-clock budget | 600s hard limit |
| `reward_hack_suspected` | Agent gamed the verifier | Echoed expected output instead of computing it |
These are surfaced per-run in the result, not hidden in logs. They make failures *actionable*.
---
## Core v1 task suite: 19 tasks
Core v1 is a signal-curated public release of 19 tasks from the internal 40-task dev pool. Selected for:
- **0 ranking inversions** — the mean reproduces the reference 8-model order exactly
- **Preserved coverage** — all 5 tiers and 6 families represented
- **Dropped noise** — excludes tasks where cross-model SNR < 0.5
| Tier | Core v1 count | What it tests | Examples |
|------|:---:|---|---|
| **Tier 1** | 2 | Single-tool basics | Bugfix discount calc, quick file note |
| **Tier 2** | 6 | Multi-step, 2-3 tools | Config loader repair, browser form fix, priv redaction |
| **Tier 3** | 5 | Complex orchestration | SQL query analysis, inbox triage, data pipeline report |
| **Tier 4** | 5 | Cross-system reasoning | Cross-repo migration, delegation repair, memory continuation, browser research+code |
| **Tier 5** | 1 | Adversarial | Hallucination-resistant evidence |
Full manifest: [`tasks-public/MANIFEST.yaml`](tasks-public/MANIFEST.yaml).
### Task design principles
**Intentionally vague prompts.** Users don't write numbered step lists. They say "fix the bug and make sure the tests pass." The agent has to figure out what "fix the bug" means.
**Real tool composition.** Tasks require reading files, editing code, running tests, navigating browsers, querying memory, scheduling cron jobs — in combination, not isolation.
**Deterministic verification.** Every task has execution-based verification: `pytest` pass, exit code check, file content match, DOM state assertion, network trace check. The LLM judge is optional and never overrides a deterministic failure.
**Adversarial tier.** Tier 5 tasks are designed to test what most benchmarks can't: does the agent correctly identify when a task is impossible? Does it resist hallucinating evidence that doesn't exist? Does it handle contradictory instructions gracefully? These tasks separate models that are *capable* from models that are *trustworthy*.
### Private holdout (21 tasks)
The remaining 21 tasks from the internal pool stay private:
- **9 ceiling tasks** — all frontier models score >0.85; don't discriminate at the frontier
- **9 low-signal tasks** — SNR < 0.5; either broken verifiers or genuinely ambiguous prompts (scheduled for redesign)
- **3 ranking-inconsistent tasks** — cross-model ordering conflicts with reference ranking (`t2-node-search-patch`, `t5-contradictory-requirements`, `t1-cal-quick-reminder`)
---
## The scoring math
### Per-run score
```
run_score = 0.4 * completion + 0.3 * trajectory + 0.2 * behavior + [0.1 * judge if completion >= 0.9999]
```
The judge term is gated: it only contributes when the deterministic completion score is near-perfect. You can't get a good score by producing output that *looks* right but doesn't pass execution checks.
### Per-task score (across 3 runs)
```
task_score = 0.9 * bootstrap_mean(run_scores) + 0.1 * reliability_score
reliability = 0.5 * pass^k + 0.3 * pass_rate + 0.2 * variance_score
```
`pass^k` is 1 only if ALL runs pass. Not any run — all runs.
### Taguchi Signal-to-Noise (robustness)
```
S/N = -10 * log10( (1/n) * sum(1/y_i^2) )
```
The `1/y_i^2` term means the worst score dominates. A configuration scoring 0.85 average but 0.10 on adversarial tasks is **worse in production** than 0.78 average with a 0.65 floor.
### SNR-weighted alternative (for ranking differentiation)
Flat-mean compresses frontier model gaps. An alternative that weights tasks by their signal density:
```
w_q = max(0, SNR(q)) × |C(q)|
w_q^wins = min(w_q, p95({w_q}))
flat_score(model) = mean_q mean_run_score(model, q) over covered tasks
weighted_score(model) = Σ_q w_q mean_run_score(model, q) / Σ_q w_q
winsorized_score(model) = Σ_q w_q^wins mean_run_score(model, q) / Σ_q w_q^wins
```
Under the winsorized SNR × |C(q)| weighting on the same 1,080-run archive, **Opus 4.7 ranks #1** (Opus 4.6 ranks #1 under the flat mean) and **GPT 5.4 drops from #3 to #7**, because its task-specific cliffs (0.16 on `t3-feature-export`) fall on the highest-signal tasks. This exposes what the flat mean averages away.
Generate alternate rankings: `scripts/snr_weighted_ranking.py`.
---
## Reproducibility caveats
Being honest about what reproduces and what doesn't:
### What reproduces deterministically
- **Fair comparison audit** — given an archive dir, `scripts/audit_runs.py` produces identical numbers every time.
- **Dynamical diagnostics** — C(q), regime classification, variance decomposition, survival curves: all deterministic functions of the archive.
- **Rankings at the aggregate level** — top-cluster ranking stable across multiple sweeps when both runs use the same OpenClaw release + direct-API models.
### What drifts
- **Absolute scores** — seed noise is ~0.02 stddev per task per model. Expect run_score to drift within that envelope.
- **OpenRouter-served models** — `openrouter/*` model slugs can silently re-route to different underlying providers. We observed GLM 5.1 at 0.79 then 0.33 within hours as OpenRouter flipped its backing provider. Pin to canonical versions (e.g., `z-ai/glm-5.1-20260406`) for stable measurement.
- **OpenClaw platform drift** — 4.9 → 4.15-beta.1 shifted scores by +0.13 to +0.29 across all models. 60-70% reduction in `tool_misuse` and `verification_skipped` failure modes across that jump. Pin the base to reproduce published numbers.
### Mitigating the drift
Build both sides of any comparison from the same source state:
The manifest is the source of truth:
```bash
docker build -t clawbench .
docker run --rm --entrypoint openclaw clawbench --version
# -> records the OpenClaw version of THIS build
python3 - <<'PY'
import yaml
manifest = yaml.safe_load(open("tasks-public/MANIFEST.yaml"))
for task in manifest["tasks"]:
print(task["id"])
PY
```
When publishing scores, record the OpenClaw version your image
resolved to and treat numbers from a different version as separate
populations.
## Scoring
---
Each run is scored from four signals:
## Quick start
| Axis | Weight | What it measures |
|---|---:|---|
| Completion | 40% | Deterministic task checks such as tests, exact outputs, DOM assertions, and file verification |
| Trajectory | 30% | Tool-use quality such as read-before-write, self-verification, recovery, and tool-family fit |
| Behavior | 20% | Planning, progress updates, blocker handling, and destructive-command avoidance |
| Judge | Up to 10% | Optional semantic quality, gated so it cannot rescue failed deterministic checks |
### Build the image
Reliability is first-class. Official comparisons run each task three times and
report per-task variance, pass rate, pass^k, confidence intervals, and
worst-of-n style robustness signals.
## Quick Start
Install locally:
```bash
git clone git@github.com:openclaw/clawbench.git && cd clawbench
docker build -t clawbench .
# Record the OpenClaw version baked in (for reproducibility):
docker run --rm --entrypoint openclaw clawbench --version
python3.11 -m venv .venv
source .venv/bin/activate
python -m pip install --upgrade pip
python -m pip install -e .
```
### Run Core v1 on a model
List public tasks:
```bash
clawbench list-tasks --tasks-dir tasks-public
```
Run a small public smoke:
```bash
export OPENCLAW_GATEWAY_TOKEN=<your-token>
# Core v1 = 19 specific tasks. List them via the manifest:
python3 -c "import yaml; m = yaml.safe_load(open('tasks-public/MANIFEST.yaml'));
print(' '.join(f'-t {t[\"id\"]}' for t in m['tasks']))"
clawbench run \
--model anthropic/claude-opus-4-6 \
--runs 1 \
--task t1-bugfix-discount \
--task t1-fs-quick-note \
--output results/public_smoke.json
```
Run the full Core v1 task list:
```bash
TASK_ARGS=$(python3 - <<'PY'
import yaml
manifest = yaml.safe_load(open("tasks-public/MANIFEST.yaml"))
print(" ".join(f"--task {task['id']}" for task in manifest["tasks"]))
PY
)
# Then run:
clawbench run \
--model anthropic/claude-opus-4-6 \
--runs 3 \
--concurrency 4 \
--profile profiles/frontier_opus_4_6.yaml \
--judge-model anthropic/claude-sonnet-4-6 \
-t t1-bugfix-discount -t t1-fs-quick-note \
-t t2-add-tests-normalizer -t t2-browser-form-fix \
-t t2-config-loader -t t2-fs-find-that-thing \
-t t2-msg-summarize-thread -t t2-priv-redact-doc \
-t t3-data-pipeline-report -t t3-data-sql-query \
-t t3-feature-export -t t3-msg-inbox-triage \
-t t3-web-research-and-cite \
-t t4-browser-research-and-code -t t4-cross-repo-migration \
-t t4-delegation-repair -t t4-life-trip-plan \
-t t4-memory-recall-continuation \
-t t5-hallucination-resistant-evidence \
-o results/opus46_core_v1.json
$TASK_ARGS \
--output results/core_v1_opus46.json
```
### Analyze a real archive
Build the public Space image:
```bash
# Fair-comparison audit
python3 scripts/audit_runs.py
python3 scripts/generate_fair_report.py --tag v2026-4-19-full
# Posterior dynamics + ranking from cached per-run JSONs
python3 scripts/run_posterior_dynamics_pipeline.py \
--archive-dir .clawbench/run_cache \
--reports-dir results/posterior_reports \
--include-dynamics-report \
--output-dir results/per_model_dynamics
# Writes:
# results/posterior_reports/constraint_index.json
# results/posterior_reports/regimes.json
# results/posterior_reports/variance_decomposition.json
# results/posterior_reports/survival_analysis.json
# results/posterior_reports/snr_weighted_ranking.json
# results/posterior_reports/EVAL_REPORT_DYNAMICAL.md
# results/per_model_dynamics/<safe_model_name>/dynamics.json
# results/per_model_dynamics/<safe_model_name>/*.png
docker build -t clawbench .
docker run --rm --entrypoint openclaw clawbench --version
```
If you only want one model's offline dynamics bundle:
## Hidden-Suite Reproduction
The hidden full-suite runner is public, but the task content is not. To rerun
an internal hidden-suite comparison, restore the private task archive into
`./tasks/` before building the hidden eval image. Do not commit that directory,
its logs, or generated per-task traces.
```bash
clawbench dynamics-report \
--archive-dir .clawbench/run_cache \
--model ollama/gpt-oss:20b \
--output-dir results/gptoss_dynamics
docker build -f Dockerfile.openclaw-426-agent-hotfix \
-t openclaw-426-agent-hotfix:latest .
# Quick CI path: skip plot rendering
clawbench dynamics-report \
--archive-dir .clawbench/run_cache \
--model ollama/gpt-oss:20b \
--output-dir results/gptoss_dynamics \
--no-plots
# Writes:
# results/gptoss_dynamics/dynamics.json
docker build -f Dockerfile.clawbench-426-agent-hotfix \
-t clawbench-openclaw-426-agent-hotfix:latest .
```
### Running locally with small models (Ollama)
The public repo intentionally does not include exact private task IDs, prompts,
assets, expected artifacts, or trace-derived private reports.
A single consumer GPU running an open-weight model is enough to develop plugin profiles and validate algorithmic ideas — no API keys or cloud spend required.
## Analysis Tools
```bash
ollama pull gpt-oss:20b
export OPENCLAW_GATEWAY_TOKEN=<your-gateway-token>
export CLAWBENCH_RUN_CACHE_DIR=$PWD/.clawbench/run_cache
Reusable scripts that operate on public or private result archives:
# Real benchmark run + immediate per-run dynamics bundle
clawbench run \
--model ollama/gpt-oss:20b \
--task t1-fs-quick-note \
--runs 1 \
--dynamics \
-o results/ollama_smoke.json
- `scripts/container_lane_eval.sh`: isolated OpenClaw lane runner.
- `scripts/container_adapter_eval.sh`: adapter/model runner for fair adapter comparisons.
- `scripts/run_posterior_dynamics_pipeline.py`: one-shot offline dynamics analysis.
- `scripts/compute_constraint_index.py`: task-level constraint index.
- `scripts/variance_decomp.py`: seed-noise vs capability-signal decomposition.
- `scripts/survival_analysis.py`: per-turn failure survival curves.
- `scripts/snr_weighted_ranking.py`: SNR-weighted ranking.
# Optional second local model
ollama pull qwen3.5:27b
Generated data, traces, and reports are local artifacts and are ignored by Git.
# Offline posterior analysis reads CLAWBENCH_RUN_CACHE_DIR
python3 scripts/run_posterior_dynamics_pipeline.py \
--archive-dir .clawbench/run_cache \
--reports-dir results/posterior_reports
## Repository Layout
clawbench diagnose profiles/local_ollama_gpt_oss.yaml
```
---
## Partner Trace Spec
ClawBench defines a [JSONL interchange format](PARTNER_TRACE_SPEC.md) for agent execution traces. If you're building an agent framework and want your runs scored by ClawBench, you don't need to integrate with OpenClaw — you just emit traces in this format.
The trace captures:
- **Harness provenance** — git SHA, container image digest, runtime version
- **Full tool-call sequence** — family, arguments, output, success/failure, timing
- **Token accounting** — input, output, reasoning, cache tokens per message
- **Artifacts** — final files, test results, command outputs
- **Redaction metadata** — what was removed for privacy, so scoring can account for it
This means ClawBench scores are **reproducible** across different harness implementations, and **auditable** down to individual tool calls.
---
## Repository layout
```
```text
clawbench/
├── clawbench/ # Core package
│ ├── scorer.py # 4-axis scoring with gated judge
│ ├── trajectory.py # Trace-based process quality grading
│ ├── environment.py # 5 deterministic verifier types
│ ├── judge.py # LLM judge (gated, never rescues failures)
│ ├── harness.py # Benchmark orchestration + parallel lanes
│ ├── schemas.py # 13-mode failure taxonomy + result schemas
│ ├── stats.py # Bootstrap CI + Taguchi S/N
│ ├── profile.py # v0.5 plugin fingerprinting
│ ├── diagnostic.py # Configuration Diagnostic report
│ ├── factor_analysis.py # fANOVA factor importance
│ ├── dynamics.py # Trajectory metrics + sensitivity analysis
│ ├── dynamics_archive.py # Cached-run loading + offline report assembly
│ ├── dynamics_plots.py # Offline dynamics visualizations
│ └── cli.py # CLI entry points
├── tasks-public/ # Core v1 PUBLIC release (19 tasks)
│ ├── MANIFEST.yaml # Task list + reference ranking + metadata
│ ├── README.md # Rationale, build + run instructions
│ ├── tier1/ ... tier5/ # 19 task YAMLs with verification specs
│ └── assets/ # 19 asset packs (verifiers + fixtures)
├── tasks/ # PRIVATE 40-task dev pool (gitignored)
├── scripts/ # Reproducibility + analysis pipeline
│ ├── container_sweep_single.sh # Per-container OPENCLAW_STATE_DIR isolation
│ ├── audit_runs.py # Aggregate coverage + fair-comparison audit
│ ├── audit_per_run.py # Per-run cross-model audit
│ ├── rejudge_all.py # Direct-API rejudge for broken gateway judges
│ ├── generate_fair_report.py # Fair N-model comparison report
│ ├── run_posterior_dynamics_pipeline.py # One-shot posterior analysis driver
│ ├── compute_constraint_index.py # C(q) per task
│ ├── classify_regimes.py # Per-run dynamical regime classifier
│ ├── variance_decomp.py # Seed-noise vs capability-signal decomposition
│ ├── survival_analysis.py # Per-turn failure survival curves
│ ├── snr_weighted_ranking.py # SNR × |C(q)|-weighted ranking
│ └── generate_dynamical_report.py # Combined dynamical-systems report
├── profiles/ # v0.5 plugin profile YAMLs
├── tests/ # Test suite
├── Dockerfile # Layered on a pinned ghcr.io/openclaw/openclaw image
├── CLAWBENCH_V0_4_SPEC.md # Full specification
└── PARTNER_TRACE_SPEC.md # Trace interchange format
├── clawbench/ # Harness, adapters, scoring, diagnostics
├── tasks-public/ # Core v1 public task suite
├── tasks-domain/ # Domain expansion scaffold
├── profiles/ # Model/profile definitions
├── scripts/ # Reusable runners and offline analysis
├── tests/ # Public test suite
├── Dockerfile # Public HF Space image
├── Dockerfile.main # Main-variant public image
├── Dockerfile.openclaw-426-agent-hotfix
├── Dockerfile.clawbench-426-agent-hotfix
├── CLAWBENCH_V0_4_SPEC.md
└── PARTNER_TRACE_SPEC.md
```
---
## How ClawBench compares
| | ClawBench | SWE-bench | HumanEval | LLM-judge leaderboards |
|---|---|---|---|---|
| **Scores process, not just output** | ✓ Trace-based trajectory + behavior | No | No | No |
| **Reliability as first-class metric** | ✓ pass^k, Taguchi S/N, bootstrap CI | Single pass rate | pass@k | Best-of-n |
| **Variance decomposition reported** | ✓ seed-noise vs capability-signal ratio | No | No | No |
| **Per-run dynamical regime** | ✓ trapped / cycle / diffusive | No | No | No |
| **SNR-weighted alternative ranking** | ✓ principled task weighting | No | No | No |
| **Failure taxonomy** | ✓ 13 deterministic modes | Binary pass/fail | Binary | None |
| **LLM judge role** | Capped 10%, gated on deterministic floor | Not used | Not used | Primary scorer |
| **Configuration diagnostics** | ✓ Fingerprint, predict, explain, recommend | No | No | No |
| **State-isolation per run** | ✓ per-container OPENCLAW_STATE_DIR | No | No | No |
| **Multiple runs per task** | 3 runs mandatory, statistical tests | Usually 1 | Varies | Usually 1 |
| **Provider-routing caveats** | ✓ documented (OpenRouter drift) | Not flagged | Not flagged | Not flagged |
| **Real tool composition** | ✓ Browser + code + memory + cron + delegation | Code only | Code only | Varies |
---
## Testing
```bash
python -m pytest -q
```
Key test invariants:
- Judge never rescues failed deterministic completion (`test_scorer.py`)
- Parallel lanes are isolated (`test_parallel_harness.py`)
- Bootstrap CIs are statistically valid (`test_e2e_significance.py`)
- fANOVA factor importance converges (`test_v05_framework.py`)
---
## Version log
| Version | Date | Summary |
|:---:|---|---|
| **Core v1** | 2026-04-20 | 19-task signal-curated public release; dynamical-systems diagnostics (C(q), regimes, survival, SNR-weighted); per-container state isolation; rejudge pipeline |
| v0.5 | earlier | Configuration Diagnostic (fingerprint, predict, fANOVA); plugin-native ablation |
| v0.4 | earlier | 4-axis scoring with gated judge; 13-mode failure taxonomy; Partner Trace Spec |
Planned for Core v2:
- **Tier 6 long-horizon tasks** (100+ turn runs) — unlock real Lyapunov / attractor measurement
- **Paraphrased prompt pairs** — enable perturbation-sensitivity ranking
- **Creative-synthesis tasks** — currently absent from Core v1
- **Human-performance baseline** on 10 tasks — calibrate difficulty
---
The test suite includes public-surface checks to keep the README and Space
description aligned with `tasks-public/MANIFEST.yaml`.
## License
@ -591,13 +219,3 @@ MIT. See `LICENSE`.
url = {https://github.com/openclaw/clawbench}
}
```
---
<div align="center">
**ClawBench** — Rigorous. Reproducible. Dynamical.
[Dataset](https://huggingface.co/datasets/openclaw/clawbench-results) · [Space](https://huggingface.co/spaces/openclaw/clawbench) · [Core v1](tasks-public/) · [Spec](CLAWBENCH_V0_4_SPEC.md)
</div>

View File

@ -13,188 +13,70 @@ license: mit
Execution-first benchmark for AI models acting as OpenClaw agents.
This Space evaluates models on realistic local agent tasks and scores them with a deterministic pipeline that emphasizes:
- **Completion**: did the work actually pass executable checks?
- **Trajectory**: did the agent explore, recover, and use tools well?
- **Behavior**: did the transcript show planning, progress updates, and safe handling?
- **Reliability**: was performance stable across repeated runs?
## Why this benchmark exists
ClawBench is built to avoid three common benchmark failures:
1. trusting what the agent said instead of running the work,
2. rewarding one reference trajectory instead of rewarding good agent properties,
3. hiding instability by reporting only one lucky run.
## Benchmark shape
## Benchmark Shape
```text
tasks : 20
public suite : Core v1
tasks : 19
runs/model : 57 for official Core v1 comparisons
tiers : 5
prompt modes : clear + ambiguous on every task
browser tasks : 2
multi-phase : 1
judge-enabled : 6 advisory tasks
primary metric : pass^k
primary metric : trace-scored task score plus reliability
```
### Tier mix
```text
tier1 | ### 3
tier2 | ##### 5
tier3 | ##### 5
tier4 | #### 4
tier5 | ### 3
```
### Family mix
```text
repo | ###### 6
coding | #### 4
multi_tool | ### 3
adversarial | ### 3
browser | ## 2
tools | ## 2
```
## Official score stack
Per-run score:
```text
normalize(0.4 * completion + 0.3 * trajectory + 0.2 * behavior)
```
Per-task score after repeated runs:
```text
0.9 * mean_run_score + 0.1 * reliability_score
```
Reliability:
```text
0.5 * pass_hat_k + 0.3 * pass_rate + 0.2 * variance_score
```
## What gets verified
## What Gets Scored
| Layer | Verification style |
| --- | --- |
| Completion | `pytest`, `node --test`, exact output checks, browser flow checks, cron checks, memory checks, gateway assertions |
| Trajectory | read-before-write, self-verification, recovery quality, tool-family fit, safety rules |
| Behavior | deterministic transcript rules for planning, progress, blocker handling, refusal quality, destructive-command avoidance |
|---|---|
| Completion | `pytest`, exact output checks, browser flow checks, file checks, and verifier scripts |
| Trajectory | read-before-write, self-verification, recovery quality, tool-family fit, and safety rules |
| Behavior | deterministic transcript checks for planning, progress, blockers, and safe handling |
| Reliability | repeated runs with pass^k, pass rate, and score variance |
The official score stays deterministic.
The advisory judge is optional and cannot replace deterministic verification.
Optional advisory judge results are reported separately and never replace executable verification.
## Runtime flow
## Runtime Flow
```text
task yaml + assets
-> isolated workspace
-> optional local background services
-> OpenClaw agent session(s)
-> OpenClaw agent session
-> transcript + tool-result capture
-> completion / trajectory / behavior scoring
-> repeated runs
-> reliability aggregation
-> leaderboard result
```
## Browser policy
## Public Task Inventory
Browser tasks in this Space are deterministic and local:
The Space uses `tasks-public/MANIFEST.yaml` as the source of truth. Current
Core v1 tasks are:
```text
task-owned local app or docs
-> OpenClaw browser tool
-> real browser interaction
-> deterministic local verification
```
| Task | Tier | Family |
|---|---|---|
| `t1-bugfix-discount` | tier1 | coding |
| `t1-fs-quick-note` | tier1 | tools |
| `t2-add-tests-normalizer` | tier2 | coding |
| `t2-browser-form-fix` | tier2 | browser |
| `t2-config-loader` | tier2 | repo |
| `t2-fs-find-that-thing` | tier2 | tools |
| `t2-msg-summarize-thread` | tier2 | tools |
| `t2-priv-redact-doc` | tier2 | tools |
| `t3-data-pipeline-report` | tier3 | multi_tool |
| `t3-data-sql-query` | tier3 | tools |
| `t3-feature-export` | tier3 | repo |
| `t3-msg-inbox-triage` | tier3 | tools |
| `t3-web-research-and-cite` | tier3 | tools |
| `t4-browser-research-and-code` | tier4 | browser |
| `t4-cross-repo-migration` | tier4 | repo |
| `t4-delegation-repair` | tier4 | multi_tool |
| `t4-life-trip-plan` | tier4 | tools |
| `t4-memory-recall-continuation` | tier4 | multi_tool |
| `t5-hallucination-resistant-evidence` | tier5 | adversarial |
No public websites are used for official browser tasks.
## Holdout Policy
## Parallel Space runtime
On upgraded CPU Spaces, the worker can use conservative parallel lanes:
```text
submission
-> task partitioner
-> lane 1 gateway + lane-local state
-> lane 2 gateway + lane-local state
-> browser lane gateway + lane-local state
-> merged benchmark result
```
Important rule: browser tasks stay serialized on one dedicated lane to avoid Chromium and port-range collisions.
## Submission presets
The Submit tab now exposes two preset audiences so the Space can serve both general Claw users and lower-budget exploratory runs:
- `Claw Users` keeps the full preset catalog, including provider-backed frontier models.
- `Budget Researchers` narrows the list to local or lower-cost presets such as `ollama/gpt-oss:20b`, `ollama/qwen3.5:27b`, `huggingface/Qwen/Qwen3-32B`, and `huggingface/google/gemma-4-26B-A4B-it`.
You can still enter any custom model ID directly; the preset audience only filters the shortcut catalog and the bulk-submit action.
## Task inventory
| Task | Tier | Family | Main verification |
| --- | --- | --- | --- |
| `t1-architecture-brief` | tier1 | tools | fact verifier + smoke command |
| `t1-bugfix-discount` | tier1 | coding | `pytest` |
| `t1-refactor-csv-loader` | tier1 | coding | `pytest` + verification script |
| `t2-add-tests-normalizer` | tier2 | coding | `pytest` + added-test checks |
| `t2-browser-form-fix` | tier2 | browser | local browser flow verification |
| `t2-config-loader` | tier2 | repo | `pytest` |
| `t2-log-analyzer-cli` | tier2 | coding | exact JSON output |
| `t2-node-search-patch` | tier2 | repo | `node --test` |
| `t3-data-pipeline-report` | tier3 | multi_tool | exact report output |
| `t3-debug-timezone-regression` | tier3 | repo | `pytest` |
| `t3-feature-export` | tier3 | repo | `pytest` + CLI smoke |
| `t3-monitoring-automation` | tier3 | tools | script output + cron state |
| `t3-node-multifile-refactor` | tier3 | repo | `node --test` |
| `t4-browser-research-and-code` | tier4 | browser | browser evidence + tests |
| `t4-cross-repo-migration` | tier4 | repo | both test suites pass |
| `t4-delegation-repair` | tier4 | multi_tool | final suite + delegation transcript evidence |
| `t4-memory-recall-continuation` | tier4 | multi_tool | tests + memory assertions |
| `t5-contradictory-requirements` | tier5 | adversarial | latest-instruction artifact checks |
| `t5-hallucination-resistant-evidence` | tier5 | adversarial | exact answer + evidence-first checks |
| `t5-impossible-graceful-fail` | tier5 | adversarial | no harmful mutation + clear refusal |
## Query coverage layer
The benchmark also carries dataset-backed metadata from a spreadsheet-derived query corpus:
- scenario-domain mapping,
- clear vs ambiguous prompt slices,
- pass / partial / fail delivery buckets,
- weighted query-score reporting.
This lets the benchmark report both:
- how strong a model is,
- and what parts of the user-query landscape the suite is actually stressing.
## What makes ClawBench meaningful now
- execution-based completion checks instead of file-exists-only scoring
- property-based trajectory scoring instead of reference-trace matching
- deterministic local browser tasks instead of internet targets
- repeated-run reliability instead of one-shot success stories
- tiered tasks with delegation, memory, browser, repo, and adversarial surfaces
- advisory judge support without making the official score depend on a second model
## Auth model
The benchmark does not require a separate scorer or user-simulation API key.
It uses the model-under-test auth already configured for OpenClaw. If you enable the optional advisory judge, that model can reuse the same general auth path if available.
Private task bodies, assets, expected outputs, verifier details, run traces,
logs, and per-task private reports are not part of the public Space. Public
Core v1 is intended for reproducibility and development; hidden-suite runs use
the same harness with a private task directory restored locally.

313
clawbench/ablation.py Normal file
View File

@ -0,0 +1,313 @@
"""Ablation profiles and fair-comparison helpers.
The benchmark can only explain model, harness, and tool effects if those
axes are represented explicitly in run metadata. This module keeps that
representation small and deterministic: a harness driver plus a tool
profile yields a fingerprint, and result comparison refuses to call a
delta fair when models or task sets drift.
"""
from __future__ import annotations
import hashlib
import json
import subprocess
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Iterable
from pydantic import BaseModel, Field
from clawbench.adapters import get_adapter
from clawbench.adapters.base import AdapterConfig
from clawbench.canonical import AdapterCapability
from clawbench.canonical.convert import from_task_definition
from clawbench.schemas import BenchmarkResult, TaskDefinition
# Maps each adapter capability to the interface label recorded in a
# ToolProfile. `capabilities_to_interfaces` falls back to the enum's own
# `.value` for any capability that has no entry here.
CAPABILITY_TO_INTERFACE: dict[AdapterCapability, str] = {
    AdapterCapability.FILES: "filesystem",
    AdapterCapability.EXECUTION: "shell",
    AdapterCapability.MEMORY: "memory",
    AdapterCapability.SESSION: "session",
    AdapterCapability.CRON: "scheduler",
    AdapterCapability.BROWSER: "browser",
    AdapterCapability.GATEWAY_RPC: "gateway_rpc",
    AdapterCapability.MULTI_TURN_INJECTION: "multi_turn",
}
class HarnessDescriptor(BaseModel):
    """Describe which agent loop (harness) produced a run.

    Only `adapter` is required; every other field defaults to an empty
    string (or "clawbench" for `invocation`) so partially known
    provenance can still be recorded.
    """

    # Adapter name the run was executed through.
    adapter: str
    # Harness driver label; empty when not supplied.
    driver: str = ""
    # Harness version string; empty when unknown.
    version: str = ""
    # Harness git commit; empty when unknown.
    git_sha: str = ""
    # Where the harness came from; free-form, empty when unknown.
    source: str = ""
    # How the harness was invoked; defaults to "clawbench".
    invocation: str = "clawbench"
class ToolProfile(BaseModel):
    """Describe the tools and interfaces a harness run was given.

    `fingerprint` is empty until `with_fingerprint()` is called; that
    method returns a copy and leaves the receiver unchanged.
    """

    name: str
    mode: str = "native"
    # Interface labels (see CAPABILITY_TO_INTERFACE).
    interfaces: list[str] = Field(default_factory=list)
    # Raw adapter capability values.
    adapter_capabilities: list[str] = Field(default_factory=list)
    enabled_toolsets: list[str] = Field(default_factory=list)
    disabled_toolsets: list[str] = Field(default_factory=list)
    tools: list[str] = Field(default_factory=list)
    # First 16 hex chars of a SHA-256 over the fields above.
    fingerprint: str = ""

    def with_fingerprint(self) -> "ToolProfile":
        """Return a copy whose `fingerprint` reflects the current fields.

        Every list field is sorted before hashing, so the digest does not
        depend on the order in which entries were supplied. The existing
        `fingerprint` value is not part of the hashed payload.
        """
        canonical: dict[str, Any] = {"name": self.name, "mode": self.mode}
        for list_field in (
            "interfaces",
            "adapter_capabilities",
            "enabled_toolsets",
            "disabled_toolsets",
            "tools",
        ):
            canonical[list_field] = sorted(getattr(self, list_field))
        # sort_keys + compact separators give one canonical byte encoding.
        encoded = json.dumps(
            canonical, sort_keys=True, separators=(",", ":")
        ).encode("utf-8")
        short_hash = hashlib.sha256(encoded).hexdigest()[:16]
        return self.model_copy(update={"fingerprint": short_hash})
class AblationProfile(BaseModel):
    """Run-level axis metadata: model, harness, tool profile, prompt profile.

    Intended to be embedded in `BenchmarkResult.environment`.
    """

    model: str
    harness: HarnessDescriptor
    tool_profile: ToolProfile
    prompt_profile: str = "clear"
    # First 16 hex chars of a SHA-256 over the four axes above.
    fingerprint: str = ""

    def with_fingerprint(self) -> "AblationProfile":
        """Return a copy with both the tool profile and this profile stamped.

        The nested tool profile is fingerprinted first, and its dumped
        form (including that fingerprint) is part of the hashed payload.
        """
        stamped_tools = self.tool_profile.with_fingerprint()
        canonical = json.dumps(
            {
                "model": self.model,
                "harness": self.harness.model_dump(),
                "tool_profile": stamped_tools.model_dump(),
                "prompt_profile": self.prompt_profile,
            },
            sort_keys=True,
            separators=(",", ":"),
        )
        short_hash = hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16]
        return self.model_copy(
            update={"tool_profile": stamped_tools, "fingerprint": short_hash}
        )
@dataclass(frozen=True)
class FairTaskSet:
    """Immutable result of intersecting task compatibility across adapters."""

    # Task ids every compared adapter can run.
    task_ids: list[str]
    # task id -> human-readable reasons it was skipped; empty by default.
    skipped: dict[str, list[str]] = field(default_factory=dict)
def capabilities_to_interfaces(capabilities: Iterable[AdapterCapability | str]) -> list[str]:
    """Translate capabilities into a sorted, de-duplicated list of interface labels.

    Non-enum entries are coerced through `AdapterCapability(str(entry))`,
    so an unrecognised value raises whatever that constructor raises.
    Capabilities missing from `CAPABILITY_TO_INTERFACE` fall back to
    their enum `.value`.
    """
    labels: set[str] = set()
    for entry in capabilities:
        if isinstance(entry, AdapterCapability):
            capability = entry
        else:
            capability = AdapterCapability(str(entry))
        labels.add(CAPABILITY_TO_INTERFACE.get(capability, capability.value))
    return sorted(labels)
def adapter_capabilities(
    adapter: str,
    config: AdapterConfig | None = None,
) -> set[AdapterCapability]:
    """Return the capabilities the named adapter reports for `config`.

    The adapter class is resolved through `get_adapter`, so an
    unregistered name propagates that lookup's error.
    """
    return get_adapter(adapter).supported_capabilities(config)
def default_tool_profile(
    *,
    adapter: str,
    config: AdapterConfig | None = None,
    name: str | None = None,
    mode: str = "native",
    enabled_toolsets: list[str] | None = None,
    disabled_toolsets: list[str] | None = None,
) -> ToolProfile:
    """Build a fingerprinted `ToolProfile` from an adapter's capabilities.

    When `name` is empty or None the profile is named "<adapter>-<mode>".
    Missing toolset lists are recorded as empty lists.
    """
    supported = adapter_capabilities(adapter, config)
    capability_values = sorted(cap.value for cap in supported)
    return ToolProfile(
        name=name if name else f"{adapter}-{mode}",
        mode=mode,
        interfaces=capabilities_to_interfaces(supported),
        adapter_capabilities=capability_values,
        enabled_toolsets=enabled_toolsets if enabled_toolsets else [],
        disabled_toolsets=disabled_toolsets if disabled_toolsets else [],
    ).with_fingerprint()
def compatible_task_ids(
    tasks: Iterable[TaskDefinition],
    *,
    adapter: str,
    config: AdapterConfig | None = None,
) -> tuple[list[str], dict[str, list[str]]]:
    """Split tasks into those the adapter can run and those it cannot.

    Returns `(runnable_ids, skipped)`, where `skipped` maps each
    incompatible task id to the sorted values of the capabilities the
    adapter lacks. Runnable ids keep the input order.
    """
    available = adapter_capabilities(adapter, config)
    runnable: list[str] = []
    blocked: dict[str, list[str]] = {}
    for task in tasks:
        required = set(from_task_definition(task).required_adapter_capabilities)
        gaps = required - available
        if not gaps:
            runnable.append(task.id)
            continue
        blocked[task.id] = sorted(cap.value for cap in gaps)
    return runnable, blocked
def common_compatible_task_set(
    tasks: Iterable[TaskDefinition],
    adapter_configs: dict[str, tuple[str, AdapterConfig | None]],
) -> FairTaskSet:
    """Intersect task compatibility across several labelled adapters.

    `adapter_configs` maps a display label to `(adapter_name, config)`.
    The returned task ids are those every adapter can run, in input
    order; `skipped` collects one "<label>: <missing caps>" entry per
    adapter that cannot run a task. With no adapters the task id list
    is empty.
    """
    task_list = list(tasks)  # materialise once; iterated per adapter
    shared: set[str] | None = None
    skipped: dict[str, list[str]] = {}
    for label, (adapter, config) in adapter_configs.items():
        runnable, blocked = compatible_task_ids(
            task_list, adapter=adapter, config=config
        )
        runnable_set = set(runnable)
        if shared is None:
            shared = runnable_set
        else:
            shared = shared & runnable_set
        for task_id, missing_caps in blocked.items():
            reason = f"{label}: {', '.join(missing_caps)}"
            skipped.setdefault(task_id, []).append(reason)
    keep = shared or set()
    ordered = [task.id for task in task_list if task.id in keep]
    return FairTaskSet(task_ids=ordered, skipped=skipped)
def build_ablation_profile(
    *,
    model: str,
    adapter: str,
    config: AdapterConfig | None = None,
    prompt_profile: str = "clear",
    harness_version: str = "",
    harness_git_sha: str = "",
    harness_source: str = "",
    driver: str = "",
    tool_profile_name: str | None = None,
    enabled_toolsets: list[str] | None = None,
    disabled_toolsets: list[str] | None = None,
) -> AblationProfile:
    """Assemble a fingerprinted `AblationProfile` for one run.

    The harness descriptor is built from the `harness_*` and `driver`
    arguments; the tool profile comes from `default_tool_profile` for
    the same adapter and config, using its default mode.
    """
    profile = AblationProfile(
        model=model,
        harness=HarnessDescriptor(
            adapter=adapter,
            driver=driver,
            version=harness_version,
            git_sha=harness_git_sha,
            source=harness_source,
        ),
        tool_profile=default_tool_profile(
            adapter=adapter,
            config=config,
            name=tool_profile_name,
            enabled_toolsets=enabled_toolsets,
            disabled_toolsets=disabled_toolsets,
        ),
        prompt_profile=prompt_profile,
    )
    return profile.with_fingerprint()
def compare_results(results: dict[str, BenchmarkResult]) -> dict[str, Any]:
    """Return score deltas plus fairness checks for result JSONs.

    Args:
        results: Mapping of a display label to its benchmark result. The
            first label is the baseline for `deltas`.

    Returns:
        A dict with:
        - `fair` / `task_verifier_fair`: True when all results cover the
          same tasks, task snapshot, prompt variant, and benchmark release.
        - `controlled_ablation`: the above and all results share one model.
        - `same_*`: the individual checks.
        - `models`, `task_sets`: per-label model and task ids in their
          original order.
        - `rows`: per-label score summary and metadata.
        - `deltas`: `"<label>_minus_<baseline>"` -> score difference,
          rounded to 4 decimals.

    Snapshot, prompt-variant, and release checks ignore results where the
    value is empty, so a result missing that metadata does not by itself
    make a comparison unfair.
    """
    labels = list(results)
    models = {label: result.model for label, result in results.items()}
    task_sets = {
        label: [task.task_id for task in result.task_results]
        for label, result in results.items()
    }
    # Compare sorted id lists: the order in which tasks were recorded is
    # not drift, but a missing, extra, or duplicated task is.
    normalized_sets = [sorted(task_ids) for task_ids in task_sets.values()]
    same_task_set = all(
        task_ids == normalized_sets[0] for task_ids in normalized_sets
    )
    same_model = len(set(models.values())) == 1

    snapshot_fingerprints = {
        result.task_snapshot_fingerprint
        for result in results.values()
        if result.task_snapshot_fingerprint
    }
    same_task_snapshot = len(snapshot_fingerprints) <= 1

    prompt_variants = {
        str(result.environment.get("prompt_variant", ""))
        for result in results.values()
        if result.environment.get("prompt_variant", "")
    }
    same_prompt_variant = len(prompt_variants) <= 1

    benchmark_releases = {
        result.benchmark_release_id
        for result in results.values()
        if result.benchmark_release_id
    }
    same_benchmark_release = len(benchmark_releases) <= 1

    task_verifier_fair = (
        same_task_set
        and same_task_snapshot
        and same_prompt_variant
        and same_benchmark_release
    )

    rows: dict[str, Any] = {}
    for label, result in results.items():
        rows[label] = {
            "model": result.model,
            "adapter": result.environment.get("adapter", ""),
            "score": result.overall_score,
            "completion": result.overall_completion,
            "trajectory": result.overall_trajectory,
            "behavior": result.overall_behavior,
            "reliability": result.overall_reliability,
            "task_count": len(result.task_results),
            "task_snapshot_fingerprint": result.task_snapshot_fingerprint,
            "benchmark_release_id": result.benchmark_release_id,
            "prompt_variant": result.environment.get("prompt_variant", ""),
            "dimension_coverage": result.environment.get("dimension_coverage", {}),
            "ablation": result.environment.get("ablation_profile", {}),
        }

    deltas: dict[str, float] = {}
    if labels:
        baseline_label = labels[0]
        baseline = results[baseline_label].overall_score
        for label in labels[1:]:
            deltas[f"{label}_minus_{baseline_label}"] = round(
                results[label].overall_score - baseline,
                4,
            )

    return {
        "fair": bool(task_verifier_fair),
        "task_verifier_fair": bool(task_verifier_fair),
        "controlled_ablation": bool(task_verifier_fair and same_model),
        "same_model": same_model,
        "same_task_set": same_task_set,
        "same_task_snapshot": same_task_snapshot,
        "same_prompt_variant": same_prompt_variant,
        "same_benchmark_release": same_benchmark_release,
        "models": models,
        "task_sets": task_sets,
        "rows": rows,
        "deltas": deltas,
    }
def git_head(path: Path) -> tuple[str, str]:
    """Best-effort `(sha, describe)` for harness provenance.

    Runs `git rev-parse HEAD` and `git describe --tags --always --dirty`
    against `path`. Any failure (git missing, not a repository, non-zero
    exit, ...) yields `("", "")` rather than raising.
    """
    try:
        repo = str(path)
        commands = (
            ["git", "-C", repo, "rev-parse", "HEAD"],
            ["git", "-C", repo, "describe", "--tags", "--always", "--dirty"],
        )
        # Unpacking drives both subprocess calls inside the try block.
        sha, desc = (
            subprocess.check_output(
                command,
                text=True,
                stderr=subprocess.DEVNULL,
            ).strip()
            for command in commands
        )
    except Exception:
        # Provenance is optional; never let it break a benchmark run.
        return "", ""
    return sha, desc

View File

@ -0,0 +1,102 @@
"""Agent adapter layer — Phase-4 of CLAWBENCH_V0_4_SPEC.md.
Adapters plug an agent framework (OpenClaw, Hermes, Codex, Claude Code,
Deerflow, etc.) into ClawBench's canonical task pipeline. Each adapter is
responsible for:
- Setting up the workspace + seed state from a `CanonicalTask`.
- Driving the agent through each `CanonicalPhase`'s simulated user.
- Returning a canonical `Transcript` so the scorer, trajectory analyser,
and judge can score the run unchanged.
- Resolving `StateQuery` assertions that fall under its declared
capabilities; returning `capability_missing=True` for queries that
require a capability the adapter doesn't provide.
The `ADAPTERS` registry is populated by each adapter module at import
time. `get_adapter(name)` is the canonical lookup.
"""
from __future__ import annotations
from clawbench.adapters.base import (
AdapterConfig,
AdapterContext,
AgentAdapter,
PhaseResult,
StateQueryResult,
)
#: Registry of adapter_name → adapter class. Populated by the adapter
#: modules at import time (e.g. `from clawbench.adapters.openclaw import *`
#: registers the OpenClaw adapter). Callers should use `get_adapter`
#: rather than reading this dict directly. Within this module,
#: `register_adapter` is the only function that writes to it.
ADAPTERS: dict[str, type[AgentAdapter]] = {}
def register_adapter(cls: type[AgentAdapter]) -> type[AgentAdapter]:
    """Register an adapter class under its `name` and return it unchanged.

    Usable as a class decorator or as a plain call:

        @register_adapter
        class HermesAdapter(AgentAdapter):
            name = "hermes"
            ...

    Re-registering the same class is a no-op.

    Raises:
        ValueError: if `cls` has no non-empty `name` attribute, or if the
            name is already registered to a different class.
    """
    name = getattr(cls, "name", "")
    if not name:
        raise ValueError(f"{cls.__name__} must set a non-empty `name` class attribute")
    existing = ADAPTERS.get(name)
    if existing is None or existing is cls:
        ADAPTERS[name] = cls
        return cls
    # A different class already owns this name; refuse to overwrite it.
    raise ValueError(
        f"Adapter name collision: '{name}' is already registered "
        f"to {existing.__qualname__}"
    )
def get_adapter(name: str) -> type[AgentAdapter]:
    """Return the adapter class registered under `name`.

    The adapter's module must have been imported first so that its
    registration has run. `clawbench.adapters.openclaw` always loads;
    optional adapters (hermes, codex) guard their imports and raise a clear
    error if their runtime dep isn't installed.

    Raises:
        KeyError: if nothing is registered under `name`; the message lists
            the names that are registered.
    """
    try:
        adapter_cls = ADAPTERS[name]
    except KeyError as missing:
        listing = ", ".join(sorted(ADAPTERS)) or "(none)"
        raise KeyError(
            f"Unknown adapter '{name}'. Registered adapters: {listing}"
        ) from missing
    return adapter_cls
__all__ = [
"ADAPTERS",
"AdapterConfig",
"AdapterContext",
"AgentAdapter",
"PhaseResult",
"StateQueryResult",
"get_adapter",
"register_adapter",
]
# Register built-in adapters at import time. Each adapter module is
# expected to @register_adapter its class. OpenClaw is always
# available; optional adapters (hermes, codex) guard their imports and
# are registered only when their runtime dep is present.
from clawbench.adapters import openclaw as _openclaw # noqa: E402,F401
try:
from clawbench.adapters import hermes as _hermes # noqa: E402,F401
except Exception:
# hermes-agent is an optional extra; absence is fine.
_hermes = None # type: ignore[assignment]

234
clawbench/adapters/base.py Normal file
View File

@ -0,0 +1,234 @@
"""Agent adapter ABC and associated data shapes.
An `AgentAdapter` is the execution counterpart to a `CanonicalTask`. It
is the only place where framework-specific details (OpenClaw gateway
RPCs, Hermes `MiniSWERunner`, Claude Code SDK, etc.) live. Everything
downstream of the adapter — trajectory analysis, scorer, judge, stats —
consumes a canonical `Transcript` and `TaskRunResult` produced by the
adapter, so those modules stay unchanged across adapters.
Lifecycle per task run:
1. Harness instantiates `adapter = AdapterClass(config)`.
2. `async with adapter as adapter:` starts subprocesses / websockets
/ whatever this adapter needs to hold open across a run.
3. `await adapter.setup(ctx)` realizes seed state, workspace files,
background services, pre-run state queries.
4. For each `CanonicalPhase`: `await adapter.run_phase(phase, ctx)`
drives the simulated user against the agent, returns a
`PhaseResult` with the transcript increment.
5. For each `StateQuery` in `task.verifier.state_queries`:
`await adapter.verify_state_query(query, ctx)` returns whether
the assertion held, or that the adapter lacks the capability.
6. `await adapter.teardown(ctx)` cleans up agent-side state (the
workspace itself is harness-owned).
"""
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, ClassVar
from clawbench.canonical import (
AdapterCapability,
CanonicalPhase,
CanonicalTask,
StateQuery,
)
from clawbench.schemas import Transcript, TranscriptMessage
@dataclass
class AdapterConfig:
    """Configuration shared by every adapter.

    Concrete adapters extend this dataclass with their own fields. The
    harness builds an instance from CLI flags / env vars and hands it to the
    adapter constructor.
    """

    #: Primary model identifier; its meaning is adapter-specific (an
    #: OpenClaw model id, a Hermes `--model` string, etc.).
    model: str = ""
@dataclass
class AdapterContext:
    """State for one task run, passed to every adapter method.

    `transcript` is shared and mutated in place: each `run_phase` call
    appends the messages it observed, so the scorer sees a single
    consolidated `Transcript` once all phases have run.
    """

    task: CanonicalTask
    workspace: Path
    runtime_values: dict[str, Any]
    run_index: int
    model: str
    transcript: Transcript

    #: Scratch space owned by the adapter (e.g. the OpenClaw `session_key`
    #: and `agent_id`; the Hermes `MiniSWERunner` instance). The harness
    #: never reads it, so the adapter may use it as its own per-run cache.
    adapter_state: dict[str, Any] = field(default_factory=dict)
@dataclass
class PhaseResult:
    """Outcome of one phase: the transcript increment plus adapter metadata."""

    messages: list[TranscriptMessage] = field(default_factory=list)

    #: Per-phase metadata reported by the adapter (token counts, session
    #: identifiers, etc.). Merged into `TaskRunResult` under the
    #: `efficiency_result` / adapter metadata fields where applicable.
    adapter_metadata: dict[str, Any] = field(default_factory=dict)

    #: True when the adapter saw the agent finish on its own (e.g. Hermes's
    #: `completed=True`). This is not a pass/fail signal; it only records
    #: whether the trajectory ran out of work or was cut short. The scorer
    #: uses it in `delivery_outcome` classification.
    completed_normally: bool = True

    #: Set when the phase aborted because of the adapter itself rather than
    #: the agent; holds the error message the harness surfaces.
    error: str | None = None
@dataclass
class StateQueryResult:
    """Verdict from resolving one `StateQuery` against the adapter's state.

    `capability_missing=True` signals that the adapter cannot evaluate this
    kind of query at all. The scorer treats that as neutral (neither pass
    nor fail) and records a skip note in the `CompletionResult`; under
    `--strict-compat` the harness will have filtered the task out before
    the adapter ever saw it.
    """

    ok: bool
    detail: str = ""
    capability_missing: bool = False
class AgentAdapter(ABC):
    """Base class every agent adapter derives from.

    A subclass is expected to:
    - give itself a unique `name: ClassVar[str]`;
    - declare `capabilities: ClassVar[set[AdapterCapability]]`, the
      state-query kinds it can resolve;
    - implement `setup`, `run_phase`, `verify_state_query`, `teardown`;
    - optionally override `__aenter__` / `__aexit__` when it holds a
      long-lived resource (a persistent websocket, a subprocess pool).
    """

    name: ClassVar[str] = ""
    capabilities: ClassVar[set[AdapterCapability]] = set()

    def __init__(self, config: AdapterConfig | None = None) -> None:
        # Fall back to a default config when none (or a falsy one) is given.
        self.config: AdapterConfig = config if config else AdapterConfig()

    # ------------------------------------------------------------------
    # Optional long-lived resource management (no-ops by default).
    # ------------------------------------------------------------------

    async def __aenter__(self) -> "AgentAdapter":
        return self

    async def __aexit__(self, exc_type: object, exc: object, tb: object) -> None:
        return None

    # ------------------------------------------------------------------
    # Required per-run lifecycle.
    # ------------------------------------------------------------------

    @abstractmethod
    async def setup(self, ctx: AdapterContext) -> None:
        """Prepare seed state and the agent session for one run.

        By the time this is called the harness has created the workspace
        directory and expanded `CanonicalAssets.workspace_files` into it.
        The adapter still has to:
        - apply `seed_state` entries through whatever mechanism suits it
          (OpenClaw memory RPCs; Hermes file writes);
        - start the agent's process/session so that `run_phase` can send
          turns immediately.
        """

    @abstractmethod
    async def run_phase(
        self,
        phase: CanonicalPhase,
        ctx: AdapterContext,
    ) -> PhaseResult:
        """Run a single `CanonicalPhase` through to its end.

        What to send, and when, is dictated by the simulated user in
        `phase.user`. The adapter delivers those turns, observes the
        agent's responses, and appends canonical `TranscriptMessage`
        entries to `ctx.transcript`.
        """

    @abstractmethod
    async def verify_state_query(
        self,
        query: StateQuery,
        ctx: AdapterContext,
    ) -> StateQueryResult:
        """Evaluate one `StateQuery` against the agent's post-run state.

        When `capabilities` does not cover `query.required_capability`,
        return `StateQueryResult(ok=False, capability_missing=True)`.
        """

    @abstractmethod
    async def teardown(self, ctx: AdapterContext) -> None:
        """Free agent-side state created during `setup`/`run_phase`.

        The workspace lifecycle belongs to the harness; sessions,
        subprocesses, and in-memory caches belong to the adapter.
        """

    # ------------------------------------------------------------------
    # Convenience helpers available to every adapter.
    # ------------------------------------------------------------------

    @classmethod
    def supported_capabilities(
        cls,
        config: AdapterConfig | None = None,
    ) -> set[AdapterCapability]:
        """Return the capabilities available under a concrete config.

        The default returns a copy of the class-level `capabilities` and
        ignores `config`. Adapters with multiple driver modes, such as
        Hermes MiniSWE vs full AIAgent, override this to keep task gating
        honest.
        """
        return set(cls.capabilities)

    @classmethod
    def missing_capabilities_for(
        cls,
        task: CanonicalTask,
        config: AdapterConfig | None = None,
    ) -> set[AdapterCapability]:
        """Return the capabilities `task` requires that this adapter lacks.

        An empty set means the task is fully runnable under this adapter.
        """
        required = set(task.required_adapter_capabilities)
        available = cls.supported_capabilities(config)
        return required - available

    @classmethod
    def supports(
        cls,
        task: CanonicalTask,
        config: AdapterConfig | None = None,
    ) -> bool:
        """True iff this adapter can cover every capability the task needs."""
        missing = cls.missing_capabilities_for(task, config)
        return not missing

View File

@ -0,0 +1,704 @@
"""Hermes adapter — drives Nous Research `hermes-agent`.
Hermes (https://github.com/NousResearch/hermes-agent) is a Python agent
framework with `MiniSWERunner` as its clean programmatic entry point.
This adapter:
1. Realizes the canonical workspace + seed state (seed_state entries
with `kind="memory"` become files, since Hermes has no memory RPC).
2. Constructs a `MiniSWERunner` scoped to the workspace.
3. For each canonical phase, renders the user turn and calls
`runner.run_task(prompt)` in a worker thread, with the phase's
timeout enforced as a wall clock.
4. Parses the returned `conversations` via
`clawbench.adapters.hermes_xml.parse_conversation` into a canonical
`Transcript` the scorer can consume unchanged.
5. For state queries the adapter can't resolve (session, cron, custom
gateway RPC), returns `capability_missing=True` so the harness
reports a clean skip. Memory queries fall back to workspace file
scanning via `environment_files.verify_memory_fallback`.
`hermes-agent` is an **optional** dependency (`clawbench[hermes]`). The
import is guarded so the base install stays lean; calling this adapter
without the dep installed raises a clear error rather than a cryptic
`ImportError`.
"""
from __future__ import annotations
import asyncio
import importlib.util
import json
import logging
import os
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from urllib.parse import urlparse
from clawbench.adapters import register_adapter
from clawbench.adapters.base import (
AdapterConfig,
AdapterContext,
AgentAdapter,
PhaseResult,
StateQueryResult,
)
from clawbench.adapters.hermes_xml import parse_chat_messages, parse_conversation
from clawbench.canonical import (
AdapterCapability,
CanonicalPhase,
StateQuery,
)
from clawbench.environment_files import verify_memory_fallback
from clawbench.render import render_template
from clawbench.schemas import MemoryState, PromptVariant
from clawbench.simulated_user import UserSimulator
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Optional dependency import — guarded so the base install stays lean.
# ---------------------------------------------------------------------------
def _load_mini_swe_runner() -> tuple[Any, Exception | None]:
    """Locate Hermes's `MiniSWERunner` class without a hard dependency.

    Resolution order:
    1. A regular `from mini_swe_runner import MiniSWERunner`.
    2. The file named by `HERMES_MINI_SWE_RUNNER`.
    3. `mini_swe_runner.py` under `HERMES_AGENT_REPO`, then
       `HERMES_INSTALL_DIR`.
    4. `<HERMES_HOME>/hermes-agent/mini_swe_runner.py`, where `HERMES_HOME`
       defaults to `~/.hermes`.

    A path candidate is executed under the private module name
    `_clawbench_hermes_mini_swe_runner`, with its directory placed at the
    front of `sys.path` for the duration of the load. When a candidate
    fails, the half-initialised module is removed from `sys.modules` and
    the `sys.path` entry added for it is removed again, so a broken
    candidate cannot shadow later imports.

    Returns:
        `(runner_class, None)` on success, otherwise `(None, last_error)`
        where `last_error` is the most recent exception encountered.
    """
    try:  # pragma: no cover - import-guard branch
        from mini_swe_runner import MiniSWERunner as runner_cls  # type: ignore[import-not-found]

        return runner_cls, None
    except Exception as import_exc:  # pragma: no cover - import-guard branch
        # Rebind: the `as` target is deleted when the handler exits.
        last_error: Exception = import_exc

    candidates: list[Path] = []
    explicit_file = os.environ.get("HERMES_MINI_SWE_RUNNER")
    if explicit_file:
        candidates.append(Path(explicit_file).expanduser())
    for env_name in ("HERMES_AGENT_REPO", "HERMES_INSTALL_DIR"):
        value = os.environ.get(env_name)
        if value:
            candidates.append(Path(value).expanduser() / "mini_swe_runner.py")
    hermes_home = Path(os.environ.get("HERMES_HOME", "~/.hermes")).expanduser()
    candidates.append(hermes_home / "hermes-agent" / "mini_swe_runner.py")

    module_name = "_clawbench_hermes_mini_swe_runner"
    for path in candidates:
        if not path.is_file():
            continue
        repo_root = str(path.parent)
        added_to_path = repo_root not in sys.path
        if added_to_path:
            # The runner file may import sibling modules from its own directory.
            sys.path.insert(0, repo_root)
        try:
            spec = importlib.util.spec_from_file_location(module_name, path)
            if spec is not None and spec.loader is not None:
                module = importlib.util.module_from_spec(spec)
                sys.modules[spec.name] = module
                spec.loader.exec_module(module)
                return module.MiniSWERunner, None
        except Exception as path_exc:
            last_error = path_exc
        # Only reached when this candidate did not yield a runner class:
        # undo the interpreter-global changes made on its behalf.
        sys.modules.pop(module_name, None)
        if added_to_path and repo_root in sys.path:
            sys.path.remove(repo_root)
    return None, last_error


MiniSWERunner, _HERMES_IMPORT_ERROR = _load_mini_swe_runner()
def _load_ai_agent() -> tuple[Any, Exception | None]:
    """Locate Hermes's `AIAgent` class without a hard dependency.

    Resolution order:
    1. A regular `from run_agent import AIAgent`.
    2. `run_agent.py` under `HERMES_AGENT_REPO`, then `HERMES_INSTALL_DIR`.
    3. `<HERMES_HOME>/hermes-agent/run_agent.py`, where `HERMES_HOME`
       defaults to `~/.hermes`.

    A path candidate is executed under the private module name
    `_clawbench_hermes_run_agent`, with its directory placed at the front
    of `sys.path` for the duration of the load. When a candidate fails, the
    half-initialised module is removed from `sys.modules` and the
    `sys.path` entry added for it is removed again, so a broken candidate
    cannot shadow later imports.

    Returns:
        `(agent_class, None)` on success, otherwise `(None, last_error)`
        where `last_error` is the most recent exception encountered.
    """
    try:  # pragma: no cover - import-guard branch
        from run_agent import AIAgent as agent_cls  # type: ignore[import-not-found]

        return agent_cls, None
    except Exception as import_exc:  # pragma: no cover - import-guard branch
        # Rebind: the `as` target is deleted when the handler exits.
        last_error: Exception = import_exc

    candidates: list[Path] = []
    for env_name in ("HERMES_AGENT_REPO", "HERMES_INSTALL_DIR"):
        value = os.environ.get(env_name)
        if value:
            candidates.append(Path(value).expanduser() / "run_agent.py")
    hermes_home = Path(os.environ.get("HERMES_HOME", "~/.hermes")).expanduser()
    candidates.append(hermes_home / "hermes-agent" / "run_agent.py")

    module_name = "_clawbench_hermes_run_agent"
    for path in candidates:
        if not path.is_file():
            continue
        repo_root = str(path.parent)
        added_to_path = repo_root not in sys.path
        if added_to_path:
            # The agent file may import sibling modules from its own directory.
            sys.path.insert(0, repo_root)
        try:
            spec = importlib.util.spec_from_file_location(module_name, path)
            if spec is not None and spec.loader is not None:
                module = importlib.util.module_from_spec(spec)
                sys.modules[spec.name] = module
                spec.loader.exec_module(module)
                return module.AIAgent, None
        except Exception as path_exc:
            last_error = path_exc
        # Only reached when this candidate did not yield an agent class:
        # undo the interpreter-global changes made on its behalf.
        sys.modules.pop(module_name, None)
        if added_to_path and repo_root in sys.path:
            sys.path.remove(repo_root)
    return None, last_error


AIAgent, _HERMES_AGENT_IMPORT_ERROR = _load_ai_agent()
class _CodexToolMessageCompatClient:
    """Wraps a client so Hermes's Codex Responses shim accepts tool results.

    The current Hermes MiniSWERunner feeds OpenAI chat-style `role="tool"`
    messages back into `chat.completions.create()`. Hermes's Codex
    Responses adapter accepts chat-shaped calls but currently forwards
    those tool messages to Responses as plain input items, where Codex
    rejects the unsupported role. Rewriting tool results as user-visible
    text preserves the important observation for the next turn and keeps
    the runner moving.

    Only `chat`, `api_key`, `base_url` and `close()` are exposed.
    """

    def __init__(self, inner: Any) -> None:
        self._inner = inner
        self.chat = _CodexToolMessageCompatChat(inner.chat)
        # Mirror the credentials of the wrapped client; None when it has none.
        for attr in ("api_key", "base_url"):
            setattr(self, attr, getattr(inner, attr, None))

    def close(self) -> None:
        """Close the wrapped client when it has a callable `close`."""
        closer = getattr(self._inner, "close", None)
        if not callable(closer):
            return
        closer()
class _CodexToolMessageCompatChat:
    """Stands in for `client.chat`; exposes only a wrapped `completions`."""

    def __init__(self, inner_chat: Any) -> None:
        wrapped = _CodexToolMessageCompatCompletions(inner_chat.completions)
        self.completions = wrapped
class _CodexToolMessageCompatCompletions:
    """Stands in for `client.chat.completions`; rewrites tool messages."""

    def __init__(self, inner_completions: Any) -> None:
        self._inner = inner_completions

    def create(self, **kwargs: Any) -> Any:
        """Forward to the wrapped `create`, rewriting each listed message.

        When `messages` is a list, every entry is passed through
        `_rewrite_codex_tool_message` in a copy of `kwargs`; the caller's
        list is left untouched. Any other `messages` value is forwarded
        unchanged.
        """
        outgoing = kwargs.get("messages")
        if not isinstance(outgoing, list):
            return self._inner.create(**kwargs)
        patched = {
            **kwargs,
            "messages": [_rewrite_codex_tool_message(item) for item in outgoing],
        }
        return self._inner.create(**patched)
def _rewrite_codex_tool_message(message: Any) -> Any:
    """Turn a chat `role="tool"` message into an equivalent user message.

    Anything that is not a dict with `role == "tool"` is returned as-is.
    For a tool message, the content (stringified when it is not already a
    string) is prefixed with a label taken from `tool_call_id`, then
    `name`, then the literal `"tool"`.
    """
    is_tool_message = isinstance(message, dict) and message.get("role") == "tool"
    if not is_tool_message:
        return message

    body = message.get("content", "")
    text = body if isinstance(body, str) else str(body)
    label = message.get("tool_call_id") or message.get("name") or "tool"
    return {
        "role": "user",
        "content": f"Tool result ({label}):\n{text}",
    }
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
@dataclass
class HermesAdapterConfig(AdapterConfig):
    """Settings for the Hermes adapter.

    Most fields are forwarded as keyword arguments to `MiniSWERunner` or
    `AIAgent`. ClawBench passes the canonical model string through so users
    pick Hermes-supported models via the existing `--model` flag.
    """

    # --- Runner / agent construction -------------------------------------
    env_type: str = "local"
    max_iterations: int = 15
    # Passed to the runner as `command_timeout`; multiplied by
    # `max_iterations` it is also the last-resort phase timeout.
    timeout_seconds: int = 60

    # --- Model endpoint ---------------------------------------------------
    # `base_url` / `api_key` are only passed through when `provider` is unset.
    base_url: str | None = None
    api_key: str | None = None
    provider: str | None = None
    api_mode: str | None = None

    # --- Driving behaviour -------------------------------------------------
    prompt_variant: str = PromptVariant.CLEAR.value
    # "ai_agent" selects the AIAgent driver; any other value uses MiniSWERunner.
    driver_mode: str = "mini_swe"
    enabled_toolsets: list[str] | None = None
    disabled_toolsets: list[str] | None = None
    # When unset, the adapter uses `<workspace>/.hermes`.
    hermes_home: str | None = None
    tool_delay_seconds: float = 0.0

    # --- Test seams --------------------------------------------------------
    # Optional explicit factories for the runner / agent. Tests plug in
    # stubs here; production code leaves them None and the adapter
    # instantiates the real class lazily.
    runner_factory: Any = None
    agent_factory: Any = None
@register_adapter
class HermesAdapter(AgentAdapter):
    """Adapter for the Nous Research hermes-agent.

    `HermesAdapterConfig.driver_mode` selects the driver: `"ai_agent"`
    drives an `AIAgent` through `run_conversation`; any other value (the
    default is `"mini_swe"`) drives a `MiniSWERunner` through `run_task`.
    """

    name = "hermes"
    capabilities = {
        AdapterCapability.FILES,
        AdapterCapability.EXECUTION,
    }

    @classmethod
    def supported_capabilities(cls, config: AdapterConfig | None = None) -> set[AdapterCapability]:
        """Return the capability set for `config`'s driver mode.

        The `ai_agent` driver advertises a wider surface than the class-level
        default used by every other mode.
        """
        if isinstance(config, HermesAdapterConfig) and config.driver_mode == "ai_agent":
            return {
                AdapterCapability.FILES,
                AdapterCapability.EXECUTION,
                AdapterCapability.MEMORY,
                AdapterCapability.CRON,
                AdapterCapability.BROWSER,
                AdapterCapability.MULTI_TURN_INJECTION,
            }
        return set(cls.capabilities)

    def __init__(self, config: HermesAdapterConfig | None = None) -> None:
        super().__init__(config or HermesAdapterConfig())
        self._config: HermesAdapterConfig = self.config  # type: ignore[assignment]

    # ------------------------------------------------------------------
    # Lifecycle.
    # ------------------------------------------------------------------

    async def setup(self, ctx: AdapterContext) -> None:
        """Realize memory seed state as files and build the runner.

        Hermes-in-`env_type=local` operates directly on the workspace
        filesystem, so memory `SeedEntry` entries are written out as
        `memory/<key>.md` files. Callers that want a different mapping
        can pre-populate the workspace before invoking the adapter.

        The driver object is stored in `ctx.adapter_state` under `"agent"`
        (ai_agent mode) or `"runner"` (any other mode).
        """
        for seed in ctx.task.assets.seed_state:
            if seed.kind == "memory" and seed.key:
                target = ctx.workspace / "memory" / f"{seed.key}.md"
                target.parent.mkdir(parents=True, exist_ok=True)
                content = seed.content or ""
                if not isinstance(content, str):
                    content = str(content)
                target.write_text(content, encoding="utf-8")

        if self._config.driver_mode == "ai_agent":
            agent = self._build_ai_agent(ctx)
            ctx.adapter_state["agent"] = agent
            ctx.adapter_state["conversation_history"] = []
            ctx.adapter_state["hermes_home"] = self._hermes_home(ctx)
        else:
            runner = self._build_runner(ctx)
            ctx.adapter_state["runner"] = runner
        ctx.adapter_state.setdefault("api_calls", 0)

    def _hermes_home(self, ctx: AdapterContext) -> Path:
        """Return the configured Hermes home, or `<workspace>/.hermes`."""
        configured = self._config.hermes_home
        if configured:
            return Path(configured).expanduser()
        return ctx.workspace / ".hermes"

    def _prepare_process_env(self, ctx: AdapterContext) -> None:
        """Point process-wide Hermes settings at this run's directories.

        NOTE: this mutates `os.environ` (and, if already imported, the
        `cron.jobs` module's path constants), so it affects the whole
        process rather than only this adapter instance.
        """
        hermes_home = self._hermes_home(ctx)
        hermes_home.mkdir(parents=True, exist_ok=True)
        os.environ["HERMES_HOME"] = str(hermes_home)
        os.environ["TERMINAL_CWD"] = str(ctx.workspace)
        os.environ.setdefault("TERMINAL_ENV", "local")
        cron_jobs = sys.modules.get("cron.jobs")
        if cron_jobs is not None:
            # The module was imported earlier, so its path constants are
            # re-pointed at this run's Hermes home by hand.
            cron_dir = hermes_home / "cron"
            setattr(cron_jobs, "HERMES_DIR", hermes_home)
            setattr(cron_jobs, "CRON_DIR", cron_dir)
            setattr(cron_jobs, "JOBS_FILE", cron_dir / "jobs.json")
            setattr(cron_jobs, "OUTPUT_DIR", cron_dir / "output")

    def _effective_model(self, ctx: AdapterContext) -> str:
        """Translate ClawBench provider-prefixed slugs for direct providers.

        With no explicit provider and a base URL whose host is
        `api.openai.com`, a leading `openai/` is stripped from the model
        name. In every other case `ctx.model` is returned unchanged.
        """
        model = ctx.model
        if self._config.provider:
            return model
        base_url = self._config.base_url or ""
        try:
            host = urlparse(base_url).hostname or ""
        except Exception:
            host = ""
        if host == "api.openai.com" and model.startswith("openai/"):
            return model.split("/", 1)[1]
        return model

    def _runtime_provider_hint(self) -> str | None:
        """Return the provider identity Hermes should expose to its runtime.

        Hermes distinguishes the transport used for the main model from the
        auxiliary routing metadata it exposes to side tasks. Direct
        OpenAI-compatible endpoints need to keep their explicit base URL and
        API key, but should still identify as ``custom`` so Hermes auxiliary
        calls resolve to the same primary model instead of falling through to
        auto-detected providers such as OpenRouter.
        """
        if self._config.provider:
            return self._config.provider
        if self._config.base_url:
            return "custom"
        return None

    def _phase_timeout(self, phase: CanonicalPhase, ctx: AdapterContext) -> float:
        """Return the wall-clock limit (seconds) for one Hermes call.

        Preference order: the phase's own timeout, the task budget, then
        `timeout_seconds * max_iterations` from the adapter config.
        """
        return float(
            phase.timeout_seconds
            or ctx.task.budgets.timeout_seconds
            or self._config.timeout_seconds * self._config.max_iterations
        )

    def _build_runner(self, ctx: AdapterContext) -> Any:
        """Build the `MiniSWERunner` (or the configured stub) for this run.

        Raises:
            RuntimeError: if `mini_swe_runner.py` could not be imported, or
                if provider routing was requested but Hermes's provider
                utilities are unavailable or resolve no credentials.
        """
        # An explicit provider takes over credential resolution entirely.
        explicit_api_key = None if self._config.provider else self._config.api_key
        explicit_base_url = None if self._config.provider else self._config.base_url
        effective_model = self._effective_model(ctx)
        ctx.adapter_state["effective_model"] = effective_model
        runner_kwargs: dict[str, Any] = {
            "model": effective_model,
            "env_type": self._config.env_type,
            "cwd": str(ctx.workspace),
            "max_iterations": self._config.max_iterations,
            "command_timeout": self._config.timeout_seconds,
            "base_url": explicit_base_url,
            "api_key": explicit_api_key,
        }
        if self._config.runner_factory is not None:
            return self._config.runner_factory(**runner_kwargs)
        if MiniSWERunner is None:  # pragma: no cover - import-guard branch
            raise RuntimeError(
                "HermesAdapter requires Hermes Agent's `mini_swe_runner.py`. "
                "Install Hermes with the official installer, or set "
                "`HERMES_AGENT_REPO=/path/to/hermes-agent` / "
                "`HERMES_MINI_SWE_RUNNER=/path/to/mini_swe_runner.py`. "
                f"Underlying import error: {_HERMES_IMPORT_ERROR!r}"
            )
        runner = MiniSWERunner(**runner_kwargs)
        if self._config.provider:
            try:
                from agent.auxiliary_client import resolve_provider_client
            except Exception as exc:  # pragma: no cover - optional Hermes internals
                raise RuntimeError(
                    f"Hermes provider routing requested for '{self._config.provider}', "
                    "but Hermes provider utilities could not be imported."
                ) from exc
            client, resolved_model = resolve_provider_client(
                self._config.provider,
                model=ctx.model,
            )
            if client is None or not resolved_model:
                raise RuntimeError(
                    f"Hermes provider '{self._config.provider}' did not resolve credentials."
                )
            if self._config.provider == "openai-codex":
                client = _CodexToolMessageCompatClient(client)
            runner.client = client
            runner.model = str(resolved_model)
        return runner

    def _build_ai_agent(self, ctx: AdapterContext) -> Any:
        """Build the Hermes `AIAgent` (or the configured stub) for this run.

        Raises:
            RuntimeError: if `run_agent.py` could not be imported.
        """
        self._prepare_process_env(ctx)
        # An explicit provider takes over credential resolution entirely.
        explicit_api_key = None if self._config.provider else self._config.api_key
        explicit_base_url = None if self._config.provider else self._config.base_url
        enabled_toolsets = self._config.enabled_toolsets or ["hermes-api-server"]
        effective_model = self._effective_model(ctx)
        provider_hint = self._runtime_provider_hint()
        ctx.adapter_state["effective_model"] = effective_model
        # Keyword arguments shared by the test factory and the real AIAgent.
        shared_kwargs: dict[str, Any] = {
            "model": effective_model,
            "base_url": explicit_base_url,
            "api_key": explicit_api_key,
            "provider": provider_hint,
            "api_mode": self._config.api_mode,
            "max_iterations": self._config.max_iterations,
            "enabled_toolsets": enabled_toolsets,
            "disabled_toolsets": self._config.disabled_toolsets,
        }
        if self._config.agent_factory is not None:
            return self._config.agent_factory(**shared_kwargs)
        if AIAgent is None:  # pragma: no cover - import-guard branch
            raise RuntimeError(
                "HermesAdapter full mode requires Hermes Agent's `run_agent.py`. "
                "Set `HERMES_AGENT_REPO=/path/to/hermes-agent` or install Hermes. "
                f"Underlying import error: {_HERMES_AGENT_IMPORT_ERROR!r}"
            )
        return AIAgent(
            **shared_kwargs,
            tool_delay=self._config.tool_delay_seconds,
            quiet_mode=True,
            verbose_logging=False,
            skip_context_files=True,
            session_id=f"clawbench-{ctx.task.id}-run{ctx.run_index}",
            platform="cli",
        )

    async def run_phase(
        self,
        phase: CanonicalPhase,
        ctx: AdapterContext,
    ) -> PhaseResult:
        """Render the phase's first user turn, invoke Hermes, parse output.

        In `ai_agent` mode this delegates to `_run_ai_agent_phase`, which
        delivers every simulated-user turn. In runner mode only the first
        turn of the phase is delivered; later turns are dropped. Timeouts
        and runner exceptions are reported through `PhaseResult.error`
        rather than raised.
        """
        if self._config.driver_mode == "ai_agent":
            return await self._run_ai_agent_phase(phase, ctx)
        runner = ctx.adapter_state.get("runner")
        if runner is None:
            return PhaseResult(
                error="HermesAdapter.run_phase called before setup(); no runner",
                completed_normally=False,
            )
        if not phase.user.turns:
            return PhaseResult(completed_normally=True)
        # Hermes cannot receive dynamic follow-ups; we render and send
        # only the first turn. Later turns remain in the canonical
        # phase description but are intentionally dropped here.
        first_turn = phase.user.turns[0]
        message = first_turn.variant_messages.get(
            self._config.prompt_variant, first_turn.message
        )
        prompt = render_template(message, ctx.runtime_values)
        phase_timeout = self._phase_timeout(phase, ctx)
        try:
            # run_task is blocking, so it runs in a worker thread while the
            # event loop enforces the wall-clock limit.
            result: dict[str, Any] = await asyncio.wait_for(
                asyncio.to_thread(runner.run_task, prompt),
                timeout=phase_timeout,
            )
        except asyncio.TimeoutError:
            return PhaseResult(
                error=f"Hermes phase '{phase.name}' exceeded {phase_timeout:.0f}s",
                completed_normally=False,
            )
        except Exception as exc:  # pragma: no cover - runner-internal error
            return PhaseResult(
                error=f"HermesAdapter runner error: {exc}",
                completed_normally=False,
            )
        phase_transcript = parse_conversation(result or {})
        ctx.transcript.messages.extend(phase_transcript.messages)
        api_calls = int(result.get("api_calls", 0)) if isinstance(result, dict) else 0
        ctx.adapter_state["api_calls"] = (
            int(ctx.adapter_state.get("api_calls", 0)) + api_calls
        )
        return PhaseResult(
            messages=phase_transcript.messages,
            adapter_metadata={
                "api_calls": api_calls,
                "hermes_metadata": result.get("metadata", {}) if isinstance(result, dict) else {},
            },
            completed_normally=bool(result.get("completed", False)) if isinstance(result, dict) else False,
        )

    async def _run_ai_agent_phase(
        self,
        phase: CanonicalPhase,
        ctx: AdapterContext,
    ) -> PhaseResult:
        """Drive every simulated-user turn of `phase` through the `AIAgent`.

        Each turn calls `agent.run_conversation` in a worker thread with the
        accumulated history; only the messages beyond that history are
        parsed and appended to the transcript. The timeout applies to each
        turn's call individually. A result that is not a dict is treated as
        an empty, not-completed turn instead of raising.
        """
        agent = ctx.adapter_state.get("agent")
        if agent is None:
            return PhaseResult(
                error="HermesAdapter.run_phase called before setup(); no AIAgent",
                completed_normally=False,
            )
        simulator = UserSimulator(
            phase.user,
            ctx.runtime_values,
            prompt_variant=self._config.prompt_variant,
        )
        phase_timeout = self._phase_timeout(phase, ctx)
        appended_messages: list = []
        phase_api_calls = 0
        completed = True
        while not simulator.is_done:
            user_message = await simulator.next_message(ctx.transcript)
            if user_message is None:
                break
            history = list(ctx.adapter_state.get("conversation_history") or [])
            try:
                result: dict[str, Any] = await asyncio.wait_for(
                    asyncio.to_thread(
                        agent.run_conversation,
                        user_message,
                        conversation_history=history or None,
                        task_id=f"{ctx.task.id}-run{ctx.run_index}",
                    ),
                    timeout=phase_timeout,
                )
            except asyncio.TimeoutError:
                return PhaseResult(
                    messages=appended_messages,
                    error=f"Hermes AIAgent phase '{phase.name}' exceeded {phase_timeout:.0f}s",
                    completed_normally=False,
                )
            except Exception as exc:  # pragma: no cover - agent-internal error
                return PhaseResult(
                    messages=appended_messages,
                    error=f"HermesAdapter AIAgent error: {exc}",
                    completed_normally=False,
                )
            # Normalise once so every field read below is guarded alike.
            payload: dict[str, Any] = result if isinstance(result, dict) else {}
            messages = payload.get("messages", [])
            if not isinstance(messages, list):
                messages = []
            # The agent returns the full conversation; keep only what is new.
            delta = messages[len(history):] if len(messages) >= len(history) else messages
            phase_transcript = parse_chat_messages(delta)
            ctx.transcript.messages.extend(phase_transcript.messages)
            appended_messages.extend(phase_transcript.messages)
            ctx.adapter_state["conversation_history"] = messages
            turn_api_calls = int(payload.get("api_calls", 0))
            phase_api_calls += turn_api_calls
            # Update the run total per turn so a timeout or error on a later
            # turn does not discard the calls already made.
            ctx.adapter_state["api_calls"] = (
                int(ctx.adapter_state.get("api_calls", 0)) + turn_api_calls
            )
            completed = completed and bool(payload.get("completed", False))
        return PhaseResult(
            messages=appended_messages,
            adapter_metadata={
                "api_calls": phase_api_calls,
                "driver_mode": "ai_agent",
            },
            completed_normally=completed,
        )

    async def verify_state_query(
        self,
        query: StateQuery,
        ctx: AdapterContext,
    ) -> StateQueryResult:
        """Resolve `query` against workspace and Hermes-home state.

        - `memory`: delegated to `verify_memory_fallback`, with text
          gathered from the Hermes home directory passed as extra input.
        - `session` (ai_agent mode only): passes unless the predicate is
          `absent` or an expected model is not a substring of `ctx.model`.
        - `cron` (ai_agent mode only): checked against the Hermes cron jobs
          file.
        - anything else: reported as `capability_missing`.
        """
        if query.kind == "memory":
            fallback_state = MemoryState(
                key_pattern=str(query.selector.get("key_pattern", "")),
                exists=query.predicate != "absent",
                value_contains=list(query.expected.get("value_contains", [])),
            )
            extra_memory_text = self._read_hermes_memory_text(ctx)
            ok, detail = verify_memory_fallback(
                fallback_state,
                ctx.workspace,
                transcript=ctx.transcript,
                extra_memory_text=extra_memory_text,
            )
            return StateQueryResult(ok=ok, detail=detail)
        if self._config.driver_mode == "ai_agent" and query.kind == "session":
            expected_model = str(query.expected.get("model") or "")
            if query.predicate == "absent":
                return StateQueryResult(ok=False, detail="Hermes AIAgent session exists")
            if expected_model and expected_model.lower() not in ctx.model.lower():
                return StateQueryResult(
                    ok=False,
                    detail=f"Model mismatch: expected {expected_model}, got {ctx.model}",
                )
            return StateQueryResult(ok=True, detail="OK")
        if self._config.driver_mode == "ai_agent" and query.kind == "cron":
            return self._verify_cron_file(query, ctx)
        # HermesAdapter does not currently expose session/cron/custom
        # gateway state. Flag as capability-missing so the scorer can
        # apply the neutral skip policy.
        return StateQueryResult(
            ok=False,
            detail=(
                f"HermesAdapter does not resolve '{query.kind}' state queries "
                f"(missing capability {query.required_capability.value})"
            ),
            capability_missing=True,
        )

    def _read_hermes_memory_text(self, ctx: AdapterContext) -> str:
        """Concatenate the text of memory files under the Hermes home.

        Looks at `memory`, `memories` and `user_memory`. A candidate that is
        a file is read whole; a directory is walked recursively for `.md`,
        `.txt` and `.json` files. Unreadable files are skipped.
        """
        hermes_home = Path(ctx.adapter_state.get("hermes_home") or self._hermes_home(ctx))
        candidates = [
            hermes_home / "memory",
            hermes_home / "memories",
            hermes_home / "user_memory",
        ]
        chunks: list[str] = []
        for candidate in candidates:
            if candidate.is_file():
                # Guarded like the directory branch: this is best-effort input.
                try:
                    chunks.append(candidate.read_text(encoding="utf-8", errors="replace"))
                except Exception:
                    continue
            elif candidate.is_dir():
                for path in candidate.rglob("*"):
                    if path.is_file() and path.suffix.lower() in {".md", ".txt", ".json"}:
                        try:
                            chunks.append(path.read_text(encoding="utf-8", errors="replace"))
                        except Exception:
                            continue
        return "\n".join(chunks)

    def _verify_cron_file(
        self,
        query: StateQuery,
        ctx: AdapterContext,
    ) -> StateQueryResult:
        """Check a cron `StateQuery` against `<hermes_home>/cron/jobs.json`.

        The file may hold either a list of jobs or an object with a `jobs`
        list; any other JSON value is treated as "no jobs". When the
        selector has `description_contains`, a job matches if that text
        occurs (case-insensitively) anywhere in its JSON serialization.
        """
        hermes_home = Path(ctx.adapter_state.get("hermes_home") or self._hermes_home(ctx))
        jobs_file = hermes_home / "cron" / "jobs.json"
        if not jobs_file.is_file():
            if query.predicate == "absent":
                return StateQueryResult(ok=True, detail="Correctly absent")
            return StateQueryResult(ok=False, detail=f"No Hermes cron jobs file at {jobs_file}")
        try:
            payload = json.loads(jobs_file.read_text(encoding="utf-8"))
        except Exception as exc:
            return StateQueryResult(ok=False, detail=f"Could not read Hermes cron jobs: {exc}")
        if isinstance(payload, list):
            jobs = payload
        elif isinstance(payload, dict):
            jobs = payload.get("jobs", [])
        else:
            # e.g. `null` or a bare scalar: valid JSON, but not a job list.
            jobs = []
        if not isinstance(jobs, list):
            jobs = []
        if query.predicate == "absent":
            return StateQueryResult(
                ok=not jobs,
                detail="Correctly absent" if not jobs else "Cron jobs exist",
            )
        description_contains = query.selector.get("description_contains")
        if not jobs:
            return StateQueryResult(ok=False, detail="No cron jobs found")
        if description_contains:
            needle = str(description_contains).lower()
            if not any(needle in json.dumps(job, sort_keys=True).lower() for job in jobs):
                return StateQueryResult(
                    ok=False,
                    detail=f"No cron job matched '{description_contains}'",
                )
        return StateQueryResult(ok=True, detail="OK")

    async def teardown(self, ctx: AdapterContext) -> None:
        """Drop the runner / agent references held in `ctx.adapter_state`."""
        ctx.adapter_state.pop("runner", None)
        ctx.adapter_state.pop("agent", None)
# Names exported by `from <this module> import *`.
__all__ = ["HermesAdapter", "HermesAdapterConfig"]

View File

@ -0,0 +1,494 @@
"""Hermes agent conversation → ClawBench `Transcript` converter.
Hermes's `MiniSWERunner.run_task()` returns a dict shaped like:
```json
{
"conversations": [
{"from": "system", "value": "..."},
{"from": "user", "value": "..."},
{"from": "assistant", "value": "I'll look at the file.\\n<tool_call>{\\"name\\":\\"bash\\",\\"arguments\\":{\\"cmd\\":\\"ls\\"}}</tool_call>"},
{"from": "tool", "value": "<tool_response>{\\"stdout\\":\\"file.py\\"}</tool_response>"},
{"from": "assistant", "value": "<tool_call>...</tool_call>"},
...
],
"completed": true,
"api_calls": 7,
"metadata": {...}
}
```
This module parses that into a canonical `Transcript` with
`TranscriptMessage` + `ToolCall` entries so the scorer / trajectory /
judge layers can score the run without any Hermes-specific knowledge.
The XML parsing is deliberately tolerant: Hermes transcripts observed
in the wild sometimes have malformed JSON inside `<tool_call>` tags
(trailing commas, unescaped newlines). We fall back to a permissive
regex extraction in that case so a single bad tool call doesn't tank
the whole transcript.
"""
from __future__ import annotations
import json
import re
from typing import Any, Iterable
from clawbench.schemas import ToolCall, Transcript, TranscriptMessage
#: Matches a single `<tool_call>…</tool_call>` element. The `body` group is
#: the inner text with surrounding whitespace trimmed; matching is
#: non-greedy and `.` spans newlines.
_TOOL_CALL_RE = re.compile(r"<tool_call>\s*(?P<body>.*?)\s*</tool_call>", flags=re.DOTALL)

#: Matches a single `<tool_response>…</tool_response>` element, with the
#: same `body` group semantics as `_TOOL_CALL_RE`.
_TOOL_RESPONSE_RE = re.compile(
    r"<tool_response>\s*(?P<body>.*?)\s*</tool_response>", flags=re.DOTALL
)
def _coerce_role(raw: str) -> str:
    """Map a Hermes role label onto the ClawBench role vocabulary.

    The label is stripped and lower-cased first. Known aliases collapse to
    `"assistant"`, `"user"`, `"tool"` or `"system"`; an unrecognized
    non-empty label is returned in its normalized form, and an empty or
    missing label becomes `"assistant"`.
    """
    aliases = {
        "assistant": "assistant",
        "gpt": "assistant",
        "model": "assistant",
        "user": "user",
        "human": "user",
        "tool": "tool",
        "function": "tool",
        "tool_response": "tool",
        "system": "system",
    }
    label = (raw or "").strip().lower()
    if label in aliases:
        return aliases[label]
    return label or "assistant"
def _extract_json_objects(text: str) -> list[dict[str, Any]]:
    """Parse zero or more top-level JSON objects from free-form text.

    The whole text is tried as JSON first: a dict is returned as a
    one-element list and a list is filtered down to its dict items. If that
    fails (or yields a scalar), the text is scanned for balanced ``{...}``
    spans and each span that parses to a dict is kept; spans that do not
    parse are discarded.

    Args:
        text: Raw text, typically the body of a `<tool_call>` or
            `<tool_response>` tag.

    Returns:
        The decoded objects in order of appearance; empty when no valid
        JSON object is present.
    """
    text = text.strip()
    if not text:
        return []
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        pass
    else:
        if isinstance(parsed, dict):
            return [parsed]
        if isinstance(parsed, list):
            return [item for item in parsed if isinstance(item, dict)]

    # Fallback: best-effort scan for balanced `{...}` blocks.
    results: list[dict[str, Any]] = []
    depth = 0
    start: int | None = None
    for i, ch in enumerate(text):
        if ch == "{":
            if depth == 0:
                start = i
            depth += 1
        elif ch == "}":
            if depth == 0:
                # Stray closer. Ignoring it keeps `depth` from going
                # negative, which would hide every later object.
                continue
            depth -= 1
            if depth == 0 and start is not None:
                candidate = text[start : i + 1]
                try:
                    obj = json.loads(candidate)
                except json.JSONDecodeError:
                    pass
                else:
                    if isinstance(obj, dict):
                        results.append(obj)
                start = None
    return results
def _tool_call_from_payload(
    payload: dict[str, Any],
    *,
    index: int,
    timestamp_ms: int,
) -> ToolCall:
    """Build a canonical `ToolCall` from a Hermes `<tool_call>` payload.

    The usual shape is `{"name": "...", "arguments": {...}}`. Slight
    variants are accepted too: `"function"` or `"tool"` for the tool name,
    and `"parameters"`, `"args"` or `"input"` for the arguments. The first
    truthy candidate wins.

    Arguments given as a string are decoded as JSON (or wrapped as
    `{"raw": ...}` when that fails); any other non-dict value is wrapped as
    `{"value": ...}`. The call id comes from `"id"` or `"call_id"`, falling
    back to `hermes-<index>`.
    """
    name_keys = ("name", "function", "tool")
    tool_name = next((payload.get(key) for key in name_keys if payload.get(key)), "")

    argument_keys = ("arguments", "parameters", "args", "input")
    call_args = next((payload.get(key) for key in argument_keys if payload.get(key)), {})

    if isinstance(call_args, str):
        # Arguments are sometimes delivered as a JSON-encoded string.
        try:
            call_args = json.loads(call_args)
        except json.JSONDecodeError:
            call_args = {"raw": call_args}
    if not isinstance(call_args, dict):
        call_args = {"value": call_args}

    identifier = payload.get("id") or payload.get("call_id") or f"hermes-{index}"
    return ToolCall(
        id=str(identifier),
        name=str(tool_name),
        input=call_args,
        timestamp_ms=timestamp_ms,
    )
def _tool_response_summary(payload: dict[str, Any]) -> tuple[str, str, bool | None]:
"""Extract (output, error, success) from a `<tool_response>` payload."""
output = ""
error = ""
success: bool | None = None
stdout = payload.get("stdout")
stderr = payload.get("stderr")
result = payload.get("result")
err = payload.get("error")
msg = payload.get("message")
status = payload.get("status")
if isinstance(stdout, str):
output = stdout
elif isinstance(result, (str, dict, list)):
output = result if isinstance(result, str) else json.dumps(result)
elif isinstance(msg, str):
output = msg
if isinstance(stderr, str) and stderr.strip():
error = stderr
elif isinstance(err, (str, dict, list)):
error = err if isinstance(err, str) else json.dumps(err)
if isinstance(status, str):
lowered = status.lower()
if lowered in {"ok", "success", "succeeded"}:
success = True
elif lowered in {"error", "failed", "failure"}:
success = False
if error and success is None:
success = False
if not error and output and success is None:
success = True
return output, error, success
def _split_tagged(text: str, tag_re: re.Pattern[str]) -> list[tuple[str, str]]:
    """Break `text` into ordered `(kind, body)` segments.

    `kind` is `"tag"` for each match of `tag_re` (with `body` set to the
    match's `body` group) and `"text"` for the non-empty stretches between,
    before and after the matches. Source order is preserved so tool calls
    can be threaded back in the order they appeared.
    """
    segments: list[tuple[str, str]] = []
    consumed = 0
    for hit in tag_re.finditer(text):
        begin, finish = hit.span()
        leading = text[consumed:begin]
        if leading:
            segments.append(("text", leading))
        segments.append(("tag", hit.group("body")))
        consumed = finish
    trailing = text[consumed:]
    if trailing:
        segments.append(("text", trailing))
    return segments
def parse_conversation(result: dict[str, Any]) -> Transcript:
    """Parse a `MiniSWERunner.run_task` result dict into a `Transcript`.

    Turns are processed in order. Tool calls found in an assistant turn are
    attached to that assistant message and queued; each tool response is
    paired with the oldest still-unanswered call (FIFO). A response that
    arrives with no call pending is kept as a standalone `tool` message.

    Tool-role content that is not wrapped in `<tool_response>` tags is
    treated as a single response body rather than being dropped.
    `<tool_call>` blocks inside non-assistant turns (for example a format
    example in a system prompt) are stripped from the text but never
    queued, because they are attached to no message and would otherwise
    absorb the responses that belong to real calls.

    Args:
        result: Runner result; only its `"conversations"` list is read.
            Non-dict entries are skipped.

    Returns:
        The populated `Transcript`. `timestamp_ms` on messages and calls
        carries the turn index, not a wall-clock time.
    """
    transcript = Transcript()
    conversations = result.get("conversations") or []
    pending_calls: list[ToolCall] = []
    call_counter = 0

    for turn_index, entry in enumerate(conversations):
        if not isinstance(entry, dict):
            continue
        role = _coerce_role(str(entry.get("from", "")))
        value = str(entry.get("value", "") or "")

        # Tool responses arrive from the tool/function role.
        if role == "tool":
            response_bodies = _TOOL_RESPONSE_RE.findall(value)
            if not response_bodies and value.strip():
                # Untagged tool output: keep it as one response body.
                response_bodies = [value.strip()]
            for response_body in response_bodies:
                payloads = _extract_json_objects(response_body)
                if not payloads:
                    payloads = [{"result": response_body}]
                for payload in payloads:
                    output, error, success = _tool_response_summary(payload)
                    if pending_calls:
                        target = pending_calls.pop(0)
                        target.output = output
                        target.error = error
                        if success is not None:
                            target.success = success
                    else:
                        # Orphan tool response: surface it as a tool
                        # message so nothing is silently dropped.
                        transcript.messages.append(
                            TranscriptMessage(
                                role="tool",
                                tool_result_content=output or error,
                                timestamp_ms=turn_index,
                            )
                        )
            continue

        # Any other role carries free-form text; only assistant turns
        # contribute tool calls.
        text_chunks: list[str] = []
        tool_calls: list[ToolCall] = []
        for kind, body in _split_tagged(value, _TOOL_CALL_RE):
            if kind == "text":
                text_chunks.append(body)
            elif role == "assistant":
                for payload in _extract_json_objects(body):
                    call_counter += 1
                    tool_call = _tool_call_from_payload(
                        payload,
                        index=call_counter,
                        timestamp_ms=turn_index,
                    )
                    tool_calls.append(tool_call)
                    pending_calls.append(tool_call)

        joined_text = "\n".join(chunk for chunk in text_chunks if chunk.strip()).strip()

        if role == "assistant":
            transcript.messages.append(
                TranscriptMessage(
                    role="assistant",
                    text=joined_text,
                    tool_calls=tool_calls,
                    timestamp_ms=turn_index,
                )
            )
        elif role == "user" or joined_text:
            # User turns are always recorded; system and unrecognized
            # roles only when they carry text.
            transcript.messages.append(
                TranscriptMessage(
                    role=role,
                    text=joined_text,
                    timestamp_ms=turn_index,
                )
            )
    return transcript
def _content_to_text(content: Any) -> str:
"""Normalize OpenAI/Anthropic-style message content to plain text."""
if content is None:
return ""
if isinstance(content, str):
return content
if isinstance(content, list):
parts: list[str] = []
for part in content:
if isinstance(part, str):
parts.append(part)
elif isinstance(part, dict):
if isinstance(part.get("text"), str):
parts.append(part["text"])
elif isinstance(part.get("content"), str):
parts.append(part["content"])
return "\n".join(parts)
if isinstance(content, dict):
if isinstance(content.get("text"), str):
return content["text"]
if isinstance(content.get("content"), str):
return content["content"]
return str(content)
def _tool_call_from_chat_payload(
    payload: dict[str, Any],
    *,
    index: int,
    timestamp_ms: int,
) -> ToolCall:
    """Build a canonical `ToolCall` from a chat-completions tool-call entry.

    The nested `"function"` object is preferred for the name and arguments;
    otherwise top-level `"name"`/`"tool"`/`"type"` and
    `"arguments"`/`"args"`/`"input"` are consulted, first truthy value
    wins. String arguments are decoded as JSON (or wrapped as
    `{"raw": ...}` on failure); other non-dict values are wrapped as
    `{"value": ...}`. The id comes from `"id"` or `"call_id"`, falling back
    to `hermes-chat-<index>`.
    """
    fn_spec = payload.get("function")
    if not isinstance(fn_spec, dict):
        fn_spec = {}

    name_candidates = (
        fn_spec.get("name"),
        payload.get("name"),
        payload.get("tool"),
        payload.get("type"),
    )
    tool_name = next((item for item in name_candidates if item), "")

    argument_candidates = (
        fn_spec.get("arguments"),
        payload.get("arguments"),
        payload.get("args"),
        payload.get("input"),
    )
    call_args = next((item for item in argument_candidates if item), {})

    if isinstance(call_args, str):
        try:
            call_args = json.loads(call_args)
        except json.JSONDecodeError:
            call_args = {"raw": call_args}
    if not isinstance(call_args, dict):
        call_args = {"value": call_args}

    identifier = payload.get("id") or payload.get("call_id") or f"hermes-chat-{index}"
    return ToolCall(
        id=str(identifier),
        name=str(tool_name),
        input=call_args,
        timestamp_ms=timestamp_ms,
    )
def parse_chat_messages(messages: Iterable[dict[str, Any]]) -> Transcript:
    """Parse Hermes AIAgent/OpenAI-style message history into a `Transcript`.

    Ordering is preserved. A tool-role entry is attached to the `ToolCall`
    whose id matches its `tool_call_id` (or `id`); when there is no match
    it is attached to the oldest call that has not been answered yet. A
    tool entry with text and no call to attach to is kept as a standalone
    `tool` message.

    A call answered by id is also removed from the positional queue, so a
    later id-less tool message cannot overwrite an already-answered call.

    Args:
        messages: Message dicts with `role`/`content` (or `from`/`value`)
            and an optional `tool_calls` list. Non-dict entries are skipped.

    Returns:
        The populated `Transcript`. `timestamp_ms` carries the message
        index, not a wall-clock time.
    """
    transcript = Transcript()
    pending_by_id: dict[str, ToolCall] = {}
    pending_order: list[ToolCall] = []
    call_counter = 0

    for turn_index, entry in enumerate(messages):
        if not isinstance(entry, dict):
            continue
        role = _coerce_role(str(entry.get("role") or entry.get("from") or ""))
        text = _content_to_text(entry.get("content", entry.get("value", "")))

        if role == "tool":
            tool_call_id = str(entry.get("tool_call_id") or entry.get("id") or "")
            target = pending_by_id.get(tool_call_id) if tool_call_id else None
            if target is not None:
                # Answered by id: drop it from the positional queue (by
                # identity) so it is not handed out a second time.
                pending_order = [call for call in pending_order if call is not target]
            elif pending_order:
                target = pending_order.pop(0)

            if target is not None:
                target.output = text
                target.success = not _looks_like_error(text)
                if not target.success:
                    target.error = text
            elif text:
                transcript.messages.append(
                    TranscriptMessage(
                        role="tool",
                        tool_result_for=tool_call_id or None,
                        tool_result_content=text,
                        timestamp_ms=turn_index,
                    )
                )
            continue

        tool_calls: list[ToolCall] = []
        raw_calls = entry.get("tool_calls") or []
        if isinstance(raw_calls, list):
            for payload in raw_calls:
                if not isinstance(payload, dict):
                    continue
                call_counter += 1
                call = _tool_call_from_chat_payload(
                    payload,
                    index=call_counter,
                    timestamp_ms=turn_index,
                )
                tool_calls.append(call)
                pending_by_id[call.id] = call
                pending_order.append(call)

        if role == "assistant":
            # Assistant turns are always recorded, even with empty text,
            # because they may carry only tool calls.
            transcript.messages.append(
                TranscriptMessage(
                    role="assistant",
                    text=text,
                    tool_calls=tool_calls,
                    timestamp_ms=turn_index,
                )
            )
        elif text:
            transcript.messages.append(
                TranscriptMessage(
                    role=role,
                    text=text,
                    timestamp_ms=turn_index,
                )
            )
    return transcript
def _looks_like_error(text: str) -> bool:
    """Return True when `text` contains a common failure keyword.

    This is a case-insensitive substring check for "error", "traceback",
    "failed" or "exception", so benign output that merely mentions one of
    those words also counts.
    """
    haystack = text.lower()
    for marker in ("error", "traceback", "failed", "exception"):
        if marker in haystack:
            return True
    return False
def iter_tool_calls_from_conversations(conversations: Iterable[dict[str, Any]]) -> list[ToolCall]:
    """Return just the tool-call sequence of a Hermes conversation list.

    Convenience wrapper used by tests: it wraps `conversations` in the
    `{"conversations": [...]}` result shape, runs `parse_conversation` on
    it, and returns the transcript's `tool_call_sequence`. Handy for
    asserting on call order and arguments without the surrounding messages.
    """
    wrapped_result = {"conversations": list(conversations)}
    transcript = parse_conversation(wrapped_result)
    return transcript.tool_call_sequence
# Public API of this module; the underscore-prefixed helpers above are
# intentionally not exported.
__all__ = [
"iter_tool_calls_from_conversations",
"parse_chat_messages",
"parse_conversation",
]

View File

@ -0,0 +1,472 @@
"""OpenClaw adapter — drives tasks through an OpenClaw gateway.
This is the adapter-shaped wrapper around the agent execution flow that
has lived inside `BenchmarkHarness._run_single` until now. It holds a
`GatewayClient` open for the run's duration, creates one agent per run
and one session per phase (matching the existing behavior), delivers
simulated-user turns, and resolves `StateQuery` assertions against the
gateway's `memory.search` / `sessions.resolve` / `cron.list` / arbitrary
`_rpc(method)` surface.
The benchmark harness now routes OpenClaw through this adapter, matching
the same canonical task/run lifecycle used by other harness adapters.
"""
from __future__ import annotations
import json
import logging
import uuid
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from clawbench.adapters import register_adapter
from clawbench.adapters.base import (
AdapterConfig,
AdapterContext,
AgentAdapter,
PhaseResult,
StateQueryResult,
)
from clawbench.canonical import (
AdapterCapability,
CanonicalPhase,
StateQuery,
)
from clawbench.client import GatewayClient, GatewayConfig
from clawbench.environment_files import (
memory_visible_in_transcript,
resolve_json_path,
verify_memory_fallback,
)
from clawbench.schemas import (
CronState,
MemoryState,
PromptVariant,
SessionState,
Transcript,
)
from clawbench.session_labels import unique_session_label
from clawbench.simulated_user import UserSimulator
# Module-level logger; used below for best-effort teardown warnings and
# for the memory-verification fallback notice.
logger = logging.getLogger(__name__)
@dataclass
class OpenClawAdapterConfig(AdapterConfig):
    """Settings for `OpenClawAdapter`.

    `gateway` carries the connection parameters used to reach the OpenClaw
    gateway, and `prompt_variant` selects which wording of each
    simulated-user turn is rendered.
    """

    # Gateway connection parameters. When left as None the adapter falls
    # back to a default-constructed `GatewayConfig()`.
    gateway: GatewayConfig | None = None

    # Prompt wording variant handed to the user simulator.
    prompt_variant: str = PromptVariant.CLEAR.value

    # Ceiling, in seconds, on the per-turn timeout passed to
    # `send_and_wait`: `run_phase` uses the smaller of this value and the
    # phase (or task budget) timeout.
    turn_timeout_seconds: float = 180.0
@register_adapter
class OpenClawAdapter(AgentAdapter):
    """Agent adapter that runs benchmark tasks through an OpenClaw gateway.

    The adapter is an async context manager that keeps one `GatewayClient`
    open for its lifetime. `setup` creates one agent per run, `run_phase`
    creates one session per phase and delivers the simulated-user turns,
    and `verify_state_query` resolves memory/session/cron/custom queries
    through gateway RPCs.
    """

    name = "openclaw"
    capabilities = {
        AdapterCapability.FILES,
        AdapterCapability.EXECUTION,
        AdapterCapability.MEMORY,
        AdapterCapability.SESSION,
        AdapterCapability.CRON,
        AdapterCapability.BROWSER,
        AdapterCapability.GATEWAY_RPC,
        AdapterCapability.MULTI_TURN_INJECTION,
    }

    def __init__(self, config: OpenClawAdapterConfig | None = None) -> None:
        """Store the config and prepare, but do not open, the gateway client."""
        super().__init__(config or OpenClawAdapterConfig())
        self._config: OpenClawAdapterConfig = self.config  # type: ignore[assignment]
        self._gateway_config: GatewayConfig = self._config.gateway or GatewayConfig()
        self._client: GatewayClient | None = None
        # Test seam: replace this attribute to inject a stub gateway
        # client without touching the class definition.
        self._client_factory = lambda: GatewayClient(self._gateway_config)

    # ------------------------------------------------------------------
    # Long-lived gateway connection.
    # ------------------------------------------------------------------
    async def __aenter__(self) -> "OpenClawAdapter":
        """Open the gateway client and keep it for the adapter's lifetime."""
        opened = self._client_factory()
        await opened.__aenter__()
        self._client = opened
        return self

    async def __aexit__(self, exc_type: object, exc: object, tb: object) -> None:
        """Close the gateway client (if any) and forget it, even on error."""
        active = self._client
        if active is None:
            return
        try:
            await active.__aexit__(exc_type, exc, tb)
        finally:
            self._client = None

    @property
    def client(self) -> GatewayClient:
        """The open gateway client.

        Raises:
            RuntimeError: If the adapter has not been entered as an async
                context manager.
        """
        if self._client is not None:
            return self._client
        raise RuntimeError(
            "OpenClawAdapter must be used as an async context manager "
            "before calling setup/run_phase/teardown."
        )

    # ------------------------------------------------------------------
    # Lifecycle.
    # ------------------------------------------------------------------
    async def setup(self, ctx: AdapterContext) -> None:
        """Create the per-run agent and evaluate the pre-run state queries."""
        self._realize_memory_seeds(ctx)

        agent_name = (
            f"clawbench-{ctx.task.id}-run-{ctx.run_index}-{uuid.uuid4().hex[:6]}"
        )
        created_agent_id = await self.client.create_agent(
            name=agent_name, workspace=str(ctx.workspace)
        )
        ctx.adapter_state["agent_id"] = created_agent_id
        ctx.adapter_state.setdefault("session_keys", [])

        # Pre-run gateway assertions (ex-`setup.pre_check_gateway`) are
        # evaluated right away. Failures are not raised; they are recorded
        # under `ctx.adapter_state["pre_run_failures"]` so the harness can
        # fail fast before doing any phase work.
        problems: list[str] = []
        for query in ctx.task.verifier.pre_run_queries:
            outcome = await self.verify_state_query(query, ctx)
            if outcome.ok:
                continue
            problems.append(outcome.detail or query.description)
        if problems:
            ctx.adapter_state["pre_run_failures"] = problems

    def _realize_memory_seeds(self, ctx: AdapterContext) -> None:
        """Materialize canonical memory seeds as files in the run workspace.

        Each seed of kind `"memory"` with a non-empty key is written to
        `<workspace>/memory/<sanitized key>.md`, and all of them are also
        concatenated into `<workspace>/MEMORY.md`. Nothing is written when
        the task has no such seeds.

        Per the original author's note, the benchmark client exposes no
        public seed/write RPC for OpenClaw's native memory backend, while
        agents can read workspace files and the verifier already falls
        back to the same files.
        """
        rendered: list[str] = []
        memory_dir = ctx.workspace / "memory"
        for seed in ctx.task.assets.seed_state:
            if seed.kind != "memory" or not seed.key:
                continue

            raw_content = seed.content or ""
            seed_text = raw_content if isinstance(raw_content, str) else str(raw_content)

            # Keep only filename-safe characters from the key.
            file_stem = "".join(
                ch if ch.isalnum() or ch in ("-", "_") else "_"
                for ch in seed.key.strip()
            ).strip("_")
            file_stem = file_stem or "seed"

            body = f"# {seed.key}\n\n{seed_text.strip()}\n"
            memory_dir.mkdir(parents=True, exist_ok=True)
            (memory_dir / f"{file_stem}.md").write_text(body, encoding="utf-8")
            rendered.append(body)

        if rendered:
            (ctx.workspace / "MEMORY.md").write_text("\n".join(rendered), encoding="utf-8")

    async def run_phase(
        self,
        phase: CanonicalPhase,
        ctx: AdapterContext,
    ) -> PhaseResult:
        """Open a session for `phase`, drive the simulated user, collect messages.

        Returns a `PhaseResult` with `completed_normally=False` when
        `setup()` has not created an agent, or when a browser task finds no
        browser tool in the session. Otherwise every gateway reply is
        appended to both `ctx.transcript` and the returned result.
        """
        agent_id = ctx.adapter_state.get("agent_id")
        if not agent_id:
            return PhaseResult(
                error="OpenClawAdapter.run_phase called before setup(); no agent_id",
                completed_normally=False,
            )

        session_keys: list[str] = ctx.adapter_state.setdefault("session_keys", [])
        session_key = await self.client.create_session(
            model=ctx.model,
            agent_id=agent_id,
            label=unique_session_label(
                f"clawbench-{ctx.task.id}-run{ctx.run_index}-phase{phase.name}"
            ),
        )
        session_keys.append(session_key)
        ctx.adapter_state["last_session_key"] = session_key
        await self.client.subscribe(session_key)

        # Browser tasks need the browser tool in this session's effective
        # tool set. Fail the phase immediately when it is missing rather
        # than letting the agent flounder against a missing tool.
        if ctx.task.family.value == "browser":
            try:
                await self._assert_browser_support(session_key)
            except Exception as exc:
                return PhaseResult(
                    error=str(exc),
                    completed_normally=False,
                )

        simulator = UserSimulator(
            phase.user,
            ctx.runtime_values,
            prompt_variant=self._config.prompt_variant,
        )

        # Phase timeout (or the task budget) capped by the adapter config.
        requested_timeout = float(phase.timeout_seconds or ctx.task.budgets.timeout_seconds)
        turn_timeout = min(requested_timeout, self._config.turn_timeout_seconds)

        appended: list = []
        turns_sent = 0
        while not simulator.is_done:
            user_message = await simulator.next_message(ctx.transcript)
            if user_message is None:
                break
            reply = await self.client.send_and_wait(
                session_key,
                user_message,
                timeout=turn_timeout,
            )
            ctx.transcript.messages.extend(reply.messages)
            appended.extend(reply.messages)
            turns_sent += 1

        return PhaseResult(
            messages=appended,
            adapter_metadata={
                "session_key": session_key,
                "turns_sent": turns_sent,
            },
        )

    async def _assert_browser_support(self, session_key: str) -> None:
        """Raise `RuntimeError` unless the session's tool inventory lists `browser`."""
        inventory = await self.client.get_effective_tools(session_key)
        tool_ids: set[str] = set()
        for group in inventory.get("groups", []):
            for tool in group.get("tools", []):
                tool_ids.add(str(tool.get("id", "")))
        if "browser" in tool_ids:
            return
        raise RuntimeError(
            "Browser tasks require the browser tool, but it is not available in this gateway."
        )

    async def teardown(self, ctx: AdapterContext) -> None:
        """Delete the per-phase sessions and the per-run agent (best effort).

        Does nothing when the gateway client is no longer open. Individual
        deletion failures are logged as warnings and do not stop the rest
        of the cleanup. The agent is deleted with `delete_files=False`.
        """
        gateway = self._client
        if gateway is None:
            return

        session_keys: list[str] = ctx.adapter_state.get("session_keys", [])
        agent_id: str | None = ctx.adapter_state.get("agent_id")

        for key in session_keys:
            try:
                await gateway.delete_session(key)
            except Exception as exc:  # pragma: no cover - best effort
                logger.warning("delete_session failed for %s: %s", key, exc)

        if not agent_id:
            return
        try:
            await gateway.delete_agent(agent_id, delete_files=False)
        except Exception as exc:  # pragma: no cover - best effort
            logger.warning("delete_agent failed for %s: %s", agent_id, exc)

    # ------------------------------------------------------------------
    # State query resolution.
    # ------------------------------------------------------------------
    async def verify_state_query(
        self,
        query: StateQuery,
        ctx: AdapterContext,
    ) -> StateQueryResult:
        """Dispatch `query` to the handler for its kind.

        Any exception raised by a handler is turned into a failed result
        carrying the exception text. An unsupported kind yields a failed
        result flagged `capability_missing=True`.
        """
        kind = query.kind
        try:
            if kind == "memory":
                return await self._verify_memory(query, ctx)
            if kind == "session":
                return await self._verify_session(query, ctx)
            if kind == "cron":
                return await self._verify_cron(query, ctx)
            if kind == "custom":
                return await self._verify_gateway(query, ctx)
        except Exception as exc:
            return StateQueryResult(ok=False, detail=str(exc))

        return StateQueryResult(
            ok=False,
            detail=f"OpenClawAdapter has no handler for query kind '{query.kind}'",
            capability_missing=True,
        )

    # --- memory ---
    async def _verify_memory(
        self, query: StateQuery, ctx: AdapterContext
    ) -> StateQueryResult:
        """Check a memory query via `memory.search`, with a file-based fallback.

        When the RPC path raises for any reason, verification falls back to
        `verify_memory_fallback` over the workspace, the transcript, and
        whatever agent memory text can be read from the gateway.
        """
        key_pattern = str(query.selector.get("key_pattern", ""))
        value_contains = list(query.expected.get("value_contains", []))
        session_key = ctx.adapter_state.get("last_session_key", "")
        agent_id = ctx.adapter_state.get("agent_id")
        expect_absent = query.predicate == "absent"

        # Primary path: memory.search RPC.
        try:
            response = await self.client._rpc(
                "memory.search",
                {
                    "query": key_pattern,
                    "sessionKey": session_key,
                    "limit": 20,
                },
            )
            entries = response.get("payload", {}).get("entries", [])
            if expect_absent:
                nothing_found = not entries
                return StateQueryResult(
                    ok=nothing_found,
                    detail="Correctly absent" if nothing_found else "Memory entry exists",
                )
            if not entries:
                return StateQueryResult(ok=False, detail="No matching memory entries found")

            haystack = " ".join(str(entry.get("value", "")) for entry in entries).lower()
            missing = next(
                (token for token in value_contains if token.lower() not in haystack),
                None,
            )
            if missing is not None:
                return StateQueryResult(
                    ok=False, detail=f"Memory value missing '{missing}'"
                )
            return StateQueryResult(ok=True, detail="OK")
        except Exception as exc:
            logger.info(
                "memory.search unavailable for verification, falling back: %s",
                exc,
            )

        # Fallback: gateway-sourced memory files + workspace scan + transcript.
        fallback_state = MemoryState(
            key_pattern=key_pattern,
            exists=not expect_absent,
            value_contains=value_contains,
        )
        extra_memory_text = ""
        if agent_id:
            try:
                from clawbench.environment import _read_agent_memory_text  # local import to avoid cycle

                extra_memory_text = await _read_agent_memory_text(self.client, agent_id)
            except Exception:
                extra_memory_text = ""

        ok, detail = verify_memory_fallback(
            fallback_state,
            ctx.workspace,
            transcript=ctx.transcript,
            extra_memory_text=extra_memory_text,
        )
        return StateQueryResult(ok=ok, detail=detail)

    # --- session ---
    async def _verify_session(
        self, query: StateQuery, ctx: AdapterContext
    ) -> StateQueryResult:
        """Check the most recent session via `sessions.resolve`.

        A successful resolve fails an `absent` query and, when
        `expected["model"]` is set, requires it to appear (case-insensitive)
        in the resolved model. A failing RPC satisfies an `absent` query and
        fails any other.
        """
        session_key = ctx.adapter_state.get("last_session_key", "")
        expected_model = query.expected.get("model") or ""
        expect_absent = query.predicate == "absent"
        try:
            response = await self.client._rpc("sessions.resolve", {"key": session_key})
            payload = response.get("payload", {})
            if expect_absent:
                return StateQueryResult(ok=False, detail="Session exists but should not")
            if expected_model:
                actual_model = str(payload.get("model", ""))
                if str(expected_model).lower() not in actual_model.lower():
                    return StateQueryResult(
                        ok=False,
                        detail=f"Model mismatch: expected {expected_model}, got {actual_model}",
                    )
            return StateQueryResult(ok=True, detail="OK")
        except Exception as exc:
            if expect_absent:
                return StateQueryResult(ok=True, detail="Correctly absent")
            return StateQueryResult(ok=False, detail=str(exc))

    # --- cron ---
    async def _verify_cron(
        self, query: StateQuery, ctx: AdapterContext
    ) -> StateQueryResult:
        """Check cron state via `cron.list`.

        `absent` passes only when no jobs exist. Otherwise at least one job
        must exist and, if `selector["description_contains"]` is set, its
        text must appear (case-insensitive) in the JSON dump of some job.
        """
        description_contains = query.selector.get("description_contains")
        try:
            response = await self.client._rpc("cron.list", {})
            jobs = response.get("payload", {}).get("jobs", [])
            if query.predicate == "absent":
                no_jobs = not jobs
                return StateQueryResult(
                    ok=no_jobs,
                    detail="Correctly absent" if no_jobs else "Cron jobs exist",
                )
            if not jobs:
                return StateQueryResult(ok=False, detail="No cron jobs found")
            if description_contains:
                needle = str(description_contains).lower()
                if not any(needle in json.dumps(job).lower() for job in jobs):
                    return StateQueryResult(
                        ok=False,
                        detail=f"No cron job matched '{description_contains}'",
                    )
            return StateQueryResult(ok=True, detail="OK")
        except Exception as exc:
            return StateQueryResult(ok=False, detail=str(exc))

    # --- arbitrary gateway RPC ---
    async def _verify_gateway(
        self, query: StateQuery, ctx: AdapterContext
    ) -> StateQueryResult:
        """Run an arbitrary gateway RPC and assert on one path of its payload.

        The selector supplies `method`, `params` and `assert_path` (default
        `"$"`). `expected` may hold `exists` (default True), `equals` and
        `contains`; `contains` is a case-insensitive substring test on the
        stringified value.
        """
        method = str(query.selector.get("method", ""))
        params = dict(query.selector.get("params", {}))
        assert_path = str(query.selector.get("assert_path", "$"))
        expected_equals = query.expected.get("equals")
        expected_contains = query.expected.get("contains")
        expected_exists = bool(query.expected.get("exists", True))
        try:
            response = await self.client._rpc(method, params)
            value = resolve_json_path(response.get("payload", {}), assert_path)

            if not expected_exists:
                path_missing = value is None
                return StateQueryResult(
                    ok=path_missing,
                    detail="Correctly absent" if path_missing else "Path exists",
                )
            if value is None:
                return StateQueryResult(
                    ok=False, detail=f"Path {assert_path} not found"
                )
            if expected_equals is not None and value != expected_equals:
                return StateQueryResult(
                    ok=False, detail=f"Expected {expected_equals}, got {value}"
                )
            if expected_contains is not None:
                if str(expected_contains).lower() not in str(value).lower():
                    return StateQueryResult(
                        ok=False,
                        detail=f"Expected '{expected_contains}' in {value}",
                    )
            return StateQueryResult(ok=True, detail="OK")
        except Exception as exc:
            return StateQueryResult(ok=False, detail=str(exc))
# Names exported by `from <this module> import *`.
__all__ = ["OpenClawAdapter", "OpenClawAdapterConfig"]

View File

@ -0,0 +1,45 @@
"""Canonical task schema — agent-agnostic intent layer.
Part of ClawBench Phase-4 per CLAWBENCH_V0_4_SPEC.md §"Canonical Task Schema".
Splits canonical task intent (what to set up, prompt with, and verify) from
OpenClaw-specific execution details (which become adapter responsibilities).
The existing `TaskDefinition` in `clawbench/schemas.py` stays as-is for
back-compat; this package adds a canonical view produced by
`convert.from_task_definition`, which is the single bridge between the two
shapes. Everything downstream of the harness (scorer, trajectory, judge,
stats) is already agent-agnostic: those modules consume the transcript +
TaskRunResult and do not need changes.
"""
from clawbench.canonical.schema import (
AdapterCapability,
BudgetSpec,
CanonicalAssets,
CanonicalPhase,
CanonicalTask,
Deliverable,
InteractionPolicy,
SeedEntry,
StateQuery,
StateQueryKind,
StateQueryPredicate,
VerifierContract,
)
from clawbench.canonical.convert import from_task_definition
# Public surface of the `clawbench.canonical` package: the schema types
# re-exported from `clawbench.canonical.schema` plus the
# `from_task_definition` converter.
__all__ = [
"AdapterCapability",
"BudgetSpec",
"CanonicalAssets",
"CanonicalPhase",
"CanonicalTask",
"Deliverable",
"InteractionPolicy",
"SeedEntry",
"StateQuery",
"StateQueryKind",
"StateQueryPredicate",
"VerifierContract",
"from_task_definition",
]

View File

@ -0,0 +1,328 @@
"""Convert `TaskDefinition` → `CanonicalTask`.
This is the single bridge between the existing OpenClaw-entangled task
format (`clawbench.schemas.TaskDefinition`) and the agent-agnostic
canonical form (`CanonicalTask`). Callers load tasks as usual via
`clawbench.tasks.load_all_tasks` and then call
`from_task_definition(task)` to get the canonical view.
Field mappings (any field not mentioned is copied verbatim):
- `setup.asset_packs` → `assets.seed_state` (kind="file", asset_pack=...)
- `setup.workspace_files` → `assets.workspace_files`
- `setup.background_services` → `assets.background_services`
- `setup.memory_seed` → `assets.seed_state` (kind="memory")
- `setup.pre_check_gateway` → `verifier.pre_run_queries` (GATEWAY_RPC)
- `completion.files` → `verifier.file_states`
- `completion.execution_checks` → `verifier.execution_checks`
- `completion.memory` → `verifier.state_queries` (MEMORY)
- `completion.session` → `verifier.state_queries` (SESSION)
- `completion.cron` → `verifier.state_queries` (CRON)
- `completion.gateway_assertions` → `verifier.state_queries` (GATEWAY_RPC)
- `trajectory` → `verifier.trajectory`
- `behavior` → `verifier.behavior`
- `judge` → `verifier.judge`
- `user` / `phases` → `phases` via `task.normalized_phases()`
- `timeout_seconds` → `budgets.timeout_seconds` (also on each phase)
`required_adapter_capabilities` is computed from what the task actually
needs: always `{FILES, EXECUTION}`, plus `MEMORY`/`SESSION`/`CRON`/
`GATEWAY_RPC`/`BROWSER`/`MULTI_TURN_INJECTION` when the source task's
fields trigger those capabilities.
"""
from __future__ import annotations
from clawbench.canonical.schema import (
AdapterCapability,
BudgetSpec,
CanonicalAssets,
CanonicalPhase,
CanonicalTask,
InteractionPolicy,
SeedEntry,
StateQuery,
VerifierContract,
)
from clawbench.schemas import (
CronState,
GatewayAssertion,
MemoryState,
SessionState,
TaskDefinition,
TaskFamily,
UserTurn,
)
# ---------------------------------------------------------------------------
# Seed state
# ---------------------------------------------------------------------------
def _seeds_from_setup(task: TaskDefinition) -> list[SeedEntry]:
    """Translate a task's setup section into canonical seed entries.

    Asset packs come first, each as a `kind="file"` seed. Memory seeds
    follow as `kind="memory"` entries: the source dict is copied verbatim
    into `metadata`, while `key` and `content` (taken from `"value"`, else
    `"content"`) are surfaced so adapters need not re-parse it.
    """
    file_seeds = [
        SeedEntry(kind="file", asset_pack=pack) for pack in task.setup.asset_packs
    ]
    memory_seeds = [
        SeedEntry(
            kind="memory",
            key=str(entry.get("key", "")),
            content=entry.get("value") or entry.get("content"),
            metadata=dict(entry),
        )
        for entry in task.setup.memory_seed
    ]
    return file_seeds + memory_seeds
# ---------------------------------------------------------------------------
# State queries: memory / session / cron / gateway_assertions
# ---------------------------------------------------------------------------
def _memory_state_to_query(state: MemoryState) -> StateQuery:
    """Convert a `MemoryState` completion check into a memory `StateQuery`.

    The key pattern becomes the selector; `value_contains`, when non-empty,
    is copied into `expected`.
    """
    expected: dict[str, object] = (
        {"value_contains": list(state.value_contains)} if state.value_contains else {}
    )
    predicate = "exists" if state.exists else "absent"
    return StateQuery(
        kind="memory",
        predicate=predicate,
        selector={"key_pattern": state.key_pattern},
        expected=expected,
        required_capability=AdapterCapability.MEMORY,
        description=f"memory key ~ /{state.key_pattern}/",
    )
def _session_state_to_query(state: SessionState) -> StateQuery:
    """Convert a `SessionState` completion check into a session `StateQuery`.

    The selector is empty; `expected["model"]` is set only when the source
    state names a model.
    """
    expected: dict[str, object] = (
        {"model": state.model_should_be} if state.model_should_be else {}
    )
    predicate = "exists" if state.should_exist else "absent"
    return StateQuery(
        kind="session",
        predicate=predicate,
        selector={},
        expected=expected,
        required_capability=AdapterCapability.SESSION,
        description="session state",
    )
def _cron_state_to_query(state: CronState) -> StateQuery:
    """Express a legacy `CronState` check as a cron `StateQuery`.

    `description_contains` (when set) becomes the selector; nothing is
    placed in `expected`.
    """
    predicate = "exists" if state.exists else "absent"
    selector: dict[str, object] = (
        {"description_contains": state.description_contains}
        if state.description_contains
        else {}
    )
    return StateQuery(
        kind="cron",
        predicate=predicate,
        selector=selector,
        expected={},
        required_capability=AdapterCapability.CRON,
        description="cron schedule",
    )
def _gateway_assertion_to_query(assertion: GatewayAssertion) -> StateQuery:
    """Wrap a raw gateway RPC assertion as a ``kind="custom"`` `StateQuery`.

    The RPC method, params and JSON path go into the selector. The
    predicate is chosen by precedence: "equals" if `assert_equals` is set,
    else "contains" if `assert_contains` is set, else "exists"/"absent"
    according to `assert_exists`.
    """
    has_equals = assertion.assert_equals is not None
    has_contains = assertion.assert_contains is not None

    if has_equals:
        predicate = "equals"
    elif has_contains:
        predicate = "contains"
    elif assertion.assert_exists:
        predicate = "exists"
    else:
        predicate = "absent"

    # Every populated comparison is recorded, not just the one that picked
    # the predicate; `exists` is always present.
    expected: dict[str, object] = {}
    if has_equals:
        expected["equals"] = assertion.assert_equals
    if has_contains:
        expected["contains"] = assertion.assert_contains
    expected["exists"] = assertion.assert_exists

    return StateQuery(
        kind="custom",
        predicate=predicate,
        selector={
            "method": assertion.method,
            "params": dict(assertion.params),
            "assert_path": assertion.assert_path,
        },
        expected=expected,
        required_capability=AdapterCapability.GATEWAY_RPC,
        description=f"gateway rpc: {assertion.method}",
    )
def _state_queries_from_completion(task: TaskDefinition) -> list[StateQuery]:
    """Collect all post-run state queries declared by the task's completion spec.

    Order is fixed: memory checks, then the optional session check, then
    cron checks, then gateway assertions.
    """
    completion = task.completion
    queries = [_memory_state_to_query(memory) for memory in completion.memory]
    if completion.session is not None:
        queries.append(_session_state_to_query(completion.session))
    queries.extend(_cron_state_to_query(cron) for cron in completion.cron)
    queries.extend(
        _gateway_assertion_to_query(assertion)
        for assertion in completion.gateway_assertions
    )
    return queries
def _pre_run_queries_from_setup(task: TaskDefinition) -> list[StateQuery]:
    """Convert the setup's `pre_check_gateway` assertions into state queries."""
    return list(map(_gateway_assertion_to_query, task.setup.pre_check_gateway))
# ---------------------------------------------------------------------------
# Phases + dynamic-turn detection
# ---------------------------------------------------------------------------
# Names of the `UserTurn` attributes that make a turn conditional on what
# happened earlier in the trajectory.
_DYNAMIC_TURN_FIELDS = (
    "when_tool_family",
    "when_tool_name",
    "when_assistant_contains",
    "when_last_tool_failed",
)


def _turn_is_dynamic(turn: UserTurn) -> bool:
    """Return True when any trajectory trigger on `turn` is set (truthy)."""
    # Read the failure flag directly first, then fall back to a tolerant
    # lookup (missing attributes count as unset) for the full trigger list.
    if turn.when_last_tool_failed:
        return True
    return any(
        bool(getattr(turn, field_name, None)) for field_name in _DYNAMIC_TURN_FIELDS
    )
def _phases_from_task(task: TaskDefinition) -> tuple[list[CanonicalPhase], bool]:
    """Build canonical phases and report whether any phase needs turn injection.

    Returns the phases in source order plus a flag that is True when at
    least one phase has more than one user turn or contains a turn with a
    trajectory trigger.
    """
    canonical: list[CanonicalPhase] = []
    saw_dynamic = False
    for source_phase in task.normalized_phases():
        canonical.append(
            CanonicalPhase(
                name=source_phase.name,
                user=source_phase.user,
                timeout_seconds=source_phase.timeout_seconds,
            )
        )
        turns = source_phase.user.turns
        if len(turns) > 1:
            saw_dynamic = True
        elif any(_turn_is_dynamic(turn) for turn in turns):
            saw_dynamic = True
    return canonical, saw_dynamic
# ---------------------------------------------------------------------------
# Capability inference
# ---------------------------------------------------------------------------
def _capabilities_for_task(task: TaskDefinition, *, uses_dynamic: bool) -> set[AdapterCapability]:
    """Infer which adapter capabilities a task needs.

    FILES and EXECUTION are always required. The others are added when the
    task declares the matching checks or seeds, belongs to the browser
    family, or (per `uses_dynamic`) relies on injected follow-up turns.
    """
    completion = task.completion
    setup = task.setup

    needs_memory = bool(completion.memory) or any(
        seed.get("key") for seed in setup.memory_seed
    )
    needs_gateway = bool(completion.gateway_assertions or setup.pre_check_gateway)

    conditional = (
        (AdapterCapability.MEMORY, needs_memory),
        (AdapterCapability.SESSION, completion.session is not None),
        (AdapterCapability.CRON, bool(completion.cron)),
        (AdapterCapability.GATEWAY_RPC, needs_gateway),
        (AdapterCapability.BROWSER, task.family == TaskFamily.BROWSER),
        (AdapterCapability.MULTI_TURN_INJECTION, bool(uses_dynamic)),
    )

    required: set[AdapterCapability] = {
        AdapterCapability.FILES,
        AdapterCapability.EXECUTION,
    }
    required.update(capability for capability, needed in conditional if needed)
    return required
# ---------------------------------------------------------------------------
# Public entry point
# ---------------------------------------------------------------------------
def from_task_definition(task: TaskDefinition) -> CanonicalTask:
    """Produce the canonical view of a legacy `TaskDefinition`.

    This is lossless for fields that have a canonical equivalent.
    OpenClaw-only constructs (gateway_assertions, pre_check_gateway,
    memory_seed) become `StateQuery` entries / `SeedEntry` entries
    tagged with the capability an adapter needs to resolve them.

    Args:
        task: The legacy task definition to convert.

    Returns:
        A `CanonicalTask` whose list-valued fields are fresh copies of the
        source lists. `deliverables` is always empty.
    """
    phases, any_dynamic = _phases_from_task(task)
    assets = CanonicalAssets(
        workspace_files=list(task.setup.workspace_files),
        background_services=list(task.setup.background_services),
        seed_state=_seeds_from_setup(task),
    )
    verifier = VerifierContract(
        file_states=list(task.completion.files),
        execution_checks=list(task.completion.execution_checks),
        state_queries=_state_queries_from_completion(task),
        pre_run_queries=_pre_run_queries_from_setup(task),
        trajectory=task.trajectory,
        behavior=task.behavior,
        judge=task.judge,
    )
    interaction = InteractionPolicy(
        # Largest per-phase turn cap; 20 when the task has no phases.
        max_turns=max((phase.user.max_turns for phase in phases), default=20),
        allow_multi_phase=len(phases) > 1,
        uses_dynamic_user_triggers=any_dynamic,
    )
    # Only the overall timeout is carried over; other budgets keep defaults.
    budgets = BudgetSpec(timeout_seconds=task.timeout_seconds)
    capabilities = _capabilities_for_task(task, uses_dynamic=any_dynamic)
    return CanonicalTask(
        id=task.id,
        name=task.name,
        tier=task.tier,
        family=task.family,
        surface=task.surface,
        scenario=task.scenario,
        subscenario=task.subscenario,
        capabilities=list(task.capabilities),
        atomic_capabilities=list(task.atomic_capabilities),
        pool=task.pool,
        subsets=list(task.subsets),
        variant_group=task.variant_group,
        variant_id=task.variant_id,
        template_id=task.template_id,
        release_id=task.release_id,
        source_kind=task.source_kind,
        provenance_ids=list(task.provenance_ids),
        privacy_tier=task.privacy_tier,
        contamination_risk=task.contamination_risk,
        freshness_epoch=task.freshness_epoch,
        category=task.category,
        domain=task.domain,
        functionality=list(task.functionality),
        trace_distribution=list(task.trace_distribution),
        tool_surface=list(task.tool_surface),
        risk_tags=list(task.risk_tags),
        first_used_at=task.first_used_at,
        retire_after_runs=task.retire_after_runs,
        similarity_hash=task.similarity_hash,
        canary_token=task.canary_token,
        official=task.official,
        query_difficulty=task.query_difficulty,
        query_weight=task.query_weight,
        artifact_type=task.artifact_type,
        preconditions=list(task.preconditions),
        source_dataset=task.source_dataset,
        prompt_variants=list(task.prompt_variants),
        pass_threshold=task.pass_threshold,
        assets=assets,
        phases=phases,
        verifier=verifier,
        budgets=budgets,
        interaction=interaction,
        # The legacy definition has no deliverables list to convert.
        deliverables=[],
        required_adapter_capabilities=capabilities,
    )

View File

@ -0,0 +1,296 @@
"""Canonical task schema — agent-agnostic intent.

This is the Phase-4 split of `TaskDefinition` (see CLAWBENCH_V0_4_SPEC.md
§"Canonical Task Schema"). The canonical layer expresses **what** a task
is — its identity, prompts, assets, and verification contract — without
saying **how** it gets executed. The "how" (gateway RPCs, session
lifecycle, tool-family normalization) lives in per-adapter code under
`clawbench/adapters/`.

The rule of thumb:

- If a field describes what the user asked for, what files/state the
  agent is expected to produce, or what the run must satisfy to pass,
  it belongs here.
- If a field describes how OpenClaw's gateway is called to drive the
  run or read back state, it belongs in the OpenClaw adapter (and the
  canonical version of that check is a `StateQuery` with a
  `required_capability`).

Converting from `TaskDefinition` → `CanonicalTask` is lossless for fields
that have a canonical equivalent; OpenClaw-only fields (like
`pre_check_gateway` and `gateway_assertions`) survive as `StateQuery`
entries tagged with `AdapterCapability.GATEWAY_RPC`, so adapters that
support them can still resolve them while adapters that don't can cleanly
report a capability gap.
"""
from __future__ import annotations
import enum
from typing import Any, Literal
from pydantic import BaseModel, Field, model_validator
from clawbench.schemas import (
ArtifactType,
BackgroundService,
BehaviorExpectations,
CapabilityTag,
ExecutionCheck,
FileState,
JudgeExpectations,
PromptVariant,
QueryDifficulty,
ScenarioDomain,
SimulatedUser,
TaskFamily,
TaskPool,
TaskSubset,
Tier,
TrajectoryExpectations,
)
class AdapterCapability(str, enum.Enum):
    """What an adapter is able to provide to a running task.

    Each `StateQuery` declares a `required_capability`. If the selected
    adapter's `capabilities` set does not include that capability, the
    harness either skips the task entirely (strict mode) or scores the
    query as neutral (partial mode). This keeps the leaderboard honest
    about what an adapter can actually evaluate.

    Members are also `str` instances, so each compares equal to its
    lowercase string value.
    """

    FILES = "files"
    EXECUTION = "execution"
    MEMORY = "memory"
    SESSION = "session"
    CRON = "cron"
    BROWSER = "browser"
    GATEWAY_RPC = "gateway_rpc"
    # The adapter can deliver additional user turns mid-trajectory in
    # response to simulated-user triggers (when_tool_family,
    # when_assistant_contains, etc). Single-shot drivers like Hermes's
    # MiniSWERunner do not provide this.
    MULTI_TURN_INJECTION = "multi_turn_injection"
# Closed vocabularies for `StateQuery.kind` and `StateQuery.predicate`.
StateQueryKind = Literal["memory", "session", "cron", "custom"]
StateQueryPredicate = Literal["exists", "absent", "equals", "contains"]


class StateQuery(BaseModel):
    """An abstract state assertion resolved by the active adapter.

    The canonical layer does not commit to how the state is read. For
    example, a `kind="memory"` query with `selector={"key_pattern":"alpha"}`
    and `expected={"value_contains":["foo"]}` means "there is a memory
    entry whose key matches /alpha/ and whose value contains 'foo'".
    OpenClaw's adapter resolves that against the `memory.search` gateway
    RPC; a filesystem-memory adapter (e.g. Hermes) resolves it by
    scanning `MEMORY.md` / `memory/notes.md` in the workspace.

    The `required_capability` is what the harness checks against the
    adapter's declared capability set.
    """

    # Which state store the assertion targets.
    kind: StateQueryKind
    # How the selected state is compared; defaults to a presence check.
    predicate: StateQueryPredicate = "exists"
    # Adapter-interpreted description of which state to look at.
    selector: dict[str, Any] = Field(default_factory=dict)
    # Adapter-interpreted description of what that state should look like.
    expected: dict[str, Any] = Field(default_factory=dict)
    # Capability an adapter must declare in order to resolve this query.
    required_capability: AdapterCapability
    # Short human-readable label for the query.
    description: str = ""
class SeedEntry(BaseModel):
    """A single piece of pre-task state to seed into the workspace.

    `kind="file"`: the adapter writes `content` (or copies a bundled
    asset via `asset_pack`) to `path` inside the workspace.

    `kind="memory"`: the adapter seeds a memory entry with `key` and
    `content`. Adapters without memory support fall back to writing
    the seed as a file (see `environment_files.verify_memory_fallback`).
    """

    kind: Literal["file", "memory"]
    path: str | None = None
    content: str | None = None
    key: str | None = None
    asset_pack: str = ""
    metadata: dict[str, Any] = Field(default_factory=dict)

    @model_validator(mode="after")
    def _validate_shape(self) -> SeedEntry:
        """Reject entries that lack the locator their `kind` requires."""
        # A file seed needs somewhere to come from or go to.
        has_file_locator = bool(self.path) or bool(self.asset_pack)
        if self.kind == "file" and not has_file_locator:
            raise ValueError("SeedEntry(kind='file') requires `path` or `asset_pack`.")
        # A memory seed is addressed by key; empty strings are rejected too.
        has_memory_key = bool(self.key)
        if self.kind == "memory" and not has_memory_key:
            raise ValueError("SeedEntry(kind='memory') requires `key`.")
        return self
class Deliverable(BaseModel):
    """A user-visible artifact the task is expected to produce."""

    # Artifact category, taken from the shared `ArtifactType` vocabulary.
    kind: ArtifactType
    # Paths associated with the artifact; presumably workspace-relative — confirm.
    paths: list[str] = Field(default_factory=list)
    # Free-text description of the artifact.
    description: str = ""
class BudgetSpec(BaseModel):
    """Per-task execution budgets.

    `timeout_seconds` is the wall clock for the full run (all phases).
    `max_tool_calls=0` means unbounded within the timeout. Adapters are
    expected to honor these as soft caps; the harness will also enforce
    the timeout as a hard deadline.
    """

    timeout_seconds: int = 180
    max_tool_calls: int = 0
    # NOTE(review): presumably 0 means "no per-turn limit", mirroring
    # `max_tool_calls` — confirm against the harness.
    per_turn_timeout_seconds: int = 0
class InteractionPolicy(BaseModel):
    """How the canonical phases drive the agent."""

    # Cap on simulated-user turns.
    max_turns: int = 20
    # Whether the task may run more than one phase.
    allow_multi_phase: bool = True
    # Declares that the task's simulated user sends follow-up turns
    # based on trajectory triggers (not just counts). Adapters without
    # MULTI_TURN_INJECTION cannot deliver these dynamically.
    uses_dynamic_user_triggers: bool = False
class VerifierContract(BaseModel):
    """Everything needed to score a run, independent of how it ran.

    The file/execution halves are fully agent-agnostic — `environment_files`
    evaluates them against the workspace directly. State queries are
    resolved by `adapter.verify_state_query`. Trajectory and behavior
    expectations are evaluated against the `Transcript` (already agent-
    agnostic). The optional judge rubric is evaluated against artifacts
    + transcript + completion feedback.
    """

    # Workspace-level checks.
    file_states: list[FileState] = Field(default_factory=list)
    execution_checks: list[ExecutionCheck] = Field(default_factory=list)
    # Adapter-resolved checks: `state_queries` come from the completion
    # spec, `pre_run_queries` from the setup's pre-checks.
    state_queries: list[StateQuery] = Field(default_factory=list)
    pre_run_queries: list[StateQuery] = Field(default_factory=list)
    # Transcript-level expectations.
    trajectory: TrajectoryExpectations = Field(default_factory=TrajectoryExpectations)
    behavior: BehaviorExpectations = Field(default_factory=BehaviorExpectations)
    # Optional judge rubric; None when the task has no judge.
    judge: JudgeExpectations | None = None
class CanonicalAssets(BaseModel):
    """Workspace + seed state the harness realizes before phases run.

    `workspace_files` is a list of relative paths (resolved against the
    task's assets/ dir) to copy into the workspace. `background_services`
    is already canonical (subprocess + readiness probe, no OpenClaw
    coupling). `seed_state` replaces `asset_packs` + `memory_seed` with
    a uniform per-entry list.
    """

    workspace_files: list[str] = Field(default_factory=list)
    background_services: list[BackgroundService] = Field(default_factory=list)
    # One `SeedEntry` per asset pack or memory seed.
    seed_state: list[SeedEntry] = Field(default_factory=list)
class CanonicalPhase(BaseModel):
    """One simulated-user phase in a multi-phase task.

    `user` is reused verbatim from `clawbench.schemas.SimulatedUser` —
    it is already agent-agnostic (turn text + canonical trigger
    predicates). Whether a specific trigger fires on a given adapter
    depends on whether tool-family tags are populated, which is an
    adapter responsibility.
    """

    name: str
    user: SimulatedUser
    # Optional per-phase timeout; None when the phase sets none.
    timeout_seconds: int | None = None
class CanonicalTask(BaseModel):
    """Agent-agnostic task definition.

    Produced by `convert.from_task_definition` from an existing
    `TaskDefinition`. Consumed by adapters via `AdapterContext` and by
    the scorer + trajectory/judge layers. No field here is OpenClaw-
    specific; OpenClaw-only semantics survive as `StateQuery` entries
    with `required_capability=GATEWAY_RPC`.
    """

    # Identity and taxonomy (already canonical in TaskDefinition).
    id: str
    name: str
    tier: Tier
    family: TaskFamily
    surface: str
    scenario: ScenarioDomain | None = None
    subscenario: str = ""
    capabilities: list[CapabilityTag] = Field(default_factory=list)
    atomic_capabilities: list[str] = Field(default_factory=list)

    # Pool / rotation / provenance.
    pool: TaskPool = TaskPool.PUBLIC_DEV
    subsets: list[TaskSubset] = Field(default_factory=list)
    variant_group: str = ""
    variant_id: str = "main"
    template_id: str = ""
    release_id: str = ""
    source_kind: str = ""
    provenance_ids: list[str] = Field(default_factory=list)
    privacy_tier: str = ""
    contamination_risk: str = ""
    freshness_epoch: str = ""
    category: str = ""
    domain: str = ""
    functionality: list[str] = Field(default_factory=list)
    trace_distribution: list[str] = Field(default_factory=list)
    tool_surface: list[str] = Field(default_factory=list)
    risk_tags: list[str] = Field(default_factory=list)
    first_used_at: str = ""
    retire_after_runs: int = 0
    similarity_hash: str = ""
    canary_token: str = ""
    official: bool = False

    # Policy + prompts.
    query_difficulty: QueryDifficulty | None = None
    query_weight: float = 1.0
    artifact_type: ArtifactType | None = None
    preconditions: list[str] = Field(default_factory=list)
    source_dataset: str = ""
    prompt_variants: list[PromptVariant] = Field(default_factory=lambda: [PromptVariant.CLEAR])
    pass_threshold: float = 0.7

    # Canonical body.
    assets: CanonicalAssets = Field(default_factory=CanonicalAssets)
    phases: list[CanonicalPhase]
    verifier: VerifierContract = Field(default_factory=VerifierContract)
    budgets: BudgetSpec = Field(default_factory=BudgetSpec)
    interaction: InteractionPolicy = Field(default_factory=InteractionPolicy)
    deliverables: list[Deliverable] = Field(default_factory=list)

    # Adapter gating.
    required_adapter_capabilities: set[AdapterCapability] = Field(default_factory=set)

    # Forward-compat: lets us evolve this schema while hidden / external
    # task manifests continue to validate.
    schema_version: str = "1"

    @model_validator(mode="after")
    def _defaults(self) -> CanonicalTask:
        """Fill derived defaults once field validation has finished.

        An empty `variant_group` falls back to the task id, and
        `prompt_variants` is de-duplicated in first-seen order, falling
        back to ``[PromptVariant.CLEAR]`` when empty.
        """
        if not self.variant_group:
            self.variant_group = self.id

        unique_variants: list[PromptVariant] = []
        for candidate in self.prompt_variants:
            if candidate in unique_variants:
                continue
            unique_variants.append(candidate)
        # `unique_variants` is empty only when the input list was empty.
        self.prompt_variants = unique_variants or [PromptVariant.CLEAR]
        return self

View File

@ -46,9 +46,16 @@ def cli(verbose: bool) -> None:
type=click.Choice(KNOWN_ADAPTERS),
default="openclaw",
show_default=True,
help="Agent harness adapter. OpenClaw is executable today; other adapters are tracked targets.",
help="Agent harness adapter. OpenClaw uses the gateway; Hermes runs hermes-agent locally.",
)
@click.option("--gateway-token", envvar="OPENCLAW_GATEWAY_TOKEN", default="", help="Gateway auth token")
@click.option(
"--gateway-url",
envvar="OPENCLAW_GATEWAY_URL",
default="ws://localhost:18789",
show_default=True,
help="OpenClaw gateway websocket URL",
)
@click.option(
"--judge-model",
envvar="CLAWBENCH_JUDGE_MODEL",
@ -116,6 +123,11 @@ def cli(verbose: bool) -> None:
"completes the v0.5 Configuration Diagnostic Report is generated and "
"the run is recorded in the historical profile database.",
)
@click.option(
"--tool-profile",
default=None,
help="Optional label for the tool/profile axis recorded in result metadata.",
)
@click.option(
"--insights-dir",
type=click.Path(path_type=Path),
@ -132,6 +144,7 @@ def run(
model: str,
adapter: str,
gateway_token: str,
gateway_url: str,
judge_model: str,
runs: int,
tier: str | None,
@ -149,10 +162,11 @@ def run(
concurrency: int,
browser_concurrency: int,
profile: Path | None,
tool_profile: str | None,
insights_dir: Path,
dynamics: bool,
) -> None:
gateway_config = GatewayConfig(token=gateway_token)
gateway_config = GatewayConfig(url=gateway_url, token=gateway_token)
harness = BenchmarkHarness(
gateway_config=gateway_config,
model=model,
@ -171,6 +185,7 @@ def run(
randomize_order=not no_randomize,
concurrency=concurrency,
browser_concurrency=browser_concurrency,
tool_profile_name=tool_profile,
)
result = asyncio.run(harness.run())
@ -198,6 +213,40 @@ def run(
asyncio.run(upload_result(result))
@cli.command("compare-results")
@click.argument("results", nargs=-1, type=click.Path(exists=True, path_type=Path), required=True)
@click.option("--json-out", is_flag=True, help="Print machine-readable comparison JSON.")
def compare_results_cmd(results: tuple[Path, ...], json_out: bool) -> None:
    """Compare BenchmarkResult JSON files with fairness checks.

    Each file is labelled by its name without the extension. When two files
    share that name, later ones are labelled by their path instead so no
    result is silently dropped.
    """
    from clawbench.ablation import compare_results
    from clawbench.schemas import BenchmarkResult

    loaded: dict[str, BenchmarkResult] = {}
    for path in results:
        with path.open(encoding="utf-8") as handle:
            result = BenchmarkResult(**json.load(handle))

        # Keying on the bare stem alone would let `a/result.json` and
        # `b/result.json` overwrite each other. Keep the stem when it is
        # unique; otherwise fall back to the extension-less path, and add a
        # numeric suffix if the very same path was passed more than once.
        label = path.stem
        if label in loaded:
            label = str(path.with_suffix(""))
        base_label = label
        serial = 2
        while label in loaded:
            label = f"{base_label}#{serial}"
            serial += 1
        loaded[label] = result

    comparison = compare_results(loaded)
    if json_out:
        click.echo(json.dumps(comparison, indent=2, default=str))
        return

    click.echo(f"Task/verifier fair: {comparison['task_verifier_fair']}")
    click.echo(f"Controlled ablation: {comparison['controlled_ablation']}")
    click.echo(f"Same model: {comparison['same_model']}")
    click.echo(f"Same task set: {comparison['same_task_set']}")
    click.echo(f"Same task snapshot: {comparison['same_task_snapshot']}")
    click.echo(f"Same prompt variant: {comparison['same_prompt_variant']}")
    for label, row in comparison["rows"].items():
        click.echo(
            f"{label}: model={row['model']} adapter={row['adapter']} "
            f"tasks={row['task_count']} score={row['score']:.3f} "
            f"C={row['completion']:.3f} T={row['trajectory']:.3f} "
            f"B={row['behavior']:.3f} R={row['reliability']:.3f}"
        )
    for label, delta in comparison["deltas"].items():
        click.echo(f"{label}: {delta:+.3f}")
@cli.command("dynamics-report")
@click.option(
"--archive-dir",
@ -797,6 +846,20 @@ def show(result_file: str) -> None:
)
console.print(f" [bold]pass^k reliability: {result.overall_pass_hat_k:.0%}[/]\n")
for label, dimension_items in (
("Category", result.category_results),
("Domain", result.domain_results),
):
if not dimension_items:
continue
summary = ", ".join(
f"{item.value}={item.weighted_score:.3f}"
for item in sorted(dimension_items, key=lambda item: item.value)
)
console.print(f" [bold]{label}:[/] {summary}")
if result.category_results or result.domain_results:
console.print()
for task in result.task_results:
color = "green" if task.mean_task_score >= 0.7 else "yellow" if task.mean_task_score >= 0.4 else "red"
top_failure = max(task.failure_mode_counts.items(), key=lambda item: item[1])[0] if task.failure_mode_counts else "-"

View File

@ -226,14 +226,81 @@ class GatewayClient:
attempt += 1
try:
remaining = max(1.0, deadline - asyncio.get_running_loop().time())
attempt_timeout = min(30.0, remaining)
self._ws = await websockets.connect(
self.config.url,
max_size=10 * 1024 * 1024,
open_timeout=min(self.config.connect_timeout, remaining),
open_timeout=attempt_timeout,
additional_headers={"Origin": host},
# The benchmark uses loopback gateway sockets and can issue
# long-lived RPCs (notably agent.wait while a provider call
# is in flight). Python websockets' default keepalive can
# close the connection before the gateway surfaces the
# actual model/provider result, contaminating runs as infra
# timeouts. The gateway already owns run-level timeouts.
ping_interval=None,
ping_timeout=None,
)
break
self._listen_task = asyncio.create_task(self._listener())
challenge = await self._wait_event(
"connect.challenge", timeout=attempt_timeout
)
challenge_payload = challenge.get("payload", {})
nonce = ""
if isinstance(challenge_payload, dict):
raw_nonce = challenge_payload.get("nonce", "")
if isinstance(raw_nonce, str):
nonce = raw_nonce.strip()
role = "operator"
scopes = [
"operator.admin",
"operator.read",
"operator.write",
"operator.approvals",
"operator.pairing",
]
client_info = {
"id": "openclaw-control-ui",
"version": __version__,
"platform": "linux",
"mode": "ui",
}
connect_params: dict[str, Any] = {
"minProtocol": PROTOCOL_VERSION,
"maxProtocol": PROTOCOL_VERSION,
"client": client_info,
"role": role,
"scopes": scopes,
"caps": [],
"commands": [],
"permissions": {},
"auth": {"token": self.config.token} if self.config.token else {},
}
device = _build_connect_device(
nonce=nonce,
token=self.config.token,
client_id=str(client_info["id"]),
client_mode=str(client_info["mode"]),
role=role,
scopes=scopes,
platform=str(client_info["platform"]),
)
if device:
connect_params["device"] = device
response = await self._rpc(
"connect",
connect_params,
timeout=attempt_timeout,
)
payload = response.get("payload", {})
if payload.get("type") != "hello-ok":
raise ConnectionError(f"Expected hello-ok, got: {payload}")
logger.info("Connected to gateway (protocol v%s)", payload.get("protocol", "?"))
return
except Exception as exc:
await self.close()
if not _is_transient_gateway_connect_error(exc):
raise
if asyncio.get_running_loop().time() >= deadline:
@ -245,60 +312,6 @@ class GatewayClient:
delay,
)
await asyncio.sleep(delay)
self._listen_task = asyncio.create_task(self._listener())
challenge = await self._wait_event("connect.challenge", timeout=self.config.connect_timeout)
challenge_payload = challenge.get("payload", {})
nonce = ""
if isinstance(challenge_payload, dict):
raw_nonce = challenge_payload.get("nonce", "")
if isinstance(raw_nonce, str):
nonce = raw_nonce.strip()
role = "operator"
scopes = [
"operator.admin",
"operator.read",
"operator.write",
"operator.approvals",
"operator.pairing",
]
client_info = {
"id": "openclaw-control-ui",
"version": __version__,
"platform": "linux",
"mode": "ui",
}
connect_params: dict[str, Any] = {
"minProtocol": PROTOCOL_VERSION,
"maxProtocol": PROTOCOL_VERSION,
"client": client_info,
"role": role,
"scopes": scopes,
"caps": [],
"commands": [],
"permissions": {},
"auth": {"token": self.config.token} if self.config.token else {},
}
device = _build_connect_device(
nonce=nonce,
token=self.config.token,
client_id=str(client_info["id"]),
client_mode=str(client_info["mode"]),
role=role,
scopes=scopes,
platform=str(client_info["platform"]),
)
if device:
connect_params["device"] = device
response = await self._rpc(
"connect",
connect_params,
)
payload = response.get("payload", {})
if payload.get("type") != "hello-ok":
raise ConnectionError(f"Expected hello-ok, got: {payload}")
logger.info("Connected to gateway (protocol v%s)", payload.get("protocol", "?"))
async def close(self) -> None:
if self._listen_task and not self._listen_task.done():
@ -394,6 +407,15 @@ class GatewayClient:
except Exception as exc:
logger.warning("Failed to delete session %s: %s", session_key, exc)
async def abort_session(self, session_key: str, *, run_id: str | None = None) -> None:
    """Ask the gateway to abort a session, optionally scoped to one run.

    Best effort: any failure of the RPC is logged as a warning and
    swallowed, so this never raises for a failed abort.
    """
    request: dict[str, Any] = {"key": session_key}
    if run_id:
        request["runId"] = run_id
    try:
        # Never spend more than 10 seconds on the abort itself.
        rpc_timeout = min(self.config.request_timeout, 10.0)
        await self._rpc("sessions.abort", request, timeout=rpc_timeout)
    except Exception as exc:
        logger.warning("Failed to abort session %s run %s: %s", session_key, run_id or "-", exc)
async def get_effective_tools(self, session_key: str) -> dict[str, Any]:
response = await self._rpc("tools.effective", {"sessionKey": session_key})
return response.get("payload", {})
@ -413,15 +435,27 @@ class GatewayClient:
msg_queue: asyncio.Queue[dict[str, Any]] = asyncio.Queue()
self._event_queues[chat_queue_key] = chat_queue
self._event_queues[msg_queue_key] = msg_queue
timeout_ms = max(1, min(int(timeout * 1000), 2_147_483_647))
await self._rpc(
send_response = await self._rpc(
"sessions.send",
{
"key": session_key,
"message": message,
"idempotencyKey": idempotency_key,
"timeoutMs": timeout_ms,
},
)
send_payload = send_response.get("payload", {})
run_id = idempotency_key
if isinstance(send_payload, dict):
raw_run_id = send_payload.get("runId")
if isinstance(raw_run_id, str) and raw_run_id.strip():
run_id = raw_run_id.strip()
wait_task = asyncio.create_task(
self._wait_for_agent_run(run_id, timeout_ms=timeout_ms)
)
collected_messages: list[TranscriptMessage] = []
done = False
@ -430,8 +464,31 @@ class GatewayClient:
while not done:
remaining = deadline - asyncio.get_running_loop().time()
if remaining <= 0:
logger.warning("Timeout waiting for final state on session %s", session_key)
logger.warning(
"Timeout waiting for final state on session %s run %s",
session_key,
run_id,
)
break
if wait_task.done():
wait_payload = _task_result_or_empty(wait_task)
status = str(wait_payload.get("status", ""))
if status and status != "timeout":
logger.info(
"agent.wait observed terminal status for session %s run %s: %s",
session_key,
run_id,
status,
)
done = True
break
if status == "timeout":
logger.warning(
"agent.wait timed out for session %s run %s",
session_key,
run_id,
)
break
try:
event = await asyncio.wait_for(chat_queue.get(), timeout=min(0.5, remaining))
state = event.get("payload", {}).get("state", "")
@ -440,6 +497,9 @@ class GatewayClient:
except asyncio.TimeoutError:
pass
if not done:
await self.abort_session(session_key, run_id=run_id)
collected_messages.extend(
await _drain_message_queue(
msg_queue,
@ -464,11 +524,30 @@ class GatewayClient:
):
collected_messages = history_messages
finally:
if not wait_task.done():
wait_task.cancel()
try:
await wait_task
except asyncio.CancelledError:
pass
self._event_queues.pop(chat_queue_key, None)
self._event_queues.pop(msg_queue_key, None)
return _correlate_transcript(Transcript(messages=collected_messages))
async def _wait_for_agent_run(self, run_id: str, *, timeout_ms: int) -> dict[str, Any]:
    """Issue `agent.wait` for `run_id` and return its payload dict.

    The RPC is given 10 seconds beyond the requested wait. Any failure is
    logged and reported as an empty dict, as is a payload that is not a
    dict.
    """
    try:
        request = {"runId": run_id, "timeoutMs": timeout_ms}
        rpc_timeout = (timeout_ms / 1000.0) + 10.0
        response = await self._rpc("agent.wait", request, timeout=rpc_timeout)
    except Exception as exc:
        logger.warning("agent.wait failed for run %s: %s", run_id, exc)
        return {}
    payload = response.get("payload", {})
    if isinstance(payload, dict):
        return payload
    return {}
async def get_session_messages(self, session_key: str) -> list[TranscriptMessage]:
try:
response = await self._rpc("sessions.get", {"key": session_key})
@ -574,6 +653,13 @@ def _build_connect_device(
platform: str,
device_family: str | None = None,
) -> dict[str, Any] | None:
if os.environ.get("CLAWBENCH_DISABLE_GATEWAY_DEVICE_IDENTITY", "").strip().lower() in {
"1",
"true",
"yes",
"on",
}:
return None
if not nonce:
return None
@ -643,6 +729,10 @@ def _resolve_node_executable() -> str | None:
def _is_transient_gateway_connect_error(exc: Exception) -> bool:
if isinstance(exc, (TimeoutError, asyncio.TimeoutError)):
return True
if isinstance(exc, websockets.exceptions.ConnectionClosed):
return True
if isinstance(exc, InvalidStatus):
return exc.response.status_code in {502, 503, 504}
if isinstance(exc, InvalidMessage):
@ -658,6 +748,13 @@ def _describe_connect_error(exc: Exception) -> str:
return exc.__class__.__name__
def _task_result_or_empty(task: asyncio.Task[dict[str, Any]]) -> dict[str, Any]:
    """Return the task's result, or an empty dict when it has none to give.

    An empty dict is returned when the task was cancelled, raised an
    exception, or has not finished yet.
    """
    # `task.result()` on a cancelled task raises `asyncio.CancelledError`,
    # which is a `BaseException` and would escape the handler below,
    # looking to the caller like its own cancellation.
    if task.cancelled():
        return {}
    try:
        return task.result()
    except Exception:
        return {}
def _parse_single_message(message_data: dict[str, Any]) -> TranscriptMessage | None:
role = message_data.get("role", "")
if not role:

View File

@ -1,17 +1,44 @@
"""Completion verification for ClawBench v0.3."""
"""Completion verification — OpenClaw-aware entry point.
Historically this module contained both agent-agnostic verification
primitives (file states, execution checks, workspace memory scans, JSON
path resolution) and OpenClaw-specific verifiers that reach into the
gateway via RPCs (`memory.search`, `sessions.resolve`, `cron.list`,
arbitrary `_rpc(method)`).
Phase-4 splits them:
- The agent-agnostic primitives now live in `clawbench.environment_files`
and are used by every adapter.
- The OpenClaw-specific primitives stay here for now and will move into
`clawbench/adapters/openclaw.py` once the adapter wiring lands in a
later step.
The public surface — `verify_completion`, `run_execution_check`, module-
level helpers — stays unchanged so existing callers (harness, scorer,
tests) keep working. Function bodies that used to do real work now
delegate to `environment_files` to keep behavior identical.
"""
from __future__ import annotations
import asyncio
import json
import logging
import re
import shlex
from pathlib import Path
from typing import Any
from clawbench.client import GatewayClient
from clawbench.render import render_template, render_value
from clawbench.environment_files import (
MEMORY_FILE_CANDIDATES,
evaluate_execution_result as _evaluate_execution_result_impl,
memory_visible_in_transcript as _memory_visible_in_transcript_impl,
read_workspace_memory_text,
resolve_json_path,
run_execution_check as _run_execution_check_impl,
verify_file_state as _verify_file_state_impl,
verify_memory_fallback,
)
from clawbench.schemas import (
CompletionResult,
CompletionSpec,
@ -52,7 +79,9 @@ async def verify_completion(
failures.append(f"FILE {spec.path}: {reason}")
for spec in completion.memory:
ok, reason = await _verify_memory(spec, client, session_key, agent_id=agent_id, transcript=transcript)
ok, reason = await _verify_memory(
spec, client, session_key, agent_id=agent_id, transcript=transcript, workspace=workspace
)
total += 1
if ok:
passed += 1
@ -102,82 +131,20 @@ async def verify_completion(
)
# ---------------------------------------------------------------------------
# Agent-agnostic primitives — re-exported via delegates so historical
# callers that import from `clawbench.environment` keep working.
# ---------------------------------------------------------------------------
async def run_execution_check(
spec: ExecutionCheck,
*,
workspace: Path,
runtime_values: dict[str, Any],
) -> ExecutionCheckResult:
rendered_command = render_template(spec.command, runtime_values)
rendered_cwd = workspace / render_template(spec.cwd, runtime_values)
rendered_env = render_value(spec.env, runtime_values)
import os
import sys
full_env = {
**os.environ,
**{key: str(value) for key, value in rendered_env.items()},
"PYTHONUNBUFFERED": "1",
}
python_bin_dir = str(Path(sys.executable).parent)
full_env["PATH"] = f"{python_bin_dir}:{full_env.get('PATH', '')}"
python_path_parts = [str(rendered_cwd), str(workspace)]
existing_pythonpath = full_env.get("PYTHONPATH")
if existing_pythonpath:
python_path_parts.append(existing_pythonpath)
full_env["PYTHONPATH"] = ":".join(python_path_parts)
try:
if spec.shell:
process = await asyncio.create_subprocess_shell(
rendered_command,
cwd=str(rendered_cwd),
env=full_env,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
else:
process = await asyncio.create_subprocess_exec(
*shlex.split(rendered_command),
cwd=str(rendered_cwd),
env=full_env,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
stdout_bytes, stderr_bytes = await asyncio.wait_for(
process.communicate(),
timeout=spec.timeout_seconds,
)
except asyncio.TimeoutError:
process.kill()
await process.communicate()
return ExecutionCheckResult(
name=spec.name,
command=rendered_command,
exit_code=-1,
passed=False,
reason=f"Timed out after {spec.timeout_seconds}s",
)
except Exception as exc:
return ExecutionCheckResult(
name=spec.name,
command=rendered_command,
exit_code=-1,
passed=False,
reason=str(exc),
)
stdout = stdout_bytes.decode("utf-8", errors="replace")
stderr = stderr_bytes.decode("utf-8", errors="replace")
passed, reason = _evaluate_execution_result(spec, workspace, runtime_values, process.returncode, stdout, stderr)
return ExecutionCheckResult(
name=spec.name,
command=rendered_command,
exit_code=process.returncode,
stdout=stdout,
stderr=stderr,
passed=passed,
reason=reason,
return await _run_execution_check_impl(
spec, workspace=workspace, runtime_values=runtime_values
)
@ -189,92 +156,27 @@ def _evaluate_execution_result(
stdout: str,
stderr: str,
) -> tuple[bool, str]:
if exit_code != spec.expected_exit_code:
return False, f"Exit code {exit_code} != expected {spec.expected_exit_code}"
for token in spec.stdout_contains:
rendered = render_template(token, runtime_values)
if rendered not in stdout:
return False, f"stdout missing '{rendered}'"
for token in spec.stdout_not_contains:
rendered = render_template(token, runtime_values)
if rendered in stdout:
return False, f"stdout unexpectedly contains '{rendered}'"
for token in spec.stderr_contains:
rendered = render_template(token, runtime_values)
if rendered not in stderr:
return False, f"stderr missing '{rendered}'"
if spec.stdout_matches and not re.search(render_template(spec.stdout_matches, runtime_values), stdout, re.MULTILINE | re.DOTALL):
return False, f"stdout does not match {spec.stdout_matches}"
if spec.stderr_matches and not re.search(render_template(spec.stderr_matches, runtime_values), stderr, re.MULTILINE | re.DOTALL):
return False, f"stderr does not match {spec.stderr_matches}"
if spec.expected_stdout is not None:
rendered = render_template(spec.expected_stdout, runtime_values).strip()
if stdout.strip() != rendered:
return False, "stdout did not match expected text"
if spec.expected_stdout_file:
expected_path = workspace / render_template(spec.expected_stdout_file, runtime_values)
if stdout.strip() != expected_path.read_text(encoding="utf-8").strip():
return False, f"stdout did not match {spec.expected_stdout_file}"
if spec.expected_json is not None:
try:
parsed = json.loads(stdout)
except json.JSONDecodeError as exc:
return False, f"stdout was not valid JSON: {exc}"
if parsed != render_value(spec.expected_json, runtime_values):
return False, "stdout JSON did not match expected JSON"
if spec.expected_json_file:
expected_path = workspace / render_template(spec.expected_json_file, runtime_values)
try:
parsed = json.loads(stdout)
except json.JSONDecodeError as exc:
return False, f"stdout was not valid JSON: {exc}"
expected_json = json.loads(expected_path.read_text(encoding="utf-8"))
if parsed != expected_json:
return False, f"stdout JSON did not match {spec.expected_json_file}"
return True, "OK"
return _evaluate_execution_result_impl(
spec, workspace, runtime_values, exit_code, stdout, stderr
)
def _verify_file(spec: FileState, workspace: Path, runtime_values: dict[str, Any]) -> tuple[bool, str]:
path = workspace / render_template(spec.path, runtime_values)
exists = path.exists() and path.is_file()
return _verify_file_state_impl(spec, workspace, runtime_values)
if not spec.exists:
return (not exists, "Correctly absent" if not exists else "File should not exist")
if not exists:
return False, "File does not exist"
content = path.read_text(encoding="utf-8", errors="replace")
if spec.min_size_bytes > 0 and path.stat().st_size < spec.min_size_bytes:
return False, f"File too small: {path.stat().st_size} < {spec.min_size_bytes}"
def _memory_visible_in_transcript(spec: MemoryState, transcript: Transcript) -> bool:
return _memory_visible_in_transcript_impl(spec, transcript)
for token in spec.content_contains:
rendered = render_template(token, runtime_values)
if rendered not in content:
return False, f"Missing expected content '{rendered}'"
for token in spec.content_not_contains:
rendered = render_template(token, runtime_values)
if rendered in content:
return False, f"Contains forbidden content '{rendered}'"
def _resolve_path(payload: Any, path: str) -> Any:
return resolve_json_path(payload, path)
if spec.content_matches and not re.search(
render_template(spec.content_matches, runtime_values),
content,
re.MULTILINE | re.DOTALL,
):
return False, f"Content does not match {spec.content_matches}"
return True, "OK"
# ---------------------------------------------------------------------------
# OpenClaw-tied verifiers. These call `GatewayClient` RPCs; they will
# migrate into `adapters/openclaw.py` once the adapter wiring lands.
# ---------------------------------------------------------------------------
async def _verify_memory(
@ -284,6 +186,7 @@ async def _verify_memory(
*,
agent_id: str | None = None,
transcript: Transcript | None = None,
workspace: Path | None = None,
) -> tuple[bool, str]:
try:
response = await client._rpc(
@ -305,16 +208,42 @@ async def _verify_memory(
return False, f"Memory value missing '{token}'"
return True, "OK"
except Exception as exc:
logger.info("memory.search unavailable for verification, falling back to agent memory files: %s", exc)
logger.info(
"memory.search unavailable for verification, falling back to agent memory files: %s",
exc,
)
# Fallback path: pull the same set of memory files the agent would
# produce (MEMORY.md, memory/notes.md, …) via the gateway, then hand
# the resulting text to the shared filesystem-fallback resolver in
# `environment_files`. If no gateway is available (agent_id is None
# or the calls error) and a workspace was supplied, fall back further
# to scanning the workspace filesystem directly.
extra_memory_text = ""
if agent_id:
try:
extra_memory_text = await _read_agent_memory_text(client, agent_id)
except Exception:
extra_memory_text = ""
if workspace is not None:
return verify_memory_fallback(
spec,
workspace,
transcript=transcript,
extra_memory_text=extra_memory_text,
)
if not agent_id:
return False, "memory.search unavailable and no agent id was provided for fallback verification"
fallback_text = await _read_agent_memory_text(client, agent_id)
normalized = fallback_text.lower()
# Legacy pre-workspace path: agent_id is set but we don't have a
# workspace handle. Resolve using only the gateway-sourced text +
# transcript scan to preserve the exact prior behavior.
normalized = extra_memory_text.lower()
needle = spec.key_pattern.lower()
found = needle in normalized
if not spec.exists:
return (not found, "Correctly absent" if not found else "Memory entry exists")
if found:
@ -322,23 +251,17 @@ async def _verify_memory(
if token.lower() not in normalized:
return False, f"Memory value missing '{token}'"
return True, "OK"
if transcript and _memory_visible_in_transcript(spec, transcript):
return True, "Verified from transcript fallback"
return False, "No matching memory content found in persisted memory files or transcript fallback"
return (
False,
"No matching memory content found in persisted memory files or transcript fallback",
)
async def _read_agent_memory_text(client: GatewayClient, agent_id: str) -> str:
contents: list[str] = []
for file_name in (
"MEMORY.md",
"memory.md",
"memory/MEMORY.md",
"memory/memory.md",
"memory/notes.md",
"memory/NOTES.md",
"notes.md",
):
for file_name in MEMORY_FILE_CANDIDATES:
try:
payload = await client.get_agent_file(agent_id, file_name)
except Exception:
@ -350,30 +273,6 @@ async def _read_agent_memory_text(client: GatewayClient, agent_id: str) -> str:
return "\n".join(contents)
def _memory_visible_in_transcript(spec: MemoryState, transcript: Transcript) -> bool:
needle = spec.key_pattern.lower()
for call in transcript.tool_call_sequence:
family = (call.family or "").lower()
name = call.name.lower()
path = str(call.input.get("path", "")).lower()
if family != "memory" and "memory" not in path:
continue
if family == "memory" and "search" in name and "write" not in name and "store" not in name and "save" not in name:
continue
serialized_bits = [call.output, call.error]
try:
serialized_bits.append(json.dumps(call.input, sort_keys=True))
except TypeError:
serialized_bits.append(str(call.input))
haystack = " ".join(bit for bit in serialized_bits if bit).lower()
if needle not in haystack:
continue
if all(token.lower() in haystack for token in spec.value_contains):
return True
return False
async def _verify_session(
spec: SessionState,
client: GatewayClient,
@ -404,8 +303,7 @@ async def _verify_cron(spec: CronState, client: GatewayClient) -> tuple[bool, st
if not jobs:
return False, "No cron jobs found"
if spec.description_contains and not any(
spec.description_contains.lower() in json.dumps(job).lower()
for job in jobs
spec.description_contains.lower() in json.dumps(job).lower() for job in jobs
):
return False, f"No cron job matched '{spec.description_contains}'"
return True, "OK"
@ -420,7 +318,7 @@ async def _verify_gateway_assertion(
try:
response = await client._rpc(spec.method, spec.params)
payload = response.get("payload", {})
value = _resolve_path(payload, spec.assert_path)
value = resolve_json_path(payload, spec.assert_path)
if not spec.assert_exists:
return (value is None, "Correctly absent" if value is None else "Path exists")
if value is None:
@ -434,28 +332,13 @@ async def _verify_gateway_assertion(
return False, str(exc)
def _resolve_path(payload: Any, path: str) -> Any:
if path == "$":
return payload
current = payload
for part in path.lstrip("$").lstrip(".").split("."):
if not part:
continue
match = re.fullmatch(r"([^\[]+)\[(\d+)\]", part)
if match:
key, index = match.groups()
if not isinstance(current, dict) or key not in current:
return None
current = current[key]
if not isinstance(current, list):
return None
idx = int(index)
if idx >= len(current):
return None
current = current[idx]
continue
if isinstance(current, dict) and part in current:
current = current[part]
continue
return None
return current
# Backward-compatible names for any external users that imported the
# private delegates directly. The old symbols resolve to the new ones.
_verify_file_state = _verify_file
_verify_execution = _evaluate_execution_result_impl
__all__ = [
"run_execution_check",
"verify_completion",
]

View File

@ -0,0 +1,403 @@
"""Agent-agnostic workspace verification primitives.
This is the half of `environment.py` that does not touch the OpenClaw
gateway: file-state checks, execution-check subprocessing, stdout/JSON
assertions, JSON path resolution, and the filesystem/transcript-based
memory fallback readers.
Adapters (OpenClaw, Hermes, future) consume these primitives directly.
`environment.py` re-exports them for back-compat so existing callers
keep working while the gateway-tied halves (`_verify_memory` primary
path, `_verify_session`, `_verify_cron`, `_verify_gateway_assertion`)
stay where they are and move to `adapters/openclaw.py` in a later step.
"""
from __future__ import annotations
import asyncio
import json
import logging
import os
import re
import shlex
import sys
from pathlib import Path
from typing import Any
from clawbench.render import render_template, render_value
from clawbench.schemas import (
ExecutionCheck,
ExecutionCheckResult,
FileState,
MemoryState,
Transcript,
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# File-state verification
# ---------------------------------------------------------------------------
def verify_file_state(
    spec: FileState,
    workspace: Path,
    runtime_values: dict[str, Any],
) -> tuple[bool, str]:
    """Check one `FileState` expectation against the workspace on disk.

    `spec.path` is rendered with `runtime_values` and resolved relative to
    `workspace`. Returns `(passed, reason)`; `reason` is "OK" (or
    "Correctly absent") on success and a short explanation otherwise.
    """
    target = workspace / render_template(spec.path, runtime_values)
    present = target.exists() and target.is_file()

    # Absence assertions only care about whether a regular file is there.
    if not spec.exists:
        if present:
            return False, "File should not exist"
        return True, "Correctly absent"

    if not present:
        return False, "File does not exist"

    text = target.read_text(encoding="utf-8", errors="replace")

    minimum = spec.min_size_bytes
    if minimum > 0 and target.stat().st_size < minimum:
        return False, f"File too small: {target.stat().st_size} < {minimum}"

    # Every required snippet must appear verbatim after template rendering.
    for required in spec.content_contains:
        snippet = render_template(required, runtime_values)
        if snippet not in text:
            return False, f"Missing expected content '{snippet}'"

    # No forbidden snippet may appear.
    for forbidden in spec.content_not_contains:
        snippet = render_template(forbidden, runtime_values)
        if snippet in text:
            return False, f"Contains forbidden content '{snippet}'"

    pattern = spec.content_matches
    if pattern:
        regex = render_template(pattern, runtime_values)
        if re.search(regex, text, re.MULTILINE | re.DOTALL) is None:
            return False, f"Content does not match {pattern}"

    return True, "OK"
# ---------------------------------------------------------------------------
# Execution checks
# ---------------------------------------------------------------------------
async def run_execution_check(
    spec: ExecutionCheck,
    *,
    workspace: Path,
    runtime_values: dict[str, Any],
) -> ExecutionCheckResult:
    """Run a single `ExecutionCheck` subprocess and evaluate its output.

    The command, working directory and extra environment variables are
    rendered with `runtime_values`. The child inherits the current
    environment plus the rendered variables, with the running interpreter's
    directory prepended to ``PATH`` and the check's cwd and the workspace
    prepended to ``PYTHONPATH``.

    Returns:
        An `ExecutionCheckResult`. Spawn failures, timeouts and unexpected
        errors while waiting are reported as ``exit_code=-1, passed=False``
        with the reason filled in; otherwise the result carries the real
        exit code, decoded stdout/stderr and the verdict from
        `evaluate_execution_result`.
    """
    rendered_command = render_template(spec.command, runtime_values)
    rendered_cwd = workspace / render_template(spec.cwd, runtime_values)
    rendered_env = render_value(spec.env, runtime_values)

    full_env = {
        **os.environ,
        **{key: str(value) for key, value in rendered_env.items()},
        "PYTHONUNBUFFERED": "1",
    }

    # Use the platform separator and skip empty components: an empty PATH
    # entry means "current directory" on POSIX, which we never want to add
    # implicitly when PATH happens to be unset.
    python_bin_dir = str(Path(sys.executable).parent)
    full_env["PATH"] = os.pathsep.join(
        entry for entry in (python_bin_dir, full_env.get("PATH", "")) if entry
    )
    python_path_parts = [str(rendered_cwd), str(workspace)]
    existing_pythonpath = full_env.get("PYTHONPATH")
    if existing_pythonpath:
        python_path_parts.append(existing_pythonpath)
    full_env["PYTHONPATH"] = os.pathsep.join(python_path_parts)

    def _failure(reason: str) -> ExecutionCheckResult:
        return ExecutionCheckResult(
            name=spec.name,
            command=rendered_command,
            exit_code=-1,
            passed=False,
            reason=reason,
        )

    # Spawn separately from waiting so that a spawn error (which may itself
    # be a TimeoutError on Python 3.11+) can never reach the timeout handler
    # with `process` unbound.
    try:
        if spec.shell:
            process = await asyncio.create_subprocess_shell(
                rendered_command,
                cwd=str(rendered_cwd),
                env=full_env,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
        else:
            process = await asyncio.create_subprocess_exec(
                *shlex.split(rendered_command),
                cwd=str(rendered_cwd),
                env=full_env,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
    except Exception as exc:
        return _failure(str(exc))

    def _kill_quietly() -> None:
        # The child may have exited between the failure and the kill.
        if process.returncode is None:
            try:
                process.kill()
            except ProcessLookupError:
                pass

    try:
        stdout_bytes, stderr_bytes = await asyncio.wait_for(
            process.communicate(),
            timeout=spec.timeout_seconds,
        )
    except asyncio.TimeoutError:
        _kill_quietly()
        # Reap the child so it does not linger as a zombie.
        # NOTE(review): with shell=True only the shell is killed; descendants
        # holding the pipes open could keep this await pending.
        await process.communicate()
        return _failure(f"Timed out after {spec.timeout_seconds}s")
    except Exception as exc:
        _kill_quietly()
        return _failure(str(exc))

    stdout = stdout_bytes.decode("utf-8", errors="replace")
    stderr = stderr_bytes.decode("utf-8", errors="replace")
    passed, reason = evaluate_execution_result(
        spec, workspace, runtime_values, process.returncode, stdout, stderr
    )
    return ExecutionCheckResult(
        name=spec.name,
        command=rendered_command,
        exit_code=process.returncode,
        stdout=stdout,
        stderr=stderr,
        passed=passed,
        reason=reason,
    )
def evaluate_execution_result(
    spec: ExecutionCheck,
    workspace: Path,
    runtime_values: dict[str, Any],
    exit_code: int,
    stdout: str,
    stderr: str,
) -> tuple[bool, str]:
    """Apply every assertion declared on an `ExecutionCheck`.

    Assertions are evaluated in a fixed order (exit code, substring checks,
    regex checks, exact stdout, stdout file, inline JSON, JSON file) and the
    first failure wins.

    Returns:
        `(True, "OK")` when all declared assertions hold, otherwise
        `(False, reason)`. Expected-output files that are missing,
        unreadable or (for JSON) malformed produce a failed check with an
        explanatory reason rather than an exception.
    """
    if exit_code != spec.expected_exit_code:
        return False, f"Exit code {exit_code} != expected {spec.expected_exit_code}"

    for token in spec.stdout_contains:
        rendered = render_template(token, runtime_values)
        if rendered not in stdout:
            return False, f"stdout missing '{rendered}'"
    for token in spec.stdout_not_contains:
        rendered = render_template(token, runtime_values)
        if rendered in stdout:
            return False, f"stdout unexpectedly contains '{rendered}'"
    for token in spec.stderr_contains:
        rendered = render_template(token, runtime_values)
        if rendered not in stderr:
            return False, f"stderr missing '{rendered}'"

    if spec.stdout_matches and not re.search(
        render_template(spec.stdout_matches, runtime_values), stdout, re.MULTILINE | re.DOTALL
    ):
        return False, f"stdout does not match {spec.stdout_matches}"
    if spec.stderr_matches and not re.search(
        render_template(spec.stderr_matches, runtime_values), stderr, re.MULTILINE | re.DOTALL
    ):
        return False, f"stderr does not match {spec.stderr_matches}"

    if spec.expected_stdout is not None:
        rendered = render_template(spec.expected_stdout, runtime_values).strip()
        if stdout.strip() != rendered:
            return False, "stdout did not match expected text"

    if spec.expected_stdout_file:
        expected_path = workspace / render_template(spec.expected_stdout_file, runtime_values)
        try:
            expected_text = expected_path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as exc:
            # The caller invokes this outside its own try/except, so an
            # unreadable fixture must fail the check instead of raising.
            return False, f"could not read {spec.expected_stdout_file}: {exc}"
        if stdout.strip() != expected_text.strip():
            return False, f"stdout did not match {spec.expected_stdout_file}"

    if spec.expected_json is not None or spec.expected_json_file:
        # Parse stdout once and reuse it for both JSON expectations.
        try:
            parsed = json.loads(stdout)
        except json.JSONDecodeError as exc:
            return False, f"stdout was not valid JSON: {exc}"

        if spec.expected_json is not None and parsed != render_value(
            spec.expected_json, runtime_values
        ):
            return False, "stdout JSON did not match expected JSON"

        if spec.expected_json_file:
            expected_path = workspace / render_template(spec.expected_json_file, runtime_values)
            try:
                expected_json = json.loads(expected_path.read_text(encoding="utf-8"))
            except (OSError, ValueError) as exc:
                # ValueError covers both JSONDecodeError and UnicodeDecodeError.
                return (
                    False,
                    f"could not load expected JSON from {spec.expected_json_file}: {exc}",
                )
            if parsed != expected_json:
                return False, f"stdout JSON did not match {spec.expected_json_file}"

    return True, "OK"
# ---------------------------------------------------------------------------
# Memory fallback: read well-known files from the workspace directly.
# ---------------------------------------------------------------------------
# Workspace-relative locations probed, in this order, when looking for
# persisted agent memory.
MEMORY_FILE_CANDIDATES: tuple[str, ...] = (
    "MEMORY.md",
    "memory.md",
    "memory/MEMORY.md",
    "memory/memory.md",
    "memory/notes.md",
    "memory/NOTES.md",
    "notes.md",
)


def read_workspace_memory_text(workspace: Path) -> str:
    """Return the newline-joined text of every memory file in `workspace`.

    Each entry of `MEMORY_FILE_CANDIDATES` is probed relative to
    `workspace`. Candidates that are not regular files, cannot be read, or
    contain only whitespace are skipped. This is the filesystem counterpart
    of reading the same files through a gateway, for adapters whose agent
    works directly inside the ClawBench workspace.
    """

    def _load(candidate: Path) -> str:
        # Best-effort: any failure while probing or reading means "no text".
        try:
            if not candidate.is_file():
                return ""
            return candidate.read_text(encoding="utf-8", errors="replace")
        except Exception:
            return ""

    texts = (_load(workspace / name) for name in MEMORY_FILE_CANDIDATES)
    return "\n".join(text for text in texts if text.strip())
def memory_visible_in_transcript(spec: MemoryState, transcript: Transcript) -> bool:
    """Report whether the transcript contains a memory write matching `spec`.

    A tool call is considered when its family is "memory" or its `path`
    input mentions "memory". Memory-family calls whose name contains
    "search" but none of "write"/"store"/"save" are treated as reads and
    ignored. A call matches when `spec.key_pattern` and every
    `spec.value_contains` token occur (case-insensitively) in the call's
    output, error, or serialized input.
    """
    key = spec.key_pattern.lower()
    write_markers = ("write", "store", "save")

    for call in transcript.tool_call_sequence:
        family = (call.family or "").lower()
        tool_name = call.name.lower()
        target_path = str(call.input.get("path", "")).lower()
        is_memory_family = family == "memory"

        # Only memory tools, or file tools touching a memory path, qualify.
        if not is_memory_family and "memory" not in target_path:
            continue

        # Pure lookups prove nothing about what was persisted.
        if (
            is_memory_family
            and "search" in tool_name
            and not any(marker in tool_name for marker in write_markers)
        ):
            continue

        try:
            rendered_input = json.dumps(call.input, sort_keys=True)
        except TypeError:
            rendered_input = str(call.input)

        pieces = [call.output, call.error, rendered_input]
        blob = " ".join(piece for piece in pieces if piece).lower()

        if key in blob and all(token.lower() in blob for token in spec.value_contains):
            return True

    return False
def verify_memory_fallback(
    spec: MemoryState,
    workspace: Path,
    *,
    transcript: Transcript | None = None,
    extra_memory_text: str = "",
) -> tuple[bool, str]:
    """Evaluate a `MemoryState` assertion without a `memory.search` RPC.

    The searchable corpus is the workspace memory files plus any
    adapter-provided `extra_memory_text`, compared case-insensitively:

    * For absence assertions (`spec.exists` false) the key must not occur.
    * If the key occurs, every `spec.value_contains` token must occur too.
    * If the key does not occur, a matching memory write found in
      `transcript` is accepted instead.

    Returns `(passed, reason)`.
    """
    corpus = "\n".join((read_workspace_memory_text(workspace), extra_memory_text)).lower()
    key_present = spec.key_pattern.lower() in corpus

    if not spec.exists:
        if key_present:
            return False, "Memory entry exists"
        return True, "Correctly absent"

    if key_present:
        for expected in spec.value_contains:
            if expected.lower() not in corpus:
                return False, f"Memory value missing '{expected}'"
        return True, "OK"

    # Nothing persisted on disk: accept evidence of a write in the transcript.
    if transcript is not None and memory_visible_in_transcript(spec, transcript):
        return True, "Verified from transcript fallback"

    return (
        False,
        "No matching memory content found in persisted memory files or transcript fallback",
    )
# ---------------------------------------------------------------------------
# JSON-path resolver (pure function over dict/list payloads)
# ---------------------------------------------------------------------------
def resolve_json_path(payload: Any, path: str) -> Any:
    """Walk a dotted path such as `$.foo.bar[0].baz` through `payload`.

    A leading `$` and `.` are optional and empty segments are ignored. A
    segment of the form `name[N]` selects key `name` from a dict and then
    element `N` from the list stored there. Returns None as soon as a key
    is missing, an index is out of range, or a value has the wrong type.
    """
    if path == "$":
        return payload

    node = payload
    segments = path.lstrip("$").lstrip(".").split(".")
    for segment in segments:
        if not segment:
            continue

        indexed = re.fullmatch(r"([^\[]+)\[(\d+)\]", segment)
        if indexed is None:
            # Plain key lookup.
            if not (isinstance(node, dict) and segment in node):
                return None
            node = node[segment]
            continue

        # `name[N]`: key lookup followed by a list index.
        key, raw_index = indexed.groups()
        if not isinstance(node, dict) or key not in node:
            return None
        items = node[key]
        if not isinstance(items, list):
            return None
        position = int(raw_index)
        if position >= len(items):
            return None
        node = items[position]

    return node
__all__ = [
"MEMORY_FILE_CANDIDATES",
"evaluate_execution_result",
"memory_visible_in_transcript",
"read_workspace_memory_text",
"resolve_json_path",
"run_execution_check",
"verify_file_state",
"verify_memory_fallback",
]

View File

@ -8,21 +8,35 @@ import hashlib
import logging
import os
import shutil
import subprocess
import time
import uuid
from collections.abc import Awaitable, Callable
from pathlib import Path
from typing import Any
from urllib.parse import urlparse
from rich.console import Console
from rich.table import Table
from clawbench import __version__
from clawbench.ablation import build_ablation_profile, git_head
from clawbench.adapters import get_adapter
from clawbench.adapters.base import AdapterContext
from clawbench.adapters.hermes import HermesAdapterConfig
from clawbench.adapters.openclaw import OpenClawAdapterConfig
from clawbench.canonical.convert import from_task_definition
from clawbench.client import GatewayClient, GatewayConfig
from clawbench.environment_files import run_execution_check, verify_file_state
from clawbench.judge import judge_task_run
from clawbench.releases import compute_task_snapshot_fingerprint, load_active_release
from clawbench.schemas import (
BenchmarkResult,
CompletionResult,
DimensionResult,
DeliveryOutcome,
EfficiencyResult,
JudgeResult,
ScenarioResult,
TaskDefinition,
TaskRunResult,
@ -30,18 +44,38 @@ from clawbench.schemas import (
TierResult,
Transcript,
)
from clawbench.scorer import classify_error_failure_mode, score_task_run
from clawbench.session_labels import unique_session_label
from clawbench.scorer import (
classify_delivery_outcome,
classify_error_failure_mode,
classify_failure_mode,
combine_run_score,
evaluate_behavior,
)
from clawbench.services import build_runtime_values, start_background_services, stop_background_services
from clawbench.simulated_user import UserSimulator
from clawbench.stats import bootstrap_ci, summarize_task_runs
from clawbench.tasks import get_assets_dir, load_all_tasks
from clawbench.trajectory import annotate_transcript_tool_calls, evaluate_trajectory
logger = logging.getLogger(__name__)
console = Console()
KNOWN_ADAPTERS = ("openclaw", "hermes", "codex", "claude-code")
EXECUTABLE_ADAPTERS = {"openclaw"}
EXECUTABLE_ADAPTERS = {"openclaw", "hermes"}
def _command_version(command: list[str]) -> str:
    """Return the first line printed by `command`, or "" if unavailable.

    The command is run with stderr merged into stdout and a 5 second
    timeout. Any failure to run it (missing binary, timeout, ...) and any
    empty or whitespace-only output yield an empty string.
    """
    try:
        result = subprocess.run(
            command,
            check=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            timeout=5,
        )
    except Exception:
        return ""
    # Whitespace-only output strips down to "" whose splitlines() is empty,
    # so guard the index instead of assuming at least one line.
    lines = (result.stdout or "").strip().splitlines()
    return lines[0] if lines else ""
class _NullCtx:
@ -83,6 +117,7 @@ class BenchmarkHarness:
concurrency: int = 1,
browser_concurrency: int = 1,
adapter: str = "openclaw",
tool_profile_name: str | None = None,
) -> None:
self.gateway_config = gateway_config
self.model = model
@ -107,6 +142,7 @@ class BenchmarkHarness:
self.concurrency = max(1, int(concurrency))
self.browser_concurrency = max(1, int(browser_concurrency))
self.adapter = adapter
self.tool_profile_name = tool_profile_name
self.repo_root = Path(__file__).parent.parent
self.last_task_runs: dict[str, list[TaskRunResult]] = {}
@ -136,6 +172,8 @@ class BenchmarkHarness:
if not tasks:
raise ValueError("No tasks to run")
tasks = self._filter_tasks_for_adapter(tasks)
if self.randomize_order:
import random
@ -261,66 +299,168 @@ class BenchmarkHarness:
console.print(f" [red]! {failure}[/]")
async def _run_single(self, task: TaskDefinition, run_index: int) -> TaskRunResult:
# Per-turn timeout cap: prevents a single send_and_wait from burning the entire task
# timeout (often 300-600s). Default 180s is enough for any reasonable single-turn
# response and fails fast on stuck models. Override with env var if needed.
per_turn_cap = float(os.environ.get("CLAWBENCH_PER_TURN_TIMEOUT_SECONDS", "180"))
# Per-run hard budget: total wall time a single (task, run) is allowed to consume.
# Default 300s (5 min) bounds the worst case to 5min * 120 = 10h/model if fully
# serial, and <3h/model at lanes=4. Env override available for longer slower models.
per_run_budget = float(os.environ.get("CLAWBENCH_PER_RUN_BUDGET_SECONDS", "300"))
return await self._run_single_with_agent_adapter(task, run_index)
# Per-run result cache: allows a failed job to resume from previously completed
# (task, run) pairs on resubmit. Keyed by model + task + run_index so the same
# model's runs are reused, but different models stay isolated. The cache is
# written AFTER successful score_task_run and read at the start of this method.
# Set CLAWBENCH_RUN_CACHE_DIR="" to disable.
def _filter_tasks_for_adapter(self, tasks: list[TaskDefinition]) -> list[TaskDefinition]:
    """Keep only the tasks the selected adapter is able to run.

    Each task is converted to its canonical form and checked with the
    adapter class's `missing_capabilities_for`. Tasks with missing
    capabilities are dropped and, unless `self.quiet` is set, summarized on
    the console (first five listed). Raises ValueError when nothing is left.
    """
    adapter_cls = get_adapter(self.adapter)
    adapter_config = self._adapter_config()

    runnable: list[TaskDefinition] = []
    rejected: list[tuple[str, str]] = []
    for task in tasks:
        lacking = adapter_cls.missing_capabilities_for(
            from_task_definition(task), adapter_config
        )
        if not lacking:
            runnable.append(task)
        else:
            capability_names = sorted(cap.value for cap in lacking)
            rejected.append((task.id, ", ".join(capability_names)))

    if rejected and not self.quiet:
        console.print(
            f"[yellow]Adapter '{self.adapter}' skipped {len(rejected)} incompatible task(s).[/]"
        )
        for task_id, caps in rejected[:5]:
            console.print(f" [yellow]- {task_id}: missing {caps}[/]")
        if len(rejected) > 5:
            console.print(f" [yellow]- ... {len(rejected) - 5} more[/]")

    if not runnable:
        raise ValueError(
            f"No selected tasks are compatible with adapter '{self.adapter}'. "
            "Try a files/execution task such as t1-bugfix-discount, or use adapter 'openclaw'."
        )
    return runnable
def _adapter_config(self) -> object:
# Build the configuration object for the adapter named by `self.adapter`:
# an `OpenClawAdapterConfig` for "openclaw", a `HermesAdapterConfig` for
# "hermes"; any other name raises ValueError.
# NOTE(review): leading indentation is not visible in this view, so the
# nesting described in the comments below is inferred — confirm.
if self.adapter == "openclaw":
# Per-turn timeout is read from the environment, defaulting to "180".
per_turn_cap = float(os.environ.get("CLAWBENCH_PER_TURN_TIMEOUT_SECONDS", "180"))
return OpenClawAdapterConfig(
gateway=self.gateway_config,
prompt_variant=self.prompt_variant,
turn_timeout_seconds=per_turn_cap,
)
if self.adapter == "hermes":
# `or None` turns unset and empty environment values into None.
provider = os.environ.get("HERMES_PROVIDER") or None
base_url = os.environ.get("HERMES_BASE_URL") or None
api_mode = os.environ.get("HERMES_API_MODE") or None
# First non-empty key wins: HERMES_API_KEY, then OPENROUTER_API_KEY,
# then OPENAI_API_KEY.
api_key = (
os.environ.get("HERMES_API_KEY")
or os.environ.get("OPENROUTER_API_KEY")
or os.environ.get("OPENAI_API_KEY")
or None
)
# An explicit provider discards any base URL and API key gathered above
# (presumably the provider resolves its own endpoint/credentials — verify
# against HermesAdapterConfig).
if provider:
base_url = None
api_key = None
# No explicit provider and an "openai/..." model: pick a base URL from
# HERMES_BASE_URL, then OPENAI_BASE_URL, then the public OpenAI endpoint
# (the latter only when OPENAI_API_KEY is set).
elif provider is None and self.model.startswith("openai/"):
base_url = (
base_url
or os.environ.get("OPENAI_BASE_URL")
or ("https://api.openai.com/v1" if os.environ.get("OPENAI_API_KEY") else None)
)
host = ""
try:
host = urlparse(base_url or "").hostname or ""
except Exception:
host = ""
# When talking to api.openai.com, OPENAI_API_KEY takes precedence over
# HERMES_API_KEY.
if host == "api.openai.com":
api_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("HERMES_API_KEY") or None
# Models whose name after "openai/" starts with "gpt-5" default to the
# "codex_responses" API mode when HERMES_API_MODE was not set.
# NOTE(review): cannot tell from this view whether this check sits inside
# the `host == "api.openai.com"` branch or beside it — confirm.
if api_mode is None and self.model.split("/", 1)[1].lower().startswith("gpt-5"):
api_mode = "codex_responses"
# No explicit provider and an "anthropic/..." model: select the
# "anthropic" provider and drop base URL and key.
elif provider is None and self.model.startswith("anthropic/"):
provider = "anthropic"
base_url = None
api_key = None
# Otherwise, if OPENAI_API_KEY is the only key present and no base URL
# was given, default to the public OpenAI endpoint.
elif (
base_url is None
and os.environ.get("OPENAI_API_KEY")
and not os.environ.get("HERMES_API_KEY")
and not os.environ.get("OPENROUTER_API_KEY")
):
base_url = "https://api.openai.com/v1"
# Toolset lists are comma-separated; blank items are dropped.
enabled_toolsets = [
item.strip()
for item in os.environ.get("HERMES_TOOLSETS", "hermes-api-server").split(",")
if item.strip()
]
# An empty disabled list is passed as None rather than [].
disabled_toolsets = [
item.strip()
for item in os.environ.get("HERMES_DISABLED_TOOLSETS", "").split(",")
if item.strip()
] or None
return HermesAdapterConfig(
model=self.model,
env_type=os.environ.get("HERMES_ENV_TYPE", "local"),
max_iterations=int(os.environ.get("HERMES_MAX_ITERATIONS", "15")),
timeout_seconds=int(os.environ.get("HERMES_STEP_TIMEOUT_SECONDS", "60")),
base_url=base_url,
api_key=api_key,
provider=provider,
api_mode=api_mode,
prompt_variant=self.prompt_variant,
driver_mode=os.environ.get("HERMES_DRIVER", "ai_agent"),
enabled_toolsets=enabled_toolsets,
disabled_toolsets=disabled_toolsets,
hermes_home=os.environ.get("HERMES_HOME_BASE") or None,
)
raise ValueError(f"No config builder for adapter '{self.adapter}'")
async def _run_single_with_agent_adapter(
self,
task: TaskDefinition,
run_index: int,
) -> TaskRunResult:
per_run_budget = float(os.environ.get("CLAWBENCH_PER_RUN_BUDGET_SECONDS", "300"))
cache_dir_env = os.environ.get("CLAWBENCH_RUN_CACHE_DIR", "/data/run_cache")
cache_path: Path | None = None
if cache_dir_env:
safe_model = self.model.replace("/", "_").replace(":", "_")
cache_path = Path(cache_dir_env) / safe_model / task.id / f"run{run_index}.json"
cache_path = (
Path(cache_dir_env)
/ f"{self.adapter}-{safe_model}"
/ task.id
/ f"run{run_index}.json"
)
if cache_path.exists():
try:
cached = TaskRunResult.model_validate_json(cache_path.read_text(encoding="utf-8"))
cached.run_index = run_index
logger.info(
"TIMING %s/run%s total=cached score=%.2f C=%.2f T=%.2f B=%.2f J=%.2f (resumed from %s)",
task.id, run_index,
cached.run_score,
cached.completion_result.score,
cached.trajectory_result.score,
cached.behavior_result.score,
cached.judge_result.score if cached.judge_result.enabled else 0.0,
cache_path,
cached = TaskRunResult.model_validate_json(
cache_path.read_text(encoding="utf-8")
)
cached.run_index = run_index
return cached
except Exception as exc:
logger.warning("Cache load failed for %s/run%s: %s (will re-run)", task.id, run_index, exc)
logger.warning(
"Adapter cache load failed for %s/run%s: %s (will re-run)",
task.id,
run_index,
exc,
)
workspace = self._create_run_workspace(task, run_index)
services = []
session_keys: list[str] = []
agent_id: str | None = None
# Per-phase timings so we can see where slow runs are spending their wall time.
timings: dict[str, float] = {}
def _tick(label: str, since: float) -> float:
now = time.monotonic()
timings[label] = round(now - since, 2)
return now
t_run_start = time.monotonic()
try:
t_phase = t_run_start
self._setup_workspace(task, workspace)
t_phase = _tick("workspace_setup", t_phase)
transcript = Transcript()
canonical = from_task_definition(task)
ctx = AdapterContext(
task=canonical,
workspace=workspace,
runtime_values={},
run_index=run_index,
model=self.model,
transcript=transcript,
)
try:
self._setup_workspace(task, workspace)
runtime_values = build_runtime_values(
workspace=workspace,
repo_root=self.repo_root,
extra={"task_id": task.id, "model": self.model, "prompt_variant": self.prompt_variant},
extra={
"task_id": task.id,
"model": self.model,
"prompt_variant": self.prompt_variant,
},
)
services, runtime_values = await start_background_services(
task.setup.background_services,
@ -328,118 +468,65 @@ class BenchmarkHarness:
repo_root=self.repo_root,
runtime_values=runtime_values,
)
t_phase = _tick("bg_services_start", t_phase)
ctx.runtime_values = runtime_values
transcript = Transcript()
adapter_cls = get_adapter(self.adapter)
adapter = adapter_cls(self._adapter_config()) # type: ignore[arg-type]
phase_errors: list[str] = []
start_ms = _now_ms()
async with adapter:
try:
await adapter.setup(ctx)
pre_run_failures = ctx.adapter_state.get("pre_run_failures") or []
if pre_run_failures:
raise RuntimeError("; ".join(str(item) for item in pre_run_failures))
async with GatewayClient(self.gateway_config) as client:
t_phase = _tick("gateway_connect", t_phase)
agent_id = await self._create_run_agent(
client,
task=task,
workspace=workspace,
run_index=run_index,
)
t_phase = _tick("agent_create", t_phase)
for phase_index, phase in enumerate(task.normalized_phases()):
session_key = await client.create_session(
model=self.model,
agent_id=agent_id,
label=unique_session_label(
f"clawbench-{task.id}-run{run_index}-phase{phase_index}"
),
)
session_keys.append(session_key)
await client.subscribe(session_key)
if task.family.value == "browser":
await self._assert_browser_support(client, session_key)
t_phase = _tick(f"phase{phase_index}_session_setup", t_phase)
simulator = UserSimulator(
phase.user,
runtime_values,
prompt_variant=self.prompt_variant,
)
turn_index = 0
phase_raw_timeout = float(phase.timeout_seconds or task.timeout_seconds)
turn_timeout = min(phase_raw_timeout, per_turn_cap)
while not simulator.is_done:
# Enforce per-run budget: if we've already burned our whole budget
# on previous turns of this run, bail out and score whatever we have.
for phase in canonical.phases:
elapsed = time.monotonic() - t_run_start
if elapsed >= per_run_budget:
logger.warning(
"Run %s/%s hit per-run budget (%.0fs); stopping user simulator",
task.id,
run_index,
per_run_budget,
remaining_budget = per_run_budget - elapsed
if remaining_budget <= 0:
phase_errors.append(
f"Adapter run hit per-run budget ({per_run_budget:.0f}s)"
)
break
remaining_budget = per_run_budget - elapsed
effective_timeout = min(turn_timeout, remaining_budget)
user_message = await simulator.next_message(transcript)
if user_message is None:
try:
phase_result = await asyncio.wait_for(
adapter.run_phase(phase, ctx),
timeout=remaining_budget,
)
except asyncio.TimeoutError:
phase_errors.append(
f"Adapter run hit per-run budget ({per_run_budget:.0f}s)"
)
break
if phase_result.error:
phase_errors.append(phase_result.error)
break
t_turn_start = time.monotonic()
phase_transcript = await client.send_and_wait(
session_key,
user_message,
timeout=effective_timeout,
)
timings[f"phase{phase_index}_turn{turn_index}"] = round(
time.monotonic() - t_turn_start, 2
)
transcript.messages.extend(phase_transcript.messages)
turn_index += 1
t_phase = _tick(f"phase{phase_index}_total", t_phase)
duration_ms = _now_ms() - start_ms
last_session_key = session_keys[-1] if session_keys else ""
t_score_start = time.monotonic()
result = await score_task_run(
task=task,
transcript=transcript,
workspace=workspace,
client=client,
session_key=last_session_key,
agent_id=agent_id,
duration_ms=duration_ms,
runtime_values=runtime_values,
judge_model=self.judge_model,
)
timings["score"] = round(time.monotonic() - t_score_start, 2)
timings["total"] = round(time.monotonic() - t_run_start, 2)
result.run_index = run_index
duration_ms = _now_ms() - start_ms
result = await self._score_adapter_task_run(
task=task,
canonical_task=canonical,
ctx=ctx,
duration_ms=duration_ms,
adapter=adapter,
error="; ".join(phase_errors) if phase_errors else None,
)
finally:
await adapter.teardown(ctx)
result.run_index = run_index
# Write per-run cache so a future resume of this job can skip this run.
if cache_path is not None:
try:
cache_path.parent.mkdir(parents=True, exist_ok=True)
tmp_path = cache_path.with_suffix(".json.tmp")
tmp_path.write_text(
result.model_dump_json(indent=2), encoding="utf-8"
)
tmp_path.replace(cache_path)
except Exception as exc:
logger.warning("Cache write failed for %s/run%s: %s", task.id, run_index, exc)
logger.info(
"TIMING %s/run%s total=%.1fs score=%.2f C=%.2f T=%.2f B=%.2f J=%.2f %s",
task.id,
run_index,
timings["total"],
result.run_score,
result.completion_result.score,
result.trajectory_result.score,
result.behavior_result.score,
result.judge_result.score if (result.judge_result.enabled and not result.judge_result.error) else 0.0,
" ".join(f"{k}={v}s" for k, v in timings.items() if k != "total"),
)
return result
if cache_path is not None:
try:
cache_path.parent.mkdir(parents=True, exist_ok=True)
tmp_path = cache_path.with_suffix(".json.tmp")
tmp_path.write_text(result.model_dump_json(indent=2), encoding="utf-8")
tmp_path.replace(cache_path)
except Exception as exc:
logger.warning("Adapter cache write failed for %s/run%s: %s", task.id, run_index, exc)
return result
except Exception as exc:
logger.exception("Run %s/%s failed", task.id, run_index)
logger.exception("Adapter run %s/%s failed", task.id, run_index)
return TaskRunResult(
task_id=task.id,
tier=task.tier.value,
@ -461,30 +548,171 @@ class BenchmarkHarness:
privacy_tier=task.privacy_tier,
contamination_risk=task.contamination_risk,
freshness_epoch=task.freshness_epoch,
category=task.category,
domain=task.domain,
functionality=list(task.functionality),
trace_distribution=list(task.trace_distribution),
tool_surface=list(task.tool_surface),
risk_tags=list(task.risk_tags),
similarity_hash=task.similarity_hash,
official=task.official,
run_index=run_index,
run_score=0.0,
transcript=Transcript(),
duration_ms=0,
transcript=transcript,
duration_ms=round((time.monotonic() - t_run_start) * 1000),
delivery_outcome=DeliveryOutcome.FAIL,
failure_mode=classify_error_failure_mode(task, str(exc)),
error=str(exc),
)
finally:
await stop_background_services(services)
if session_keys or agent_id:
try:
async with GatewayClient(self.gateway_config) as cleanup_client:
for session_key in session_keys:
await cleanup_client.delete_session(session_key)
if agent_id:
await cleanup_client.delete_agent(agent_id, delete_files=False)
except Exception as exc:
logger.warning("Session cleanup failed: %s", exc)
if os.environ.get("CLAWBENCH_KEEP_WORKSPACES") != "1":
shutil.rmtree(workspace, ignore_errors=True)
async def _score_adapter_task_run(
    self,
    *,
    task: TaskDefinition,
    canonical_task,
    ctx: AdapterContext,
    duration_ms: int,
    adapter,
    error: str | None,
) -> TaskRunResult:
    """Score one adapter-driven run and package it as a ``TaskRunResult``.

    Three verifier families from ``canonical_task.verifier`` feed the
    completion score, in this order: file-state specs, adapter state
    queries, and execution checks. A state query whose adapter reports
    ``capability_missing`` is noted as ``SKIP`` in the failure list but is
    not counted as an assertion. With no counted assertions the completion
    score is 1.0.

    Trajectory and behavior are evaluated from ``ctx.transcript``. The
    semantic judge only runs when ``self.judge_model`` is set, and its score
    only reaches the combined run score when the judge is enabled and
    reported no error.

    The returned result always carries ``run_index=0``.
    """
    annotate_transcript_tool_calls(ctx.transcript)
    verifier = canonical_task.verifier

    checked = 0
    satisfied = 0
    problems: list[str] = []
    exec_reports = []

    # 1) Deterministic file-state assertions against the workspace.
    for file_spec in verifier.file_states:
        file_ok, why = verify_file_state(file_spec, ctx.workspace, ctx.runtime_values)
        checked += 1
        if file_ok:
            satisfied += 1
            continue
        problems.append(f"FILE {file_spec.path}: {why}")

    # 2) Adapter-answered state queries; unsupported ones are skipped, not failed.
    for query in verifier.state_queries:
        answer = await adapter.verify_state_query(query, ctx)
        if answer.capability_missing:
            problems.append(f"SKIP {query.kind}: {answer.detail}")
            continue
        checked += 1
        if answer.ok:
            satisfied += 1
            continue
        problems.append(f"{query.kind.upper()}: {answer.detail or query.description}")

    # 3) Execution checks; every report is kept, pass or fail.
    for exec_spec in verifier.execution_checks:
        report = await run_execution_check(
            exec_spec,
            workspace=ctx.workspace,
            runtime_values=ctx.runtime_values,
        )
        exec_reports.append(report)
        checked += 1
        if report.passed:
            satisfied += 1
            continue
        problems.append(f"EXEC {exec_spec.name}: {report.reason}")

    completion_result = CompletionResult(
        total_assertions=checked,
        passed_assertions=satisfied,
        failed_assertions=problems,
        execution_results=exec_reports,
        score=round(satisfied / checked if checked else 1.0, 4),
    )
    trajectory_result = evaluate_trajectory(ctx.transcript, verifier.trajectory)
    behavior_result = evaluate_behavior(verifier.behavior, ctx.transcript)

    if not self.judge_model:
        judge_result = JudgeResult()
    else:
        async with GatewayClient(self.gateway_config) as judge_client:
            judge_result = await judge_task_run(
                task=task,
                transcript=ctx.transcript,
                workspace=ctx.workspace,
                client=judge_client,
                judge_model=self.judge_model,
                completion_result=completion_result,
            )

    token_usage = ctx.transcript.total_usage
    efficiency_result = EfficiencyResult.from_usage(
        duration_ms=duration_ms,
        usage=token_usage,
    )

    # A disabled or errored judge contributes nothing to the combined score.
    judge_usable = judge_result.enabled and not judge_result.error
    run_score = combine_run_score(
        completion=completion_result.score,
        trajectory=trajectory_result.score,
        behavior=behavior_result.score,
        judge=judge_result.score if judge_usable else None,
        has_deterministic_verifier=completion_result.total_assertions > 0,
    )
    delivery_outcome = classify_delivery_outcome(
        task=task,
        completion_result=completion_result,
        run_score=run_score,
    )
    failure_mode = classify_failure_mode(
        task=task,
        transcript=ctx.transcript,
        completion_result=completion_result,
        trajectory_result=trajectory_result,
        behavior_result=behavior_result,
        error=error,
    )

    # Task identity and leaderboard-axis metadata copied straight from the task.
    task_metadata = {
        "task_id": task.id,
        "tier": task.tier.value,
        "family": task.family.value,
        "scenario": task.scenario.value if task.scenario else "",
        "subscenario": task.subscenario,
        "artifact_type": task.artifact_type.value if task.artifact_type else "",
        "prompt_variant": self.prompt_variant,
        "query_difficulty": task.query_difficulty.value if task.query_difficulty else "",
        "query_weight": task.query_weight,
        "pool": task.pool.value,
        "subsets": [subset.value for subset in task.subsets],
        "capabilities": [capability.value for capability in task.capabilities],
        "variant_group": task.variant_group,
        "variant_id": task.variant_id,
        "template_id": task.template_id,
        "release_id": task.release_id,
        "source_kind": task.source_kind,
        "privacy_tier": task.privacy_tier,
        "contamination_risk": task.contamination_risk,
        "freshness_epoch": task.freshness_epoch,
        "category": task.category,
        "domain": task.domain,
        "functionality": list(task.functionality),
        "trace_distribution": list(task.trace_distribution),
        "tool_surface": list(task.tool_surface),
        "risk_tags": list(task.risk_tags),
        "similarity_hash": task.similarity_hash,
        "official": task.official,
    }
    return TaskRunResult(
        **task_metadata,
        run_index=0,
        completion_result=completion_result,
        trajectory_result=trajectory_result,
        behavior_result=behavior_result,
        judge_result=judge_result,
        run_score=round(run_score, 4),
        transcript=ctx.transcript,
        duration_ms=duration_ms,
        token_usage=token_usage,
        efficiency_result=efficiency_result,
        delivery_outcome=delivery_outcome,
        failure_mode=failure_mode,
        error=error,
    )
async def _create_run_agent(
self,
client: GatewayClient,
@ -606,6 +834,12 @@ class BenchmarkHarness:
privacy_tier=task.privacy_tier,
contamination_risk=task.contamination_risk,
freshness_epoch=task.freshness_epoch,
category=task.category,
domain=task.domain,
functionality=list(task.functionality),
trace_distribution=list(task.trace_distribution),
tool_surface=list(task.tool_surface),
risk_tags=list(task.risk_tags),
similarity_hash=task.similarity_hash,
official=task.official,
runs=len(runs),
@ -712,6 +946,45 @@ class BenchmarkHarness:
)
)
category_results = _dimension_results(
task_stats,
dimension="category",
values_for=lambda stat: [stat.category] if stat.category else [],
)
domain_results = _dimension_results(
task_stats,
dimension="domain",
values_for=lambda stat: [stat.domain] if stat.domain else [],
)
functionality_results = _dimension_results(
task_stats,
dimension="functionality",
values_for=lambda stat: stat.functionality,
)
trace_distribution_results = _dimension_results(
task_stats,
dimension="trace_distribution",
values_for=lambda stat: stat.trace_distribution,
)
tool_surface_results = _dimension_results(
task_stats,
dimension="tool_surface",
values_for=lambda stat: stat.tool_surface,
)
risk_tag_results = _dimension_results(
task_stats,
dimension="risk_tag",
values_for=lambda stat: stat.risk_tags,
)
dimension_results = {
"category": category_results,
"domain": domain_results,
"functionality": functionality_results,
"trace_distribution": trace_distribution_results,
"tool_surface": tool_surface_results,
"risk_tag": risk_tag_results,
}
overall_ci = bootstrap_ci([stat.mean_task_score for stat in task_stats])
total_weight = sum(stat.query_weight for stat in task_stats)
overall_failure_mode_counts = _count_values(
@ -727,6 +1000,7 @@ class BenchmarkHarness:
for _ in range(count)
)
active_release = load_active_release()
ablation_profile = self._ablation_profile()
result = BenchmarkResult(
submission_id=str(uuid.uuid4()),
model=self.model,
@ -743,10 +1017,17 @@ class BenchmarkHarness:
"prompt_variant": self.prompt_variant,
"judge_model": self.judge_model,
"adapter": self.adapter,
"ablation_profile": ablation_profile.model_dump(),
"tool_profile": ablation_profile.tool_profile.model_dump(),
"harness": ablation_profile.harness.model_dump(),
"known_adapters": list(KNOWN_ADAPTERS),
"executable_adapters": sorted(EXECUTABLE_ADAPTERS),
"subsets": self.subsets,
"capabilities": self.capabilities,
"dimension_coverage": {
key: len(value)
for key, value in dimension_results.items()
},
"official_only": self.official_only,
**(environment_extra or {}),
},
@ -803,6 +1084,13 @@ class BenchmarkHarness:
overall_pass_hat_k=_mean([1.0 if stat.pass_hat_k else 0.0 for stat in task_stats]),
tier_results=tier_results,
scenario_results=scenario_results,
category_results=category_results,
domain_results=domain_results,
functionality_results=functionality_results,
trace_distribution_results=trace_distribution_results,
tool_surface_results=tool_surface_results,
risk_tag_results=risk_tag_results,
dimension_results=dimension_results,
task_results=task_stats,
environment_checksum=self._benchmark_checksum(tasks),
task_snapshot_fingerprint=compute_task_snapshot_fingerprint(tasks),
@ -823,6 +1111,48 @@ class BenchmarkHarness:
completion_passed = completion.score >= 0.9999
return completion_passed and result.run_score >= task.pass_threshold
def _ablation_profile(self):
    """Build the ablation profile recorded alongside this run's results.

    Collects the adapter config, the driver and toolset selection, and the
    provenance (source path, git SHA, version) of the harness checkout that
    the adapter runs against, then hands everything to
    ``build_ablation_profile``.

    Provenance lookup:
      * ``hermes``: ``HERMES_AGENT_REPO`` or ``HERMES_INSTALL_DIR``.
      * ``openclaw``: ``OPENCLAW_REPO``, else a sibling ``openclaw`` directory
        next to ``self.repo_root``; only used when the directory exists. When
        no version comes back from git, ``_command_version`` is asked for
        ``openclaw --version``.

    An empty environment variable is treated as unset. Without that, an empty
    ``OPENCLAW_REPO`` becomes ``Path("")``, i.e. the current directory, which
    always exists, and the unrelated SHA of the working directory would be
    published as the harness SHA.
    """
    config = self._adapter_config()
    driver = ""
    enabled_toolsets: list[str] = []
    disabled_toolsets: list[str] = []
    if isinstance(config, HermesAdapterConfig):
        driver = config.driver_mode
        enabled_toolsets = list(config.enabled_toolsets or [])
        disabled_toolsets = list(config.disabled_toolsets or [])
    elif isinstance(config, OpenClawAdapterConfig):
        driver = "gateway"

    source = ""
    sha = ""
    version = ""
    if self.adapter == "hermes":
        repo = os.environ.get("HERMES_AGENT_REPO") or os.environ.get("HERMES_INSTALL_DIR")
        if repo:
            source = str(Path(repo).expanduser())
            sha, version = git_head(Path(source))
    elif self.adapter == "openclaw":
        configured_repo = os.environ.get("OPENCLAW_REPO")
        if configured_repo:
            # Same "~" handling as the hermes branch above.
            candidate = Path(configured_repo).expanduser()
        else:
            candidate = Path(self.repo_root.parent / "openclaw")
        if candidate.exists():
            source = str(candidate)
            sha, version = git_head(candidate)
        if not version:
            version = _command_version(["openclaw", "--version"])

    return build_ablation_profile(
        model=self.model,
        adapter=self.adapter,
        config=config,  # type: ignore[arg-type]
        prompt_profile=self.prompt_variant,
        harness_version=version,
        harness_git_sha=sha,
        harness_source=source,
        driver=driver,
        tool_profile_name=self.tool_profile_name,
        enabled_toolsets=enabled_toolsets,
        disabled_toolsets=disabled_toolsets,
    )
def _print_report(self, result: BenchmarkResult) -> None:
console.print(f"\n[bold]{'=' * 60}[/]")
console.print(f"[bold]Results — {result.model}[/]")
@ -909,6 +1239,47 @@ def _mean(values: list[float]) -> float:
return sum(values) / len(values) if values else 0.0
def _dimension_results(
    task_stats: list[TaskStats],
    *,
    dimension: str,
    values_for: Callable[[TaskStats], list[str]],
) -> list[DimensionResult]:
    """Aggregate task stats into one ``DimensionResult`` per distinct axis value.

    ``values_for`` yields the labels a task carries on this axis. Labels are
    whitespace-stripped, blanks are dropped, and duplicates within one task
    count once. A task with several labels contributes to each of them.
    Results are ordered by label.

    ``weighted_score`` weights each task's mean score by ``query_weight`` and
    falls back to the plain mean when the weights sum to zero. ``mean_judge``
    only averages tasks that have at least one judged run.
    """
    buckets: dict[str, list[TaskStats]] = {}
    for task_stat in task_stats:
        labels = {raw.strip() for raw in values_for(task_stat)}
        labels.discard("")
        for label in labels:
            buckets.setdefault(label, []).append(task_stat)

    aggregated: list[DimensionResult] = []
    for label, members in sorted(buckets.items()):
        scores = [member.mean_task_score for member in members]
        weight_sum = sum(member.query_weight for member in members)
        if weight_sum:
            weighted = (
                sum(member.mean_task_score * member.query_weight for member in members)
                / weight_sum
            )
        else:
            weighted = _mean(scores)
        judged_scores = [
            member.mean_judge_score for member in members if member.judged_runs > 0
        ]
        aggregated.append(
            DimensionResult(
                dimension=dimension,
                value=label,
                mean_task_score=_mean(scores),
                weighted_score=weighted,
                mean_completion=_mean([member.mean_completion_score for member in members]),
                mean_trajectory=_mean([member.mean_trajectory_score for member in members]),
                mean_behavior=_mean([member.mean_behavior_score for member in members]),
                mean_judge=_mean(judged_scores),
                mean_reliability=_mean([member.reliability_score for member in members]),
                pass_hat_k_rate=_mean([1.0 if member.pass_hat_k else 0.0 for member in members]),
                task_count=len(members),
                total_weight=weight_sum,
                task_ids=[member.task_id for member in members],
            )
        )
    return aggregated
def _percentile(values: list[float], percentile: float) -> float:
if not values:
return 0.0

View File

@ -26,8 +26,14 @@ logger = logging.getLogger(__name__)
HF_TOKEN = os.environ.get("HF_TOKEN", "")
# Local fallback when HF is unavailable. Containerized sweeps run several
# independent workers against the same /data mount, so callers may isolate this
# by setting CLAWBENCH_LOCAL_QUEUE_DIR. An empty value is treated as unset;
# otherwise Path("") would silently place the queue in the current directory.
LOCAL_QUEUE_DIR = Path(
    os.environ.get("CLAWBENCH_LOCAL_QUEUE_DIR")
    or ("/data/queue" if Path("/data").exists() else "data/queue")
)
class JobStatus(str, Enum):
@ -49,6 +55,7 @@ class SubmissionRequest(BaseModel):
max_parallel_lanes: int = Field(default=1, ge=1, le=8)
tier: str | None = None # Filter to a specific tier
scenario: str | None = None
task_ids: list[str] = Field(default_factory=list)
prompt_variant: str = "clear"
submitter: str = "" # HF username
notes: str = ""

View File

@ -63,13 +63,21 @@ def get_hidden_release_dir(release_id: str, *, private_tasks_root: Path | None =
def compute_task_snapshot_fingerprint(tasks: list[TaskDefinition]) -> str:
    """Return a SHA-256 hex digest over the full content of ``tasks``.

    Each task is dumped with ``model_dump(mode="json")`` and the dumps are
    serialized as compact JSON with sorted keys, so any change to a task
    definition changes the fingerprint. Tasks are ordered by
    ``(id, pool, variant_group, variant_id, release_id)`` first, which makes
    the digest independent of the order in which the caller loaded them.
    """
    ordered_tasks = sorted(
        tasks,
        key=lambda task: (
            task.id,
            task.pool.value,
            task.variant_group,
            task.variant_id,
            task.release_id,
        ),
    )
    payload = [
        task.model_dump(mode="json", exclude_none=False)
        for task in ordered_tasks
    ]
    encoded = json.dumps(payload, sort_keys=True, separators=(",", ":"))
    return hashlib.sha256(encoded.encode("utf-8")).hexdigest()
def load_active_release(path: Path | None = None) -> ActiveReleaseManifest | None:

View File

@ -390,6 +390,12 @@ class TaskDefinition(BaseModel):
privacy_tier: str = ""
contamination_risk: str = ""
freshness_epoch: str = ""
category: str = ""
domain: str = ""
functionality: list[str] = Field(default_factory=list)
trace_distribution: list[str] = Field(default_factory=list)
tool_surface: list[str] = Field(default_factory=list)
risk_tags: list[str] = Field(default_factory=list)
first_used_at: str = ""
retire_after_runs: int = 0
similarity_hash: str = ""
@ -542,6 +548,12 @@ class TaskRunResult(BaseModel):
privacy_tier: str = ""
contamination_risk: str = ""
freshness_epoch: str = ""
category: str = ""
domain: str = ""
functionality: list[str] = Field(default_factory=list)
trace_distribution: list[str] = Field(default_factory=list)
tool_surface: list[str] = Field(default_factory=list)
risk_tags: list[str] = Field(default_factory=list)
similarity_hash: str = ""
official: bool = False
run_index: int
@ -627,6 +639,12 @@ class TaskStats(BaseModel):
privacy_tier: str = ""
contamination_risk: str = ""
freshness_epoch: str = ""
category: str = ""
domain: str = ""
functionality: list[str] = Field(default_factory=list)
trace_distribution: list[str] = Field(default_factory=list)
tool_surface: list[str] = Field(default_factory=list)
risk_tags: list[str] = Field(default_factory=list)
similarity_hash: str = ""
official: bool = False
runs: int
@ -740,6 +758,22 @@ class ScenarioResult(BaseModel):
task_stats: list[TaskStats] = Field(default_factory=list)
# Aggregate scores for one value of one flat leaderboard axis (for example
# dimension="domain", value=<some domain label>).
class DimensionResult(BaseModel):
# Axis name and the specific label on that axis this row summarizes.
dimension: str
value: str
# Unweighted mean of the per-task mean scores in this group.
mean_task_score: float
# Mean of per-task scores weighted by each task's query weight.
weighted_score: float
# Unweighted means of the per-task component scores.
mean_completion: float
mean_trajectory: float
mean_behavior: float
# Judge mean; defaults to 0.0 when nothing in the group was judged.
mean_judge: float = 0.0
mean_reliability: float
# Fraction of tasks in the group whose pass^k flag is set.
pass_hat_k_rate: float
# Size, summed query weight, and membership of the group.
task_count: int = 0
total_weight: float = 0.0
task_ids: list[str] = Field(default_factory=list)
class BenchmarkResult(BaseModel):
submission_id: str
model: str
@ -788,6 +822,13 @@ class BenchmarkResult(BaseModel):
tier_results: list[TierResult] = Field(default_factory=list)
scenario_results: list[ScenarioResult] = Field(default_factory=list)
category_results: list[DimensionResult] = Field(default_factory=list)
domain_results: list[DimensionResult] = Field(default_factory=list)
functionality_results: list[DimensionResult] = Field(default_factory=list)
trace_distribution_results: list[DimensionResult] = Field(default_factory=list)
tool_surface_results: list[DimensionResult] = Field(default_factory=list)
risk_tag_results: list[DimensionResult] = Field(default_factory=list)
dimension_results: dict[str, list[DimensionResult]] = Field(default_factory=dict)
task_results: list[TaskStats] = Field(default_factory=list)
certified: bool = False

View File

@ -163,6 +163,12 @@ async def score_task_run(
privacy_tier=task.privacy_tier,
contamination_risk=task.contamination_risk,
freshness_epoch=task.freshness_epoch,
category=task.category,
domain=task.domain,
functionality=list(task.functionality),
trace_distribution=list(task.trace_distribution),
tool_surface=list(task.tool_surface),
risk_tags=list(task.risk_tags),
similarity_hash=task.similarity_hash,
official=task.official,
run_index=0,

View File

@ -40,12 +40,20 @@ def build_runtime_values(
repo_root: Path,
extra: dict[str, Any] | None = None,
) -> dict[str, Any]:
openclaw_repo = os.environ.get("OPENCLAW_REPO")
openclaw_node_path = os.environ.get("OPENCLAW_NODE_PATH")
if not openclaw_node_path and openclaw_repo:
openclaw_node_path = str(Path(openclaw_repo) / "node_modules")
benchmark_node_parts = [str(repo_root / "node_modules")]
global_node_path = os.environ.get("NODE_PATH")
if global_node_path:
benchmark_node_parts.append(global_node_path)
values = {
"workspace": str(workspace),
"workspace_name": workspace.name,
"repo_root": str(repo_root),
"benchmark_node_path": str(repo_root / "node_modules"),
"openclaw_node_path": "/openclaw/node_modules",
"benchmark_node_path": ":".join(benchmark_node_parts),
"openclaw_node_path": openclaw_node_path or "/openclaw/node_modules",
"python_exe": sys.executable,
}
if extra:

View File

@ -1,18 +1,30 @@
"""Upload benchmark results to a Hugging Face Dataset.
Each submission is written as its own parquet shard. This avoids the
read-modify-write race caused by rewriting the single `submissions`
split file for every completed job.
IMPORTANT: why this file calls `load_dataset` before `push_to_hub`:
`datasets.Dataset.push_to_hub(repo, split="submissions")` writes a single
parquet shard to `data/submissions-00000-of-00001.parquet`, REPLACING
whatever was there. If you push N submissions in sequence without
reading first, only the Nth row survives; the previous N-1 are lost.
`upload_result()` therefore:
1. Loads the existing `submissions` split if it exists
2. Appends the new row
3. Deduplicates by `submission_id` (so a retried upload of the same
run doesn't create two rows)
4. Pushes the combined dataset as a fresh parquet shard
At ClawBench's current submission rate (1-2 concurrent jobs) the read-
then-write race window is negligible. If cross-worker concurrency ever
becomes material we should move to an actually append-only format
(e.g. write per-submission parquet shards under `data/submission-<id>-
of-NNNNN.parquet` instead of overwriting a single shard).
"""
from __future__ import annotations
import json
import logging
import os
import re
import tempfile
from pathlib import Path
from clawbench.hub import ensure_dataset_repo, resolve_dataset_repo
from clawbench.schemas import BenchmarkResult
@ -67,15 +79,15 @@ async def upload_result(
"official_hidden_score": result.official_hidden_score,
"clear_prompt_score": result.clear_prompt_score,
"ambiguous_prompt_score": result.ambiguous_prompt_score,
"overall_delivery_outcome_counts": _json_column(result.overall_delivery_outcome_counts),
"overall_failure_mode_counts": _json_column(result.overall_failure_mode_counts),
"overall_delivery_outcome_counts": result.overall_delivery_outcome_counts,
"overall_failure_mode_counts": result.overall_failure_mode_counts,
"overall_pass_hat_k": result.overall_pass_hat_k,
"overall_ci_lower": result.overall_ci_lower,
"overall_ci_upper": result.overall_ci_upper,
"certified": result.certified,
"environment_checksum": result.environment_checksum,
"environment": _json_column(result.environment),
"tier_scores": _json_column({
"environment": str(result.environment),
"tier_scores": {
tier_result.tier: {
"mean_task_score": tier_result.mean_task_score,
"mean_completion": tier_result.mean_completion,
@ -87,8 +99,8 @@ async def upload_result(
"ci_upper": tier_result.ci_upper,
}
for tier_result in result.tier_results
}),
"scenario_scores": _json_column({
},
"scenario_scores": {
scenario_result.scenario: {
"mean_task_score": scenario_result.mean_task_score,
"weighted_score": scenario_result.weighted_score,
@ -101,8 +113,27 @@ async def upload_result(
"total_weight": scenario_result.total_weight,
}
for scenario_result in result.scenario_results
}),
"task_results": _json_column([
},
"dimension_scores": {
dimension: {
item.value: {
"mean_task_score": item.mean_task_score,
"weighted_score": item.weighted_score,
"mean_completion": item.mean_completion,
"mean_trajectory": item.mean_trajectory,
"mean_behavior": item.mean_behavior,
"mean_judge": item.mean_judge,
"mean_reliability": item.mean_reliability,
"pass_hat_k_rate": item.pass_hat_k_rate,
"task_count": item.task_count,
"total_weight": item.total_weight,
"task_ids": item.task_ids,
}
for item in dimension_results
}
for dimension, dimension_results in result.dimension_results.items()
},
"task_results": [
{
"task_id": task.task_id,
"tier": task.tier,
@ -116,6 +147,12 @@ async def upload_result(
"pool": task.pool,
"subsets": task.subsets,
"capabilities": task.capabilities,
"category": task.category,
"domain": task.domain,
"functionality": task.functionality,
"trace_distribution": task.trace_distribution,
"tool_surface": task.tool_surface,
"risk_tags": task.risk_tags,
"mean_task_score": task.mean_task_score,
"mean_run_score": task.mean_run_score,
"mean_completion_score": task.mean_completion_score,
@ -143,36 +180,50 @@ async def upload_result(
"runs": task.runs,
}
for task in result.task_results
]),
],
}
api = HfApi(token=hf_token)
ensure_dataset_repo(api, resolved_repo)
ds = Dataset.from_list([row])
shard_name = _submission_shard_name(result.submission_id)
with tempfile.TemporaryDirectory(prefix="clawbench-upload-") as tmp_dir:
local_path = Path(tmp_dir) / shard_name
ds.to_parquet(str(local_path))
api.upload_file(
path_or_fileobj=str(local_path),
path_in_repo=f"data/submissions/{shard_name}",
repo_id=resolved_repo,
repo_type="dataset",
# Read-then-append: load the existing submissions split, add the
# new row, deduplicate by submission_id, push the combined dataset
# so we never clobber prior rows.
combined_rows: list[dict] = []
try:
from datasets import load_dataset
existing = load_dataset(
resolved_repo,
split="submissions",
token=hf_token,
)
combined_rows = [dict(r) for r in existing]
logger.info(
"Read %d existing submission row(s) from %s",
len(combined_rows),
resolved_repo,
)
except Exception as exc:
logger.info(
"No existing submissions split to append to (%s); starting fresh",
exc,
)
new_submission_id = row.get("submission_id")
if new_submission_id:
combined_rows = [
r for r in combined_rows
if r.get("submission_id") != new_submission_id
]
combined_rows.append(row)
ds = Dataset.from_list(combined_rows)
ds.push_to_hub(resolved_repo, split="submissions", token=hf_token)
url = f"https://huggingface.co/datasets/{resolved_repo}"
logger.info(
"Result uploaded to %s as append-only shard %s",
"Results uploaded to %s (%d total submission rows)",
url,
shard_name,
len(combined_rows),
)
return url
def _submission_shard_name(submission_id: str) -> str:
safe_id = re.sub(r"[^A-Za-z0-9_.-]+", "-", submission_id.strip()).strip(".-")
return f"{safe_id or 'submission'}.parquet"
def _json_column(value: object) -> str:
return json.dumps(value, default=str, sort_keys=True, separators=(",", ":"))

View File

@ -34,6 +34,7 @@ STALE_EVALUATION_SECONDS = max(
JOB_HEARTBEAT_INTERVAL_SECONDS * 4,
int(os.environ.get("CLAWBENCH_STALE_EVALUATION_SECONDS", "1800")),
)
OPENCLAW_EVAL_EXEC_HOSTS = {"auto", "gateway", "sandbox", "node"}
@dataclass
@ -46,6 +47,12 @@ class ParallelLane:
state_dir: Path | None = None
log_path: Path | None = None
@property
def home_dir(self) -> Path | None:
    """The ``home`` directory beside the lane's state dir, or ``None`` if no state dir is set."""
    state_dir = self.state_dir
    return None if state_dir is None else state_dir.parent / "home"
@property
def ws_url(self) -> str:
    """WebSocket URL of this lane's gateway on localhost, built from ``self.port``."""
    return "ws://localhost:{}".format(self.port)
@ -300,6 +307,7 @@ class EvalWorker:
prompt_variant=job.request.prompt_variant,
prepare_run=prepare_run,
progress_callback=progress_callback,
tool_profile_name=os.environ.get("CLAWBENCH_TOOL_PROFILE_NAME", "") or None,
)
return await harness.run()
@ -369,6 +377,7 @@ class EvalWorker:
tier=job.request.tier,
scenario=job.request.scenario,
prompt_variant=job.request.prompt_variant,
tool_profile_name=os.environ.get("CLAWBENCH_TOOL_PROFILE_NAME", "") or None,
)
return summary_harness.compose_result_from_task_stats(
ordered_stats,
@ -382,7 +391,8 @@ class EvalWorker:
)
finally:
self._stop_parallel_gateways()
shutil.rmtree(job_root, ignore_errors=True)
if os.environ.get("CLAWBENCH_KEEP_PARALLEL_LANE_ROOT", "").strip() != "1":
shutil.rmtree(job_root, ignore_errors=True)
async def _run_parallel_lane(self, job, lane: ParallelLane, progress: JobProgressTracker):
gateway_cmd = self._find_gateway_cmd()
@ -430,6 +440,7 @@ class EvalWorker:
progress_callback=progress_callback,
print_report=False,
quiet=True,
tool_profile_name=os.environ.get("CLAWBENCH_TOOL_PROFILE_NAME", "") or None,
)
result = await harness.run()
await self._sync_job_progress(job.job_id, progress.clear_lane(lane.index))
@ -444,6 +455,9 @@ class EvalWorker:
return load_all_tasks(
tier=job.request.tier,
scenario=job.request.scenario,
task_ids=list(getattr(job.request, "task_ids", []) or None)
if getattr(job.request, "task_ids", None)
else None,
prompt_variant=job.request.prompt_variant,
)
@ -503,10 +517,36 @@ class EvalWorker:
def _materialize_lane_runtime(self, lane: ParallelLane, job_root: Path) -> None:
    """Assign a lane its on-disk layout and port, then seed its state dir.

    Everything lives under ``job_root/lane-<index>``: the state dir, the
    gateway log, and (via ``lane.home_dir``) a home directory whose
    ``.config`` subdirectory is created here. The port is offset from
    ``GATEWAY_PORT`` by ``GATEWAY_PORT_SPACING`` per lane index.
    """
    base = job_root.joinpath(f"lane-{lane.index}")
    lane.state_dir = base.joinpath("state")

    # home_dir is derived from state_dir, so it is only available from here on.
    if (home := lane.home_dir) is not None:
        home.joinpath(".config").mkdir(parents=True, exist_ok=True)

    lane.log_path = base.joinpath("gateway.log")
    lane.port = GATEWAY_PORT + lane.index * GATEWAY_PORT_SPACING
    self._seed_lane_state_dir(lane.state_dir)
def _run_lane_prepare_hook(self, lane: ParallelLane) -> None:
    """Run the optional ``CLAWBENCH_LANE_PREPARE_CMD`` hook for one lane.

    Does nothing when the variable is unset or blank. Otherwise the hook runs
    with the current environment plus lane-specific overrides: ``HOME``,
    ``OPENCLAW_HOME`` and ``XDG_CONFIG_HOME`` point into the lane's home
    directory, ``OPENCLAW_STATE_DIR`` and ``OPENCLAW_CONFIG_PATH`` point into
    its state dir, and ``CLAWBENCH_LANE_INDEX`` / ``CLAWBENCH_LANE_PORT``
    identify the lane.

    The value may be a bare executable or a full command line with arguments.
    A value that resolves to an executable as-is is run verbatim, so existing
    paths (including paths containing spaces) keep working; anything else is
    split with shell-style quoting rules. No shell is involved.

    Raises:
        RuntimeError: the lane has no state dir or home dir yet.
        subprocess.CalledProcessError: the hook exited with a non-zero status.
    """
    import shlex

    hook = os.environ.get("CLAWBENCH_LANE_PREPARE_CMD", "").strip()
    if not hook:
        return
    if lane.state_dir is None:
        raise RuntimeError(f"Lane {lane.index + 1} state dir missing before prepare hook")
    lane_home = lane.home_dir
    if lane_home is None:
        raise RuntimeError(f"Lane {lane.index + 1} home dir missing before prepare hook")
    (lane_home / ".config").mkdir(parents=True, exist_ok=True)
    hook_env = {
        **os.environ,
        "HOME": str(lane_home),
        "OPENCLAW_HOME": str(lane_home),
        "OPENCLAW_STATE_DIR": str(lane.state_dir),
        "OPENCLAW_CONFIG_PATH": str(lane.state_dir / "openclaw.json"),
        "XDG_CONFIG_HOME": str(lane_home / ".config"),
        "CLAWBENCH_LANE_INDEX": str(lane.index),
        "CLAWBENCH_LANE_PORT": str(lane.port),
    }
    # Passing the whole string as argv[0] made "script --flag" fail with
    # FileNotFoundError; only fall back to splitting when the raw value is
    # not itself a runnable executable.
    argv = [hook] if shutil.which(hook) else shlex.split(hook)
    logger.info("Running lane %d prepare hook", lane.index + 1)
    subprocess.run(argv, env=hook_env, check=True)
def _seed_lane_state_dir(self, target_state_dir: Path) -> None:
source_state_dir = Path(os.environ.get("OPENCLAW_STATE_DIR", os.path.expanduser("~/.openclaw")))
shutil.rmtree(target_state_dir, ignore_errors=True)
@ -625,6 +665,10 @@ class EvalWorker:
_set_nested(data, "browser.headless", True)
_set_nested(data, "browser.noSandbox", True)
_set_nested(data, "agents.defaults.skipBootstrap", True)
_set_nested(data, "tools.exec.host", self._openclaw_eval_exec_host())
_set_nested(data, "tools.exec.security", "full")
_set_nested(data, "tools.exec.ask", "off")
_set_nested(data, "approvals.exec.enabled", False)
if self._active_model:
_set_nested(data, "agents.defaults.model.primary", self._active_model)
_set_nested(data, "agents.defaults.subagents.model.primary", self._active_model)
@ -632,6 +676,7 @@ class EvalWorker:
tmp_path = cfg_path.with_suffix(".json.tmp")
tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
tmp_path.replace(cfg_path)
self._write_eval_exec_approvals(lane_state_dir)
def _order_task_stats(self, tasks: list[TaskDefinition], combined_stats: list) -> list:
stats_by_id = {}
@ -724,6 +769,7 @@ class EvalWorker:
"token",
"--token",
gateway_token,
"--compact",
],
stdout=open("/tmp/gateway.log", "a", encoding="utf-8"),
stderr=subprocess.STDOUT,
@ -760,6 +806,12 @@ class EvalWorker:
f"Gateway /health did not respond within {health_deadline_sec}s. Log:\n{self._read_gateway_log()}"
)
await self._wait_for_gateway_ready_marker(
process=self._gateway_process,
log_reader=lambda: self._read_gateway_log(limit=20_000),
description="Gateway",
)
# Phase B: control-plane probe with retries (see the parallel
# variant in _ensure_parallel_gateway for the detailed rationale).
gateway_config = GatewayConfig(url=GATEWAY_WS_URL, token=GATEWAY_TOKEN)
@ -809,21 +861,30 @@ class EvalWorker:
# Re-inject the host config's env + plugins before every restart.
if lane.state_dir is not None:
self._reinject_host_env_to_lane(lane.state_dir)
self._run_lane_prepare_hook(lane)
if lane.state_dir is None or lane.log_path is None:
raise RuntimeError(f"Lane {lane.index + 1} runtime was not materialized before gateway startup")
lane_home = lane.home_dir
if lane_home is None:
raise RuntimeError(f"Lane {lane.index + 1} home was not materialized before gateway startup")
(lane_home / ".config").mkdir(parents=True, exist_ok=True)
logger.info("Starting lane %d gateway on port %d", lane.index + 1, lane.port)
gateway_token = os.environ.get("OPENCLAW_GATEWAY_TOKEN", "clawbench-internal-token")
gateway_env = {
**os.environ,
"OPENCLAW_HOME": os.environ.get("OPENCLAW_HOME", os.path.expanduser("~")),
"HOME": str(lane_home),
"OPENCLAW_HOME": str(lane_home),
"OPENCLAW_STATE_DIR": str(lane.state_dir),
"OPENCLAW_CONFIG_PATH": str(lane.state_dir / "openclaw.json"),
"XDG_CONFIG_HOME": str(lane_home / ".config"),
"OPENCLAW_SKIP_GMAIL_WATCHER": "1",
"OPENCLAW_SKIP_CANVAS_HOST": "1",
"OPENCLAW_NO_RESPAWN": "1",
}
self._configure_browser_runtime(gateway_cmd, gateway_env)
lane.log_path.parent.mkdir(parents=True, exist_ok=True)
lane.log_path.write_text("", encoding="utf-8")
log_handle = lane.log_path.open("a", encoding="utf-8")
try:
process = subprocess.Popen(
@ -841,6 +902,7 @@ class EvalWorker:
"token",
"--token",
gateway_token,
"--compact",
],
stdout=log_handle,
stderr=subprocess.STDOUT,
@ -883,6 +945,12 @@ class EvalWorker:
f"Log:\n{self._read_parallel_gateway_log(lane)}"
)
await self._wait_for_gateway_ready_marker(
process=process,
log_reader=lambda: self._read_parallel_gateway_log(lane, limit=20_000),
description=f"Lane {lane.index + 1} gateway",
)
# Phase B: control-plane probe with explicit retries. A healthy
# /health response does not guarantee sessions.create works
# immediately — plugin registration races can leave the gateway
@ -994,6 +1062,10 @@ class EvalWorker:
("agents.defaults.skipBootstrap", True),
("browser.headless", True),
("browser.noSandbox", True),
("tools.exec.host", self._openclaw_eval_exec_host()),
("tools.exec.security", "full"),
("tools.exec.ask", "off"),
("approvals.exec.enabled", False),
]
if self._active_model:
config_pairs.extend(
@ -1004,9 +1076,50 @@ class EvalWorker:
)
try:
self._patch_openclaw_config(config_pairs)
state_dir = Path(
gateway_env.get("OPENCLAW_STATE_DIR")
or os.environ.get("OPENCLAW_STATE_DIR")
or os.path.expanduser("~/.openclaw")
)
self._write_eval_exec_approvals(state_dir)
except Exception as exc:
logger.warning("Direct openclaw.json patch failed: %s", exc)
@staticmethod
def _openclaw_eval_exec_host() -> str:
    """Return the exec host to configure for evaluation runs.

    Reads ``OPENCLAW_EXEC_HOST`` (surrounding whitespace stripped, lowercased)
    and defaults to ``"gateway"``. A value that is not a member of
    ``OPENCLAW_EVAL_EXEC_HOSTS`` is logged as a warning and replaced by
    ``"gateway"``.
    """
    requested = os.environ.get("OPENCLAW_EXEC_HOST", "gateway").strip().lower()
    if requested not in OPENCLAW_EVAL_EXEC_HOSTS:
        logger.warning("Invalid OPENCLAW_EXEC_HOST=%r; using gateway", requested)
        return "gateway"
    return requested
@staticmethod
def _write_eval_exec_approvals(state_dir: Path) -> None:
    """Write ``exec-approvals.json`` into ``state_dir`` with approvals disabled.

    The document sets ``security: full`` / ``ask: off`` / ``askFallback: full``
    both as the defaults and for the ``"*"`` agent entry, and points the socket
    at ``exec-approvals.sock`` next to the JSON file. ``state_dir`` is created
    if needed, and the file is written to a ``.json.tmp`` sibling first and
    then moved over the target.
    """
    state_dir.mkdir(parents=True, exist_ok=True)
    target = state_dir / "exec-approvals.json"
    # One permissive policy, copied so the two sections stay independent dicts.
    permissive = {"security": "full", "ask": "off", "askFallback": "full"}
    document = {
        "version": 1,
        "socket": {
            "path": str(target.with_suffix(".sock")),
            "token": "clawbench-eval-token",
        },
        "defaults": dict(permissive),
        "agents": {"*": dict(permissive)},
    }
    staging = target.with_suffix(".json.tmp")
    serialized = json.dumps(document, indent=2)
    staging.write_text(serialized, encoding="utf-8")
    staging.replace(target)
@staticmethod
def _patch_openclaw_config(pairs: list[tuple[str, object]]) -> None:
state_dir = Path(os.environ.get("OPENCLAW_STATE_DIR") or os.path.expanduser("~/.openclaw"))
@ -1051,13 +1164,15 @@ class EvalWorker:
# Use a generous dedicated config for the probe. A healthy gateway
# usually responds to sessions.create in under a second, but plugin
# initialization (especially OpenRouter model list fetch) can add
# 10-30s after /health reports 200. The 60s outer bound ensures we
# don't give up during a cold-start scenario.
# 10-30s after /health reports 200. On cold Docker lanes OpenClaw may
# also install provider runtime SDKs during the first sessions.create,
# so keep this bound configurable and separate from steady-state RPCs.
probe_timeout = float(os.environ.get("CLAWBENCH_GATEWAY_PROBE_TIMEOUT_SECONDS", "180"))
probe_config = GatewayConfig(
url=gateway_config.url,
token=gateway_config.token,
connect_timeout=gateway_config.connect_timeout,
request_timeout=30.0,
request_timeout=probe_timeout,
)
async def _probe() -> None:
@ -1068,25 +1183,67 @@ class EvalWorker:
await client.delete_session(session_key)
try:
await asyncio.wait_for(_probe(), timeout=60.0)
await asyncio.wait_for(_probe(), timeout=probe_timeout + 10.0)
except asyncio.TimeoutError as exc:
raise RuntimeError(
"Gateway control-plane probe timed out after 60s "
f"Gateway control-plane probe timed out after {probe_timeout:.0f}s "
"(sessions.create hung on a freshly-started gateway); "
"lane will be retried by the queue."
) from exc
def _read_gateway_log(self) -> str:
async def _wait_for_gateway_ready_marker(self, process: subprocess.Popen, log_reader, description: str) -> None:
# Poll the gateway log until it looks safe to run the control-plane probe.
#
# process: the gateway child process. If poll() reports that it has exited,
#   RuntimeError is raised with the exit code and the last 4,000 characters
#   of the log.
# log_reader: zero-argument callable returning the current log text; it is
#   called once per poll and again when building error/warning messages.
# description: label used in log lines and in the error message.
#
# A timeout never raises here. The method returns when the ready marker is
# seen, when the marker grace period after the sidecar-start line has passed,
# when no sidecar-start line has appeared by the 15th poll, or (with a
# warning) when the overall deadline is exhausted.
#
# OpenClaw 2026.4.26 can answer /health before channels and sidecars
# finish startup. Probing sessions.create during that window can hold the
# session write lock for minutes. Some lane gateway modes do not emit
# the final ready marker, so wait for it briefly after sidecar startup
# and then let the bounded control-plane probe decide.
# Both limits are parsed with int(); a non-integer environment value raises
# ValueError.
ready_deadline_sec = int(os.environ.get("CLAWBENCH_GATEWAY_READY_TIMEOUT_SECONDS", "420"))
marker_grace_sec = int(os.environ.get("CLAWBENCH_GATEWAY_READY_MARKER_GRACE_SECONDS", "90"))
saw_sidecar_start = False
sidecar_start_elapsed: int | None = None
# `elapsed` counts polls; each poll is followed by a 1 s sleep, so it only
# approximates seconds (time spent reading the log is not included).
for elapsed in range(ready_deadline_sec):
if process.poll() is not None:
raise RuntimeError(
f"{description} exited with code {process.returncode}. Log:\n{log_reader()[-4_000:]}"
)
log_text = log_reader()
if "[gateway] ready" in log_text:
logger.info("%s ready after %ss", description, elapsed)
return
# Remember the first poll at which sidecar startup was observed; the grace
# period below is measured from that poll.
if "[gateway] starting channels and sidecars" in log_text:
saw_sidecar_start = True
if sidecar_start_elapsed is None:
sidecar_start_elapsed = elapsed
if sidecar_start_elapsed is not None and elapsed - sidecar_start_elapsed >= marker_grace_sec:
logger.info(
"%s did not emit ready marker %ss after sidecar startup; probing control plane",
description,
marker_grace_sec,
)
return
# No sidecar-start line after 15 polls: stop waiting without logging and let
# the caller's probe decide.
if not saw_sidecar_start and elapsed >= 15:
return
await asyncio.sleep(1)
# Deadline exhausted: warn with the log tail, but still return normally.
logger.warning(
"%s did not log ready within %ss; probing control plane anyway. Log:\n%s",
description,
ready_deadline_sec,
log_reader()[-4_000:],
)
def _read_gateway_log(self, limit: int = 4_000) -> str:
    """Return at most the last ``limit`` characters of ``/tmp/gateway.log``.

    Args:
        limit: Maximum number of trailing characters to return. Values of
            zero or less yield an empty string.

    Returns:
        The log tail, or ``"(no gateway log)"`` when the file cannot be read.
    """
    try:
        text = Path("/tmp/gateway.log").read_text(encoding="utf-8", errors="replace")
    except Exception:
        # Deliberately best-effort: this is called while building error
        # messages and must never raise itself.
        return "(no gateway log)"
    # ``text[-0:]`` would be the whole string, so a non-positive limit needs
    # its own branch.
    return text[-limit:] if limit > 0 else ""
def _read_parallel_gateway_log(self, lane: ParallelLane, limit: int = 4_000) -> str:
    """Return at most the last ``limit`` characters of a lane's gateway log.

    Args:
        lane: Lane whose ``log_path`` is read.
        limit: Maximum number of trailing characters to return. Values of
            zero or less yield an empty string.

    Returns:
        The log tail, or ``"(no gateway log)"`` when the lane has no log path
        or the file cannot be read.
    """
    if lane.log_path is None:
        return "(no gateway log)"
    try:
        text = lane.log_path.read_text(encoding="utf-8", errors="replace")
    except Exception:
        # Deliberately best-effort: this is called while building error
        # messages and must never raise itself.
        return "(no gateway log)"
    # ``text[-0:]`` would be the whole string, so a non-positive limit needs
    # its own branch.
    return text[-limit:] if limit > 0 else ""

168
docs/DOMAIN_PROOF_PLAN.md Normal file
View File

@ -0,0 +1,168 @@
# ClawBench Domain Proof Plan
This plan turns ClawBench from a strong benchmark into an evidence package for
the central thesis:
> Model + general harness + plugins can cover the task domains served by most
> agent SaaS products.
## What Exists Now
- `tasks-public/`: small public Core v1 task set for reproducibility,
examples, and regression tracking.
- `tasks-domain/`: domain coverage scaffold for the larger proof corpus.
- Deterministic scoring: file, execution, memory, session, cron, gateway, DOM,
and structured output assertions.
- Process scoring: read-before-write, self-verification, recovery, safety,
tool-family fit.
- Reliability scoring: repeated runs, pass^k, worst-of-n, variance score,
bootstrap confidence intervals.
- Dynamics analysis: regime classification, survival, constraint index,
variance decomposition, SNR-weighted ranking.
- Configuration diagnostics: plugin profile fingerprints, utilization audit,
manifest-vs-reality gap, surprise detection, recommendations.
- Adapter groundwork: canonical task schema plus OpenClaw and Hermes adapter
modules. OpenClaw is the executable harness path today.
## Ablation Design
Each domain task should run under four configuration classes.
| Class | Description | Question Answered |
|---|---|---|
| `model_only` | Model with minimal shell/filesystem access | What can the raw model do with little scaffolding? |
| `model_plus_harness` | Model plus the general OpenClaw-style harness | What does the harness contribute by itself? |
| `core_plugins` | Harness plus browser, memory, filesystem, execution plugins | What do common plugins add across domains? |
| `domain_plugins` | Harness plus domain-specific state/API plugins | Does the plugin stack close the gap to specialized SaaS agents? |
Run policy:
- 3 runs per task per configuration class
- same model snapshots across all classes
- same OpenClaw/harness build across all classes
- same private task variants across all classes
- fixed time, token, tool, and approval budgets
## Primary Metrics
- hard success: deterministic completion only
- reliability: pass^k, pass rate, worst-of-n, variance score
- process quality: trace-derived behavior quality
- cost efficiency: tokens/pass, cost/pass, p50/p95 latency
- failure profile: 13 deterministic failure modes
- plugin lift: `domain_plugins - model_plus_harness`
- harness lift: `model_plus_harness - model_only`
- plugin utilization: loaded vs invoked, tool-family coverage
- manifest-reality gap: claimed plugin capabilities vs observed use
## Proof Criteria
A domain is considered covered when:
- `domain_plugins` reaches at least 0.85 hard success on private variants
- pass^k is at least 0.75 across 3 runs
- worst-of-n is at least 0.65
- no dominant failure mode accounts for more than 35 percent of failures
- plugin utilization shows the relevant domain plugin was invoked on tasks
where it was required
The broader thesis is credible when:
- at least 10 of 12 domains meet the domain coverage bar
- plugin lift is larger than model-to-model variance on the same task set
- holdout variants preserve the same conclusions
- SNR analysis shows the ranking is signal-dominant, not seed-noise-dominant
- cross-harness adapters reproduce scores within an agreed tolerance
## Workstream 1: Adapter Execution
Goal: make OpenClaw, Hermes, Codex, and Claude Code comparable through one
canonical task pipeline.
Near-term:
- keep `--adapter openclaw` as the executable path
- route OpenClaw through the adapter implementation instead of inline gateway
code
- add compatibility reporting for every task and adapter
- implement Codex and Claude Code transcript adapters
- promote Hermes from first-turn runner to a fully compatible runner where possible
Help wanted:
- harness owners: SDK or CLI entry points that expose full transcripts
- plugin owners: tool-call provenance and registration traces
- serving owners: stable model IDs, usage accounting, and reproducible configs
## Workstream 2: Plugin Provenance
Goal: attribute score changes to plugins instead of treating the agent as a
black box.
Near-term:
- capture plugin registration traces at gateway startup
- attach plugin owner IDs to every tool call
- store transcripts and plugin traces alongside result JSON
- include utilization and manifest-reality gaps in every `--profile` run
Help wanted:
- OpenClaw plugin registry hooks for runtime trace export
- partner plugins with typed manifests and clean provenance
- ClawHub metadata sync for manifest cache refresh
## Workstream 3: Domain Corpus
Goal: replace a small public task suite with a coverage matrix for real agent
SaaS domains.
Near-term:
- 12 domains in `tasks-domain/MANIFEST.yaml`
- 5 templates per domain
- 3 private variants per template
- domain-specific plugin requirement declarations
- deterministic verifier contracts before any semantic judge
Help wanted:
- partner traces that can be transformed into private variants
- domain experts to validate task realism and verifier quality
- infra for private variant generation and contamination audits
## Workstream 4: Serving and Cost Rigor
Goal: compare open and closed models under reproducible serving constraints.
Near-term:
- record model snapshot, provider, serving stack, quantization, GPU class,
context length, temperature, reasoning settings, and token accounting
- report cost/pass and latency/pass alongside capability
- run open-weight models through vLLM-backed profiles where available
Help wanted:
- vLLM serving recipes for consistent agent-eval runs
- Hugging Face model hosting and dataset plumbing
- NVIDIA profiling on representative GPU setups
## Workstream 5: Evidence Package
Goal: make the conclusion auditable by third parties.
Near-term:
- publish public Core v1 results as the reproducibility baseline
- publish domain coverage matrix without private task bodies
- publish aggregated per-domain scores, confidence intervals, and failure modes
- keep private variants for contamination-resistant official scoring
- publish scripts that regenerate every report from cached run JSON
Help wanted:
- compute credits for multi-model sweeps
- review from model serving, benchmark, and infrastructure teams
- public hosting for result artifacts and visual dashboards

View File

@ -0,0 +1,108 @@
# Meeting Brief: NVIDIA, Hugging Face, vLLM
Meeting date: April 24, 2026
## One-Liner
ClawBench is a rigorous agent benchmark for measuring whether a model plus a
general harness plus plugins can cover the task domains served by most agent
SaaS products.
## What I Built
- A deterministic, trace-based benchmark for agents, not just models.
- A small public Core v1 set for reproducibility and regression tracking.
- A larger domain-suite scaffold for CRM, support, docs/sheets/slides, email,
calendar, finance ops, analytics, security admin, ecommerce, devtools,
research, and personal ops.
- A scoring system that separates completion, process quality, behavior,
semantic quality, reliability, latency, tokens, cost, and failure modes.
- A dynamics-analysis stack that explains how agents fail: trapped, diffusive,
convergent, chaotic, limit-cycle, and survival curves.
- A plugin-profile diagnostic layer that fingerprints configurations, estimates
plugin contribution, detects dead-weight plugins, and recommends changes.
- An adapter boundary so OpenClaw can become one harness among several rather
than the only execution path.
## Goal
Prove, with reproducible data, that specialized agent SaaS can be decomposed
into:
1. a base model,
2. a general agent harness,
3. a plugin stack,
4. domain-specific state/API access,
5. deterministic evaluation contracts.
If the data supports it, the conclusion is that the open plugin ecosystem can
subsume a large share of agent SaaS workflows.
## What The 19 Public Tasks Are
The 19 public tasks are not the whole proof. They are the public Core v1 set:
- reproducibility baseline
- CI/regression suite
- adapter bring-up set
- public explanation of methodology
The proof corpus is the domain suite. That needs more tasks, private variants,
and ablations.
## What Still Needs Help
- Cross-harness execution: OpenClaw is executable today; Hermes/Codex/Claude
Code need end-to-end adapter wiring.
- Plugin provenance: tool calls need stable plugin owner IDs and registration
traces.
- Domain corpus: each domain needs realistic private variants and hardened
deterministic verifiers.
- Serving reproducibility: open-weight models need pinned serving recipes,
GPU profiles, usage accounting, and latency/cost measurement.
- Scale: the domain ablations need a lot more runs than the public Core set.
## What I Want From NVIDIA
- GPU-backed evaluation capacity for repeated domain sweeps.
- Profiling help: latency/pass, tokens/sec, cost/pass, memory pressure, and
concurrency behavior for long agent trajectories.
- Reference serving profiles for open-weight models on NVIDIA hardware.
- Advice on making the benchmark useful for enterprise agent deployment, not
just academic ranking.
## What I Want From Hugging Face
- Dataset hosting for public results, cached run JSON, and public task metadata.
- Private/controlled dataset workflow for holdout variants and partner traces.
- Model hosting paths for open-weight baseline runs.
- Help making ClawBench results easy to browse, reproduce, and cite.
## What I Want From vLLM
- A stable serving recipe for agent-eval workloads with long context and many
tool turns.
- Usage accounting: prompt, output, reasoning/cache tokens where available.
- Throughput and latency guidance for many parallel agent runs.
- Integration advice for making model snapshots and serving configs auditable.
## Proposed Collaboration
1. Run Core v1 as a public sanity check across agreed open and closed models.
2. Build a 12-domain private proof suite from `tasks-domain/`.
3. Run four ablation classes: model only, model plus harness, core plugins,
domain plugins.
4. Publish aggregated domain coverage, reliability, failure modes, and cost.
5. Iterate on gaps where specialized SaaS still beats the open stack.
## The Ask
Help make the proof hard to dismiss:
- enough compute to run repetitions,
- clean serving recipes,
- model and dataset hosting,
- infrastructure review,
- partner traces or realistic domain workflows,
- public artifacts that other teams can reproduce.

View File

@ -0,0 +1,181 @@
import { readFileSync, writeFileSync } from "node:fs";
// Compiled gateway bundle that this script rewrites in place.
// NOTE(review): the file name carries what looks like a build hash, so the
// path presumably has to be updated whenever the base image changes -- confirm.
const dist = "/app/dist/server-methods-b3jaTRE_.js";
/**
 * Replace the first occurrence of `oldValue` in `text` with `newValue`.
 *
 * Both values are treated as literal text. The replacement is supplied through
 * a replacer function because a plain replacement string would have `$&`,
 * `$$`, `` $` `` and `$'` expanded by String.prototype.replace, silently
 * corrupting patch payloads that contain `$`.
 *
 * @param {string} text - Source text to patch.
 * @param {string} oldValue - Exact substring that must be present.
 * @param {string} newValue - Text inserted verbatim in its place.
 * @returns {string} The patched text.
 * @throws {Error} If `oldValue` does not occur in `text`.
 */
function replaceOnce(text, oldValue, newValue) {
  if (!text.includes(oldValue)) {
    throw new Error(`patch target not found: ${oldValue.slice(0, 80)}`);
  }
  return text.replace(oldValue, () => newValue);
}
// Load the bundle once; every patch below edits this string in memory.
let source = readFileSync(dist, "utf8");
// Patch 1: define runAgentConfigMutation() just before `agentsHandlers`.
// Each call awaits the previously queued promise, runs `fn`, and releases the
// queue in `finally`, so queued callbacks run one at a time even if one throws.
source = replaceOnce(
source,
"const agentsHandlers = {\n",
`let agentConfigMutationQueue = Promise.resolve();
async function runAgentConfigMutation(fn) {
\tconst previous = agentConfigMutationQueue;
\tlet release;
\tagentConfigMutationQueue = new Promise((resolve) => {
\t\trelease = resolve;
\t});
\tawait previous.catch(() => {});
\ttry {
\t\treturn await fn();
\t} finally {
\t\trelease();
\t}
}
const agentsHandlers = {
`,
);
// Patch 2: drop the config read that happens before the handler's name
// parsing; the patch that wraps the handler body re-reads the config inside
// the queued mutation instead.
// NOTE(review): presumably this is the agents.create handler -- confirm
// against the bundle.
source = replaceOnce(
source,
`\t\tconst cfg = context.getRuntimeConfig();
\t\tconst rawName = params.name.trim();`,
`\t\tconst rawName = params.name.trim();`,
);
// Patch 3: drop the "already exists" check that ran against that early config
// read; the same check is re-inserted inside the queued mutation.
source = replaceOnce(
source,
`\t\tif (findAgentEntryIndex(listAgentEntries(cfg), agentId) >= 0) {
\t\t\trespond(false, void 0, errorShape(ErrorCodes.INVALID_REQUEST, \`agent "\${agentId}" already exists\`));
\t\t\treturn;
\t\t}
\t\tconst workspaceDir = resolveUserPath(params.workspace.trim());`,
`\t\tconst workspaceDir = resolveUserPath(params.workspace.trim());`,
);
// Patch 4: wrap the read-modify-write of the agent config in
// runAgentConfigMutation(). Inside the callback the config is read fresh, the
// duplicate-agent check runs, and the config file is replaced; the callback
// returns null on every early-exit path (after `respond` has already been
// called) and true on success, so the success response is sent only once.
source = replaceOnce(
source,
`\t\tlet nextConfig = applyAgentConfig(cfg, {
\t\t\tagentId,
\t\t\tname: safeName,
\t\t\tworkspace: workspaceDir,
\t\t\tmodel,
\t\t\tidentity: {
\t\t\t\tname: safeName,
\t\t\t\t...emoji ? { emoji: sanitizeIdentityLine(emoji) } : {},
\t\t\t\t...avatar ? { avatar: sanitizeIdentityLine(avatar) } : {}
\t\t\t}
\t\t});
\t\tconst agentDir = resolveAgentDir(nextConfig, agentId);
\t\tnextConfig = applyAgentConfig(nextConfig, {
\t\t\tagentId,
\t\t\tagentDir
\t\t});
\t\tawait ensureAgentWorkspace({
\t\t\tdir: workspaceDir,
\t\t\tensureBootstrapFiles: !Boolean(nextConfig.agents?.defaults?.skipBootstrap)
\t\t});
\t\tawait fs$1.mkdir(resolveSessionTranscriptsDirForAgent(agentId), { recursive: true });
\t\tconst persistedIdentity = normalizeIdentityForFile(resolveAgentIdentity(nextConfig, agentId));
\t\tif (persistedIdentity) {
\t\t\tconst identityContent = await buildIdentityMarkdownOrRespondUnsafe({
\t\t\t\trespond,
\t\t\t\tworkspaceDir,
\t\t\t\tidentity: persistedIdentity
\t\t\t});
\t\t\tif (identityContent === null) return;
\t\t\tif (!await writeWorkspaceFileOrRespond({
\t\t\t\trespond,
\t\t\t\tworkspaceDir,
\t\t\t\tname: "IDENTITY.md",
\t\t\t\tcontent: identityContent
\t\t\t})) return;
\t\t}
\t\tawait replaceConfigFile({
\t\t\tnextConfig,
\t\t\tafterWrite: { mode: "auto" }
\t\t});
\t\trespond(true, {
\t\t\tok: true,
\t\t\tagentId,
\t\t\tname: safeName,
\t\t\tworkspace: workspaceDir,
\t\t\tmodel
\t\t}, void 0);`,
`\t\tconst result = await runAgentConfigMutation(async () => {
\t\t\tconst cfg = context.getRuntimeConfig();
\t\t\tif (findAgentEntryIndex(listAgentEntries(cfg), agentId) >= 0) {
\t\t\t\trespond(false, void 0, errorShape(ErrorCodes.INVALID_REQUEST, \`agent "\${agentId}" already exists\`));
\t\t\t\treturn null;
\t\t\t}
\t\t\tlet nextConfig = applyAgentConfig(cfg, {
\t\t\t\tagentId,
\t\t\t\tname: safeName,
\t\t\t\tworkspace: workspaceDir,
\t\t\t\tmodel,
\t\t\t\tidentity: {
\t\t\t\t\tname: safeName,
\t\t\t\t\t...emoji ? { emoji: sanitizeIdentityLine(emoji) } : {},
\t\t\t\t\t...avatar ? { avatar: sanitizeIdentityLine(avatar) } : {}
\t\t\t\t}
\t\t\t});
\t\t\tconst agentDir = resolveAgentDir(nextConfig, agentId);
\t\t\tnextConfig = applyAgentConfig(nextConfig, {
\t\t\t\tagentId,
\t\t\t\tagentDir
\t\t\t});
\t\t\tawait ensureAgentWorkspace({
\t\t\t\tdir: workspaceDir,
\t\t\t\tensureBootstrapFiles: !Boolean(nextConfig.agents?.defaults?.skipBootstrap)
\t\t\t});
\t\t\tawait fs$1.mkdir(resolveSessionTranscriptsDirForAgent(agentId), { recursive: true });
\t\t\tconst persistedIdentity = normalizeIdentityForFile(resolveAgentIdentity(nextConfig, agentId));
\t\t\tif (persistedIdentity) {
\t\t\t\tconst identityContent = await buildIdentityMarkdownOrRespondUnsafe({
\t\t\t\t\trespond,
\t\t\t\t\tworkspaceDir,
\t\t\t\t\tidentity: persistedIdentity
\t\t\t\t});
\t\t\t\tif (identityContent === null) return null;
\t\t\t\tif (!await writeWorkspaceFileOrRespond({
\t\t\t\t\trespond,
\t\t\t\t\tworkspaceDir,
\t\t\t\t\tname: "IDENTITY.md",
\t\t\t\t\tcontent: identityContent
\t\t\t\t})) return null;
\t\t\t}
\t\t\tawait replaceConfigFile({
\t\t\t\tnextConfig,
\t\t\t\tafterWrite: { mode: "auto" }
\t\t\t});
\t\t\treturn true;
\t\t});
\t\tif (!result) return;
\t\trespond(true, {
\t\t\tok: true,
\t\t\tagentId,
\t\t\tname: safeName,
\t\t\tworkspace: workspaceDir,
\t\t\tmodel
\t\t}, void 0);`,
);
// Patch 5: switch three replaceConfigFile() call sites from
// afterWrite mode "auto" to mode "none" with a clawbench reason tag. Each
// marker is matched with replaceOnce, so the script throws if any of the
// three call sites is missing from the bundle.
// NOTE(review): presumably mode "none" suppresses the post-write
// reload/restart -- confirm against OpenClaw's replaceConfigFile.
for (const marker of [
`\t\t\tawait replaceConfigFile({
\t\t\t\tnextConfig,
\t\t\t\tafterWrite: { mode: "auto" }
\t\t\t});`,
`\t\tawait replaceConfigFile({
\t\t\tnextConfig,
\t\t\tafterWrite: { mode: "auto" }
\t\t});`,
`\t\tawait replaceConfigFile({
\t\t\tnextConfig: result.config,
\t\t\tafterWrite: { mode: "auto" }
\t\t});`,
]) {
source = replaceOnce(
source,
marker,
marker.replace(`{ mode: "auto" }`, `{ mode: "none", reason: "clawbench-agent-lifecycle" }`),
);
}
// Write the result only after every patch matched; a failed match throws
// above and leaves the bundle on disk untouched.
writeFileSync(dist, source);
console.log(`patched ${dist}`);

212
patches/patch_opus47.py Normal file
View File

@ -0,0 +1,212 @@
#!/usr/bin/env python3
"""Patch pi-ai and openclaw bundles to recognize claude-opus-4-7 (and sonnet-4-7).
Runs inside the Docker image as a RUN step. Idempotent: re-running is a no-op.
"""
import re
import sys
import os
PI_AI_CATALOG = "/app/node_modules/@mariozechner/pi-ai/dist/models.generated.js"
ANTHROPIC_REGISTER_GLOB = "/app/dist/register.runtime-*.js"
def patch_pi_ai_catalog(path: str) -> bool:
    """Add claude-opus-4-7 and claude-sonnet-4-7 entries to the pi-ai catalog.

    The new entries are spliced in directly after the existing
    ``"claude-opus-4-6"`` entry of the generated JavaScript catalog, which is
    rewritten in place.

    Args:
        path: Path of the catalog file to patch.

    Returns:
        True if the file was modified, False if it already contains a
        ``"claude-opus-4-7"`` key.

    Exits the process with status 1 when the ``claude-opus-4-6`` anchor is
    missing or its braces never balance.
    """
    with open(path, encoding="utf-8") as fh:
        src = fh.read()

    if '"claude-opus-4-7"' in src:
        print(f"[patch] {path}: claude-opus-4-7 already present, skipping")
        return False

    # Locate the anchor by substring scanning rather than regex: each entry
    # contains a nested `cost: { ... }` object, which defeats naive `[^{}]`
    # patterns.
    start_marker = '"claude-opus-4-6": {'
    start_idx = src.find(start_marker)
    if start_idx == -1:
        print(f"[patch] ERROR: could not locate claude-opus-4-6 anchor in {path}", file=sys.stderr)
        sys.exit(1)

    # Walk forward from the entry's opening `{`, tracking nesting, until the
    # matching `}` is found.
    depth = 0
    anchor_end = -1
    for idx in range(start_idx, len(src)):
        ch = src[idx]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                anchor_end = idx + 1  # just past the closing '}'
                break
    if anchor_end == -1:
        print(f"[patch] ERROR: unbalanced braces walking claude-opus-4-6 entry in {path}", file=sys.stderr)
        sys.exit(1)

    # Insert after the entry's trailing comma when there is one. If the entry
    # is the last one in its object (no comma), add the separator ourselves;
    # otherwise the new keys would follow `}` directly and break the file.
    probe = anchor_end
    while probe < len(src) and src[probe].isspace():
        probe += 1
    if probe < len(src) and src[probe] == ",":
        anchor_end = probe + 1
        separator = ""
    else:
        separator = ","

    insertion = (
        "\n"
        ' "claude-opus-4-7": {\n'
        ' id: "claude-opus-4-7",\n'
        ' name: "Claude Opus 4.7",\n'
        ' api: "anthropic-messages",\n'
        ' provider: "anthropic",\n'
        ' baseUrl: "https://api.anthropic.com",\n'
        " reasoning: true,\n"
        ' input: ["text", "image"],\n'
        " cost: {\n"
        " input: 5,\n"
        " output: 25,\n"
        " cacheRead: 0.5,\n"
        " cacheWrite: 6.25,\n"
        " },\n"
        " contextWindow: 1000000,\n"
        " maxTokens: 128000,\n"
        " },\n"
        ' "claude-sonnet-4-7": {\n'
        ' id: "claude-sonnet-4-7",\n'
        ' name: "Claude Sonnet 4.7",\n'
        ' api: "anthropic-messages",\n'
        ' provider: "anthropic",\n'
        ' baseUrl: "https://api.anthropic.com",\n'
        " reasoning: true,\n"
        ' input: ["text", "image"],\n'
        " cost: {\n"
        " input: 3,\n"
        " output: 15,\n"
        " cacheRead: 0.3,\n"
        " cacheWrite: 3.75,\n"
        " },\n"
        " contextWindow: 1000000,\n"
        " maxTokens: 128000,\n"
        " },"
    )
    patched = src[:anchor_end] + separator + insertion + src[anchor_end:]
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(patched)
    print(f"[patch] {path}: inserted claude-opus-4-7 and claude-sonnet-4-7")
    return True
def patch_openclaw_anthropic_register(path: str) -> bool:
    """Teach the OpenClaw anthropic register bundle about the 4-7 models.

    Four edits are applied to the bundle text: new model-id constants, two
    extra entries at the front of ``ANTHROPIC_MODERN_MODEL_PREFIXES``, 4-7
    branches ahead of the 4-6 forward-compat resolver, and (when the function
    is present in its expected form) 4-7 coverage in the adaptive-thinking
    default. The file is written only after all required anchors matched.

    Args:
        path: Path of a ``register.runtime-*.js`` bundle; rewritten in place.

    Returns:
        True if the file was modified; False if it is already patched or is
        not the anthropic bundle.

    Exits the process with status 1 when a required anchor is missing.
    """
    with open(path, encoding="utf-8") as fh:
        src = fh.read()

    if "ANTHROPIC_OPUS_47_MODEL_ID" in src:
        print(f"[patch] {path}: 4-7 support already present, skipping")
        return False

    # Skip files that are not the anthropic register.runtime (other plugins
    # share the same `register.runtime-*.js` naming convention).
    if 'PROVIDER_ID = "anthropic"' not in src or "ANTHROPIC_MODERN_MODEL_PREFIXES" not in src:
        print(f"[patch] {path}: not the anthropic register.runtime bundle, skipping")
        return False

    # 1. Inject new constants after the sonnet template constant.
    sonnet_tpl_anchor = 'const ANTHROPIC_SONNET_TEMPLATE_MODEL_IDS = ["claude-sonnet-4-5", "claude-sonnet-4.5"];'
    if sonnet_tpl_anchor not in src:
        print(f"[patch] ERROR: sonnet template anchor not found in {path}", file=sys.stderr)
        sys.exit(1)
    new_consts = (
        sonnet_tpl_anchor + "\n"
        'const ANTHROPIC_OPUS_47_MODEL_ID = "claude-opus-4-7";\n'
        'const ANTHROPIC_OPUS_47_DOT_MODEL_ID = "claude-opus-4.7";\n'
        'const ANTHROPIC_SONNET_47_MODEL_ID = "claude-sonnet-4-7";\n'
        'const ANTHROPIC_SONNET_47_DOT_MODEL_ID = "claude-sonnet-4.7";'
    )
    src = src.replace(sonnet_tpl_anchor, new_consts)

    # 2. Extend ANTHROPIC_MODERN_MODEL_PREFIXES.
    prefixes_anchor = 'const ANTHROPIC_MODERN_MODEL_PREFIXES = [\n\t"claude-opus-4-6",\n\t"claude-sonnet-4-6",'
    prefixes_new = 'const ANTHROPIC_MODERN_MODEL_PREFIXES = [\n\t"claude-opus-4-7",\n\t"claude-sonnet-4-7",\n\t"claude-opus-4-6",\n\t"claude-sonnet-4-6",'
    if prefixes_anchor not in src:
        print(f"[patch] ERROR: modern prefixes anchor not found in {path}", file=sys.stderr)
        sys.exit(1)
    src = src.replace(prefixes_anchor, prefixes_new)

    # 3. Add 4-7 forward-compat branches ahead of the 4-6 opus/sonnet branches.
    resolve_anchor = (
        "function resolveAnthropicForwardCompatModel(ctx) {\n"
        "\treturn resolveAnthropic46ForwardCompatModel({\n"
        "\t\tctx,\n"
        "\t\tdashModelId: ANTHROPIC_OPUS_46_MODEL_ID,"
    )
    resolve_new = (
        "function resolveAnthropicForwardCompatModel(ctx) {\n"
        "\treturn resolveAnthropic46ForwardCompatModel({\n"
        "\t\tctx,\n"
        '\t\tdashModelId: ANTHROPIC_OPUS_47_MODEL_ID,\n'
        '\t\tdotModelId: ANTHROPIC_OPUS_47_DOT_MODEL_ID,\n'
        '\t\tdashTemplateId: "claude-opus-4-6",\n'
        '\t\tdotTemplateId: "claude-opus-4.6",\n'
        "\t\tfallbackTemplateIds: ANTHROPIC_OPUS_TEMPLATE_MODEL_IDS\n"
        "\t}) ?? resolveAnthropic46ForwardCompatModel({\n"
        "\t\tctx,\n"
        '\t\tdashModelId: ANTHROPIC_SONNET_47_MODEL_ID,\n'
        '\t\tdotModelId: ANTHROPIC_SONNET_47_DOT_MODEL_ID,\n'
        '\t\tdashTemplateId: "claude-sonnet-4-6",\n'
        '\t\tdotTemplateId: "claude-sonnet-4.6",\n'
        "\t\tfallbackTemplateIds: ANTHROPIC_SONNET_TEMPLATE_MODEL_IDS\n"
        "\t}) ?? resolveAnthropic46ForwardCompatModel({\n"
        "\t\tctx,\n"
        "\t\tdashModelId: ANTHROPIC_OPUS_46_MODEL_ID,"
    )
    if resolve_anchor not in src:
        print(f"[patch] ERROR: forward-compat resolver anchor not found in {path}", file=sys.stderr)
        sys.exit(1)
    src = src.replace(resolve_anchor, resolve_new)

    # 4. Make adaptive-thinking default cover 4-7 too. This anchor is optional,
    # but a miss is reported so a partially applied patch is visible in logs.
    adaptive_anchor = (
        "function shouldUseAnthropicAdaptiveThinkingDefault(modelId) {\n"
        "\tconst lowerModelId = normalizeLowercaseStringOrEmpty(modelId);\n"
        "\treturn lowerModelId.startsWith(ANTHROPIC_OPUS_46_MODEL_ID) || lowerModelId.startsWith(ANTHROPIC_OPUS_46_DOT_MODEL_ID) || lowerModelId.startsWith(ANTHROPIC_SONNET_46_MODEL_ID) || lowerModelId.startsWith(ANTHROPIC_SONNET_46_DOT_MODEL_ID);\n"
        "}"
    )
    adaptive_new = (
        "function shouldUseAnthropicAdaptiveThinkingDefault(modelId) {\n"
        "\tconst lowerModelId = normalizeLowercaseStringOrEmpty(modelId);\n"
        "\treturn lowerModelId.startsWith(ANTHROPIC_OPUS_47_MODEL_ID) || lowerModelId.startsWith(ANTHROPIC_OPUS_47_DOT_MODEL_ID) || lowerModelId.startsWith(ANTHROPIC_SONNET_47_MODEL_ID) || lowerModelId.startsWith(ANTHROPIC_SONNET_47_DOT_MODEL_ID) || lowerModelId.startsWith(ANTHROPIC_OPUS_46_MODEL_ID) || lowerModelId.startsWith(ANTHROPIC_OPUS_46_DOT_MODEL_ID) || lowerModelId.startsWith(ANTHROPIC_SONNET_46_MODEL_ID) || lowerModelId.startsWith(ANTHROPIC_SONNET_46_DOT_MODEL_ID);\n"
        "}"
    )
    if adaptive_anchor in src:
        src = src.replace(adaptive_anchor, adaptive_new)
    else:
        print(
            f"[patch] WARNING: adaptive-thinking anchor not found in {path}; "
            "thinking defaults left unchanged",
            file=sys.stderr,
        )

    with open(path, "w", encoding="utf-8") as fh:
        fh.write(src)
    print(f"[patch] {path}: added claude-opus-4-7 / claude-sonnet-4-7 forward-compat support")
    return True
def main() -> None:
    """Patch the pi-ai catalog and every matching register.runtime bundle.

    Missing targets are reported as warnings on stderr and do not change the
    exit status. The closing message distinguishes three outcomes: something
    was patched, targets existed but needed no change, or no target file was
    found at all.
    """
    import glob

    any_changed = False
    targets_found = False

    if os.path.exists(PI_AI_CATALOG):
        targets_found = True
        any_changed |= patch_pi_ai_catalog(PI_AI_CATALOG)
    else:
        print(f"[patch] WARNING: {PI_AI_CATALOG} not found", file=sys.stderr)

    candidates = sorted(glob.glob(ANTHROPIC_REGISTER_GLOB))
    if not candidates:
        print(f"[patch] WARNING: no files match {ANTHROPIC_REGISTER_GLOB}", file=sys.stderr)
    for cand in candidates:
        targets_found = True
        any_changed |= patch_openclaw_anthropic_register(cand)

    if any_changed:
        print("[patch] success")
    elif targets_found:
        print("[patch] no changes applied (already patched)")
    else:
        # Nothing was found to patch; do not report this as "already patched".
        print("[patch] WARNING: no patch targets found; nothing was patched", file=sys.stderr)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,26 @@
profile:
name: frontier-deepseek-v4
base_model: deepseek/v4-pro
notes: |
Frontier agentic coding model comparison: DeepSeek V4.
DeepSeek direct API. Plugin stack IDENTICAL across all 10 profiles so the
base model is the only structural variable. Any score delta is attributable
to the model, not the scaffold.
plugins:
enabled:
- anthropic
- id: memory-lancedb
config:
dimensions: 1536
- browser-playwright
slots:
memory: memory-lancedb
contextEngine: builtin
tools_allow:
- bash
- file_read
- file_edit
- browser_navigate
- browser_click
- memory_read
- memory_write

View File

@ -0,0 +1,26 @@
profile:
name: frontier-gpt-5-2
base_model: openai/gpt-5.2
notes: |
Frontier agentic coding model comparison: GPT-5.2 (closed).
OpenAI mid-tier flagship. Plugin stack IDENTICAL across all profiles so the base
model is the only structural variable. Any score delta is attributable
to the model, not the scaffold.
plugins:
enabled:
- anthropic
- id: memory-lancedb
config:
dimensions: 1536
- browser-playwright
slots:
memory: memory-lancedb
contextEngine: builtin
tools_allow:
- bash
- file_read
- file_edit
- browser_navigate
- browser_click
- memory_read
- memory_write

View File

@ -0,0 +1,26 @@
profile:
name: frontier-gpt-5-5
base_model: openai/gpt-5.5
notes: |
Frontier agentic coding model comparison: GPT-5.5 (closed).
OpenAI flagship. Plugin stack IDENTICAL across all frontier profiles so
the base model is the only structural variable. Any score delta is
attributable to the model, not the scaffold.
plugins:
enabled:
- anthropic
- id: memory-lancedb
config:
dimensions: 1536
- browser-playwright
slots:
memory: memory-lancedb
contextEngine: builtin
tools_allow:
- bash
- file_read
- file_edit
- browser_navigate
- browser_click
- memory_read
- memory_write

View File

@ -0,0 +1,26 @@
profile:
name: frontier-kimi-k26
base_model: openrouter/moonshotai/kimi-k2.6
notes: |
Frontier agentic coding model comparison: Kimi K2.6 (open).
Moonshot AI newer revision. Plugin stack IDENTICAL across profiles so
the base model is the only structural variable. Any score delta is
attributable to the model, not the scaffold.
plugins:
enabled:
- anthropic
- id: memory-lancedb
config:
dimensions: 1536
- browser-playwright
slots:
memory: memory-lancedb
contextEngine: builtin
tools_allow:
- bash
- file_read
- file_edit
- browser_navigate
- browser_click
- memory_read
- memory_write

View File

@ -0,0 +1,26 @@
profile:
name: frontier-opus-4-7
base_model: anthropic/claude-opus-4-7
notes: |
Frontier agentic coding model comparison: Claude Opus 4.7 (closed).
Anthropic flagship, newer revision. Plugin stack IDENTICAL to opus-4-6
and the other frontier profiles so the base model is the only structural
variable. Any score delta is attributable to the model, not the scaffold.
plugins:
enabled:
- anthropic
- id: memory-lancedb
config:
dimensions: 1536
- browser-playwright
slots:
memory: memory-lancedb
contextEngine: builtin
tools_allow:
- bash
- file_read
- file_edit
- browser_navigate
- browser_click
- memory_read
- memory_write

View File

@ -0,0 +1,26 @@
profile:
name: frontier-sonnet-4-6
base_model: anthropic/claude-sonnet-4-6
notes: |
Frontier agentic coding model comparison: Claude Sonnet 4.6 (closed).
Anthropic mid-tier flagship. Plugin stack IDENTICAL across all profiles so the base
model is the only structural variable. Any score delta is attributable
to the model, not the scaffold.
plugins:
enabled:
- anthropic
- id: memory-lancedb
config:
dimensions: 1536
- browser-playwright
slots:
memory: memory-lancedb
contextEngine: builtin
tools_allow:
- bash
- file_read
- file_edit
- browser_navigate
- browser_click
- memory_read
- memory_write

View File

@ -13,7 +13,7 @@ dependencies = [
"gradio>=5.0,<6",
"httpx>=0.27,<1",
"numpy>=1.26,<3",
"rich>=13.0,<14",
"rich>=13.0,<15",
"click>=8.1,<9",
# Runtime deps for the task completion verifier. The harness shells out
# to `pytest -q` / `pytest-asyncio` inside per-task workspaces as the
@ -30,6 +30,9 @@ dev = [
"pytest>=8.0,<9",
"pytest-asyncio>=0.24,<1",
]
hermes = [
"hermes-agent @ git+https://github.com/NousResearch/hermes-agent.git@main",
]
[project.scripts]
clawbench = "clawbench.cli:main"
@ -45,3 +48,6 @@ force-include = { "tasks-public" = "tasks-public", "profiles" = "profiles", "bas
[tool.pytest.ini_options]
asyncio_mode = "auto"
testpaths = ["tests"]
[tool.hatch.metadata]
allow-direct-references = true

View File

@ -1,13 +1,13 @@
#!/bin/bash
# Shared helper sourced by container_sweep_*.sh scripts to snapshot the
# per-model run_cache after a sweep completes. Called at END of each sweep.
# Shared helper sourced by container runner scripts to snapshot the per-model
# run_cache after a sweep completes. Called at END of each sweep.
#
# Requires these env vars (already set by parent script):
# CLAWBENCH_RUN_CACHE_DIR - e.g. /data/run_cache
# CACHE_SUB - e.g. openai_gpt-5.4
# SWEEP_OUT_TAG - e.g. v2026-4-18-pr68627-gpt54
# SWEEP_OUT_TAG - e.g. core-v1-public
# SWEEP_LABEL - e.g. gpt54
# SWEEP_LOGDIR - e.g. /data/drift_2026-04-18-pr68627-gpt54
# SWEEP_LOGDIR - e.g. /data/core-v1-public
#
# Writes snapshot to: /data/run_cache_archive/<SWEEP_OUT_TAG>/<CACHE_SUB>/
# Also writes a metadata.json with sweep label/model/timestamp for indexing.

View File

@ -1,255 +0,0 @@
"""Per-run 1-to-1 audit across every (model, task, run_idx) triple.
Flags issues beyond aggregate coverage:
- Tasks where ALL models score 0 (task broken / verifier rejects everyone)
- Tasks where models produce output but all get C=0 (verifier bug)
- Tasks with suspiciously high cross-model infra-failure rates (harness bug)
- Specific runs with harness errors (timeout, handshake)
- Models with task-specific pathology (e.g., always fails on t3-X)
- Judge failures per-task that haven't been rejudged
- Missing runs in archive (logged but not cached)
Usage: python3 scripts/audit_per_run.py
"""
from __future__ import annotations
import json
import re
from collections import defaultdict
from pathlib import Path
ROOT = Path(__file__).resolve().parent.parent
DRIFT = ROOT / "data" / "drift_2026-04-19-full"
ARCH = ROOT / "data" / "run_cache_archive" / "v2026-4-19-full"
MODEL_MAP = {
"opus46": ("anthropic_claude-opus-4-6", "opus-4-6"),
"opus47": ("anthropic_claude-opus-4-7", "opus-4-7"),
"sonnet46": ("anthropic_claude-sonnet-4-6", "sonnet-4-6"),
"gpt54": ("openai_gpt-5.4", "gpt-5.4"),
"gemini": ("google_gemini-3.1-pro-preview", "gemini-3.1-pro"),
"glm": ("openrouter_z-ai_glm-5.1", "glm-5.1"),
"minimax": ("openrouter_minimax_minimax-m2.7", "minimax-m2.7"),
"kimi": ("openrouter_moonshotai_kimi-k2.5", "kimi-k2.5"),
"qwen": ("openrouter_qwen_qwen3.6-plus", "qwen-3.6-plus"),
}
LOG_LINE = re.compile(
r"^\[(\d+)/120\]\s+(\S+)\s+\([^)]+\)\s+run\s+(\d+):\s+([+\-~])\s+([\d.]+)"
)
HARNESS_ERR = re.compile(r"ERROR clawbench\.harness: Run (\S+)/(\d+) failed")
JUDGE_INFRA_PHRASES = [
"gateway is restarting", "judge execution failed", "judge failed to run",
"judge call failed", "judge timed out",
]
def parse_log(log_path: Path):
    """Extract per-run scores and harness failures from one sweep log.

    Returns a ``(runs, errors)`` pair. ``runs`` maps ``(task, run_idx)`` to a
    dict with ``score`` and ``outcome``; the run number printed in the log is
    shifted down by one to form ``run_idx``. ``errors`` maps the
    ``(task, run)`` pair captured by ``HARNESS_ERR`` (run number taken as
    written in the log) to the string ``"harness_error"``. A missing log file
    yields two empty dicts.
    """
    runs, errors = {}, {}
    if not log_path.exists():
        return runs, errors

    for raw_line in log_path.read_text(errors="ignore").splitlines():
        scored = LOG_LINE.match(raw_line.strip())
        if scored is not None:
            _seq, task, run_no, outcome, score = scored.groups()
            runs[(task, int(run_no) - 1)] = {"score": float(score), "outcome": outcome}

        # A harness failure can appear anywhere in the line, hence search().
        failure = HARNESS_ERR.search(raw_line)
        if failure is not None:
            errors[(failure.group(1), int(failure.group(2)))] = "harness_error"

    return runs, errors
def scan_archive(cache_dir: Path):
    """Index archived run JSONs under *cache_dir* by ``(task_id, run_idx)``.

    Every sub-directory of *cache_dir* is treated as one task; each
    ``run<N>.json`` file inside it becomes one entry keyed by the directory
    name and ``N``. Files that cannot be read, are not valid JSON, or do not
    hold a JSON object are skipped. Sections that are missing or ``null`` in
    the JSON (``judge_result``, ``completion_result``, ``transcript``, ...)
    are treated as empty so one sparse record cannot abort the whole scan.

    Returns a dict whose values carry ``run_score``, the component scores
    ``c``/``t``/``b``, the judge score ``j`` (``None`` when the judge is not
    enabled), ``judge_infra_failed``, ``rejudged``, ``delivery``,
    ``failure_mode``, ``error``, ``n_messages`` and ``has_assistant_text``.
    A missing *cache_dir* yields an empty dict.
    """
    out = {}
    if not cache_dir.exists():
        return out

    def as_dict(value):
        # Archived sections may be absent or JSON null; read both as empty.
        return value if isinstance(value, dict) else {}

    for tdir in cache_dir.iterdir():
        if not tdir.is_dir():
            continue
        for rf in tdir.glob("run*.json"):
            name_match = re.match(r"run(\d+)\.json", rf.name)
            if not name_match:
                continue
            try:
                # Context manager so the handle is closed even on parse errors.
                with open(rf, encoding="utf-8") as fh:
                    d = json.load(fh)
            except (OSError, ValueError):
                # Unreadable or malformed archive entries are skipped.
                continue
            if not isinstance(d, dict):
                continue

            jr = as_dict(d.get("judge_result"))
            reason = (jr.get("reason") or "").lower()
            judge_enabled = jr.get("enabled")
            # Don't flag rejudged runs as infra-failed even if reason is empty:
            # a rejudged run has a real judge call behind it (rejudged_at field).
            judge_infra = (
                judge_enabled
                and "rejudged_at" not in jr
                and (
                    any(p in reason for p in JUDGE_INFRA_PHRASES)
                    or jr.get("error")
                    or (not reason.strip() and jr.get("score", 0) == 0)
                )
            )
            messages = as_dict(d.get("transcript")).get("messages") or []
            out[(tdir.name, int(name_match.group(1)))] = {
                "run_score": d.get("run_score", 0),
                "c": as_dict(d.get("completion_result")).get("score", 0),
                "t": as_dict(d.get("trajectory_result")).get("score", 0),
                "b": as_dict(d.get("behavior_result")).get("score", 0),
                "j": jr.get("score", 0) if judge_enabled else None,
                "judge_infra_failed": bool(judge_infra),
                "rejudged": "rejudged_at" in jr,
                "delivery": d.get("delivery_outcome"),
                "failure_mode": d.get("failure_mode"),
                "error": d.get("error"),
                "n_messages": len(messages),
                "has_assistant_text": any(
                    isinstance(msg, dict)
                    and msg.get("role") == "assistant"
                    and bool(msg.get("text"))
                    for msg in messages
                ),
            }
    return out
def main():
    """Print the per-run audit report for every model in ``MODEL_MAP``.

    For each model the sweep log and the run archive are loaded, then runs
    0-2 of every task seen in either source are examined. The report lists
    tasks that fail for all models, tasks whose completion score is near zero
    despite assistant output, per-task clusters of harness errors and judge
    infrastructure failures, tasks on which a single model scored zero in all
    three runs, and models with fewer than 120 logged or archived runs.
    """
    # Load both views (log + archive) for each model.
    per_model = {}
    for label, (sub, pretty) in MODEL_MAP.items():
        logged, errors = parse_log(DRIFT / f"docker_{label}_v2026-4-19-full.log")
        per_model[pretty] = {
            "logged": logged,
            "errors": errors,
            "archived": scan_archive(ARCH / sub),
        }

    # Every task id seen in any archive or log.
    all_tasks = set()
    for data in per_model.values():
        all_tasks.update(key[0] for key in data["archived"])
        all_tasks.update(key[0] for key in data["logged"])
    ordered_tasks = sorted(all_tasks)

    # Cross-model statistics and issue classification, one task at a time.
    issues = defaultdict(list)
    for task in ordered_tasks:
        scores = []
        completions = []
        outputs = []  # did the model produce assistant text?
        judge_infra_count = 0
        harness_err_count = 0
        for data in per_model.values():
            for run_idx in range(3):
                key = (task, run_idx)
                archived_run = data["archived"].get(key)
                logged_run = data["logged"].get(key)
                # The archive record wins; the log score is only a fallback.
                if archived_run:
                    scores.append(archived_run["run_score"])
                    completions.append(archived_run["c"])
                    outputs.append(archived_run["has_assistant_text"])
                    if archived_run["judge_infra_failed"]:
                        judge_infra_count += 1
                elif logged_run:
                    scores.append(logged_run["score"])
                if key in data["errors"]:
                    harness_err_count += 1

        if not scores:
            continue

        mean_score = sum(scores) / len(scores)
        mean_c = sum(completions) / len(completions) if completions else 0
        output_rate = sum(outputs) / len(outputs) if outputs else 0

        if mean_score < 0.1:
            issues["task_fails_all_models"].append((task, mean_score, output_rate))
        if mean_c < 0.05 and output_rate > 0.5:
            issues["verifier_rejects_valid_outputs"].append((task, mean_c, output_rate))
        if harness_err_count >= 5:
            issues["harness_errors_cluster"].append((task, harness_err_count))
        if judge_infra_count >= 5:
            issues["judge_infra_cluster"].append((task, judge_infra_count))

    rule = "=" * 70

    def banner(title):
        # Section heading framed by two rules.
        print(rule)
        print(title)
        print(rule)

    banner("ISSUE: Tasks where ALL models score near-zero (broken verifier or task)")
    for task, mean, out_rate in sorted(issues["task_fails_all_models"]):
        print(f" {task:<40} mean_score={mean:.3f} assistant_output_rate={out_rate:.1%}")

    print()
    banner("ISSUE: Verifier rejects valid outputs (model produced text but C=0)")
    for task, mean_c, out_rate in sorted(issues["verifier_rejects_valid_outputs"]):
        print(f" {task:<40} mean_completion={mean_c:.3f} assistant_output_rate={out_rate:.1%}")

    print()
    banner("ISSUE: Harness-error clusters (gateway failures per task)")
    for task, n in sorted(issues["harness_errors_cluster"], key=lambda x: -x[1]):
        print(f" {task:<40} harness_error_count={n}")

    print()
    banner("ISSUE: Judge-infra clusters (judge failing per task)")
    for task, n in sorted(issues["judge_infra_cluster"], key=lambda x: -x[1]):
        print(f" {task:<40} judge_infra_failures={n} (should be rejudged)")

    # Per-model per-task pathologies.
    print()
    banner("ISSUE: Model-specific task pathologies (all 3 runs of a task scored 0 on one model)")
    for pretty, data in per_model.items():
        zero_tasks = []
        for task in ordered_tasks:
            every_run_seen = True
            scored_above_zero = False
            for run_idx in range(3):
                key = (task, run_idx)
                archived_run = data["archived"].get(key)
                logged_run = data["logged"].get(key)
                if archived_run:
                    if archived_run["run_score"] > 0.01:
                        scored_above_zero = True
                elif logged_run:
                    if logged_run["score"] > 0.01:
                        scored_above_zero = True
                else:
                    # A missing run means the all-zero claim can't be confirmed.
                    every_run_seen = False
            if every_run_seen and not scored_above_zero:
                zero_tasks.append(task)
        if zero_tasks:
            print(f" {pretty:<18}: all-zero on {len(zero_tasks)} tasks")
            for t in zero_tasks[:6]:
                print(f" - {t}")

    # Task coverage mismatches.
    print()
    banner("COVERAGE: Models with non-complete coverage (logged != 120 or archived != 120)")
    for pretty, data in per_model.items():
        n_log = len(data["logged"])
        n_arch = len(data["archived"])
        if n_log < 120 or n_arch < 120:
            print(f" {pretty:<18} logged={n_log:<4} archived={n_arch:<4} missing={120 - max(n_log, n_arch)}")
if __name__ == "__main__":
main()

View File

@ -1,207 +0,0 @@
"""Comprehensive per-run audit across all models in drift_2026-04-19-full.
For each model, cross-references:
1. Log file (docker_<label>_<tag>.log) all [N/120] run attempts + their scores
2. Archived per-run JSONs (run_cache_archive/<tag>/<cache_sub>/<task>/runN.json)
3. Judge status per cached run (rejudged via direct API or not)
Outputs a fair-comparison table: coverage %, infra-failure %, clean mean,
coverage-normalized score, judge coverage.
Usage:
python3 scripts/audit_runs.py
"""
from __future__ import annotations
import json
import re
from collections import defaultdict
from pathlib import Path
ROOT = Path(__file__).resolve().parent.parent
DRIFT = ROOT / "data" / "drift_2026-04-19-full"
ARCH = ROOT / "data" / "run_cache_archive" / "v2026-4-19-full"
# Model label (in log filenames) → (cache_sub, pretty name)
MODEL_MAP = {
"opus46": ("anthropic_claude-opus-4-6", "opus-4-6"),
"opus47": ("anthropic_claude-opus-4-7", "opus-4-7"),
"sonnet46": ("anthropic_claude-sonnet-4-6", "sonnet-4-6"),
"gpt54": ("openai_gpt-5.4", "gpt-5.4"),
"gemini": ("google_gemini-3.1-pro-preview", "gemini-3.1-pro"),
"glm": ("openrouter_z-ai_glm-5.1", "glm-5.1"),
"minimax": ("openrouter_minimax_minimax-m2.7", "minimax-m2.7"),
"kimi": ("openrouter_moonshotai_kimi-k2.5", "kimi-k2.5"),
"qwen": ("openrouter_qwen_qwen3.6-plus", "qwen-3.6-plus"),
}
# Regex to parse "[N/120] task (tier/family) run R: + 0.93 C=1.00 T=0.90 ..."
LOG_LINE = re.compile(
r"^\[(\d+)/120\]\s+(\S+)\s+\([^)]+\)\s+run\s+(\d+):\s+([+\-~])\s+([\d.]+)"
)
JUDGE_INFRA_PHRASES = [
"gateway is restarting",
"judge execution failed",
"judge failed to run",
"judge call failed",
"judge timed out",
]
def parse_log(path: Path) -> dict:
    """Collect scored run lines from a sweep log.

    Returns ``{(task_id, run_idx): {"score", "outcome", "seq"}}`` where
    ``outcome`` is the ``+``/``-``/``~`` marker from the log line and ``seq``
    is the leading ``[N/120]`` counter. A missing file yields an empty dict.
    """
    runs = {}
    if not path.exists():
        return runs

    for raw_line in path.read_text(errors="ignore").splitlines():
        parsed = LOG_LINE.match(raw_line.strip())
        if parsed:
            seq, task, run_no, outcome, score = parsed.groups()
            # The log counts runs from 1 while the archive files are runN.json
            # counted from 0; shift so both sources share the same keys.
            # A later line for the same key replaces the earlier one (retries).
            runs[(task, int(run_no) - 1)] = {
                "score": float(score),
                "outcome": outcome,
                "seq": int(seq),
            }
    return runs
def scan_archive(cache_dir: Path) -> dict:
    """Index archived run JSONs under *cache_dir* by ``(task_id, run_idx)``.

    Every sub-directory of *cache_dir* is treated as one task and each
    ``run<N>.json`` inside it becomes one entry. Files with a non-matching
    name, unreadable files, malformed JSON and non-object JSON are skipped.
    Sections that are missing or ``null`` are read as empty.

    Returns a dict whose values hold ``run_score``, ``completion``,
    ``judge_score`` (``None`` when the judge is not enabled),
    ``judge_infra_failed``, ``rejudged``, ``delivery`` and ``failure_mode``.
    A missing *cache_dir* yields an empty dict.
    """
    out = {}
    if not cache_dir.exists():
        return out

    def as_dict(value):
        # Archived sections may be absent or JSON null; read both as empty.
        return value if isinstance(value, dict) else {}

    for tdir in cache_dir.iterdir():
        if not tdir.is_dir():
            continue
        for rf in tdir.glob("run*.json"):
            # Validate the name first so stray files are never parsed.
            m_run = re.match(r"run(\d+)\.json", rf.name)
            if not m_run:
                continue
            try:
                # Context manager so the handle is closed even on parse errors.
                with open(rf, encoding="utf-8") as fh:
                    d = json.load(fh)
            except (OSError, ValueError):
                continue
            if not isinstance(d, dict):
                continue

            jr = as_dict(d.get("judge_result"))
            reason = (jr.get("reason") or "").lower()
            judge_enabled = jr.get("enabled")
            # Only an enabled judge can have failed for infrastructure reasons:
            # a known failure phrase, an explicit error, or an empty reason
            # together with a zero score.
            judge_infra_failed = bool(
                judge_enabled
                and (
                    any(p in reason for p in JUDGE_INFRA_PHRASES)
                    or jr.get("error")
                    or (not reason.strip() and jr.get("score", 0) == 0)
                )
            )
            out[(tdir.name, int(m_run.group(1)))] = {
                "run_score": d.get("run_score", 0),
                "completion": as_dict(d.get("completion_result")).get("score", 0),
                "judge_score": jr.get("score", 0) if judge_enabled else None,
                "judge_infra_failed": judge_infra_failed,
                "rejudged": "rejudged_at" in jr,
                "delivery": d.get("delivery_outcome"),
                "failure_mode": d.get("failure_mode"),
            }
    return out
def audit_model(label: str, cache_sub: str, pretty: str) -> dict:
    """Cross-reference one model's sweep log with its archived runs.

    Args:
        label: Model label used in the log filename
            (``docker_<label>_v2026-4-19-full.log`` under ``DRIFT``).
        cache_sub: Name of the model's sub-directory under ``ARCH``.
        pretty: Display name, passed through unchanged to the result.

    Returns:
        A dict of counts and means for the model. A run counts as "clean"
        when its score is at least 0.01, whether it comes from the archive or
        only from the log; judge failures are counted separately and do not
        remove a run from the clean set. ``coverage_pct`` and
        ``coverage_normalized`` use a fixed denominator of 120 expected runs,
        so runs that are missing or not clean contribute zero.
    """
    logged = parse_log(DRIFT / f"docker_{label}_v2026-4-19-full.log")
    archived = scan_archive(ARCH / cache_sub)
    expected = 120

    # Runs the log knows about that never landed in the archive.
    not_archived = [k for k in logged if k not in archived]

    clean_scores = []
    n_archived_zero = 0
    n_judge_infra_failed = 0
    n_rejudged = 0
    for a in archived.values():
        if a["judge_infra_failed"] and not a["rejudged"]:
            n_judge_infra_failed += 1
        if a["rejudged"]:
            n_rejudged += 1
        if a["run_score"] < 0.01:
            n_archived_zero += 1
        else:
            clean_scores.append(a["run_score"])

    # Logged at ~0 and never archived: treated as pure infra failures.
    n_logged_infra_zero = 0
    for k in not_archived:
        score = logged[k]["score"]
        if score < 0.01:
            n_logged_infra_zero += 1
        else:
            clean_scores.append(score)

    # Every attempt: archive scores first, then log-only scores.
    all_scores = [a["run_score"] for a in archived.values()]
    all_scores.extend(logged[k]["score"] for k in not_archived)

    clean_mean = sum(clean_scores) / len(clean_scores) if clean_scores else 0
    all_mean = sum(all_scores) / len(all_scores) if all_scores else 0

    return {
        "label": label,
        "pretty": pretty,
        "n_log_entries": len(logged),
        "n_archived": len(archived),
        "n_missing_from_archive": len(not_archived),
        "n_clean_runs": len(clean_scores),
        "n_archived_zero": n_archived_zero,
        "n_logged_infra_zero": n_logged_infra_zero,
        "n_judge_infra_failed": n_judge_infra_failed,
        "n_rejudged": n_rejudged,
        "coverage_pct": 100.0 * len(clean_scores) / expected,
        "clean_mean": clean_mean,
        "all_mean": all_mean,
        # Clean scores over the full expected-run denominator.
        "coverage_normalized": sum(clean_scores) / expected,
    }
def main():
    """Print the fair-comparison table for every model in ``MODEL_MAP``.

    Rows are ordered by descending coverage-normalized score and followed by
    a legend explaining each column.
    """
    print(f"{'Model':<16} {'Logged':>7} {'Archv':>6} {'Clean':>6} {'Cov%':>5} {'all_mean':>8} {'clean':>7} {'cov_norm':>8} {'infra_0':>8} {'j_rejdg':>8} {'j_failed':>8}")
    print(f"{'-'*16} {'-'*7} {'-'*6} {'-'*6} {'-'*5} {'-'*8} {'-'*7} {'-'*8} {'-'*8} {'-'*8} {'-'*8}")

    audits = [
        audit_model(label, cache_sub, pretty)
        for label, (cache_sub, pretty) in MODEL_MAP.items()
    ]
    # Highest coverage-normalized score first.
    for row in sorted(audits, key=lambda item: -item["coverage_normalized"]):
        infra_zero = row["n_logged_infra_zero"] + row["n_archived_zero"]
        print(
            f" {row['pretty']:<14} {row['n_log_entries']:>7} {row['n_archived']:>6} "
            f"{row['n_clean_runs']:>6} {row['coverage_pct']:>4.0f}% "
            f"{row['all_mean']:>8.4f} {row['clean_mean']:>7.4f} "
            f"{row['coverage_normalized']:>8.4f} "
            f"{infra_zero:>8} "
            f"{row['n_rejudged']:>8} {row['n_judge_infra_failed']:>8}"
        )

    # Spell out what each column means.
    print()
    legend = (
        "Legend:",
        " all_mean = mean of ALL attempts (log+archive merged; infra-zeros pull this DOWN)",
        " clean = mean excluding infra-failed runs (shows capability ceiling)",
        " cov_norm = clean*coverage + 0*missing; all models scored against 120-run denominator",
        " infra_0 = runs that scored 0 due to infrastructure (gateway/state/handshake failures)",
        " j_rejdg = judge scores that have been rejudged via direct Anthropic API",
        " j_failed = judge infra-failures that have NOT been rejudged",
    )
    for legend_line in legend:
        print(legend_line)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,460 @@
#!/bin/bash
# Fair adapter lane runner.
#
# Runs one adapter/model pair inside a container-owned workspace/state dir.
# Use docker run with full container privileges when measuring harnesses:
# docker run --rm --privileged --cap-add=ALL \
# --security-opt seccomp=unconfined --security-opt apparmor=unconfined \
# --user root --env-file .tmp/docker_eval.env \
# -e SWEEP_ADAPTER=hermes -e SWEEP_MODEL=openai/gpt-5.4 \
# -e SWEEP_LABEL=hermes-gpt54 -e SWEEP_OUT_TAG=fair-20260425 \
# -v "$PWD/data/fair-container:/data" \
# -v "$PWD/data/container-home-openclaw:/config/openclaw:ro" \
# clawbench-fair:latest
set -u
: "${SWEEP_ADAPTER:?SWEEP_ADAPTER required (openclaw|hermes)}"
: "${SWEEP_MODEL:?SWEEP_MODEL required (e.g. openai/gpt-5.4)}"
: "${SWEEP_LABEL:?SWEEP_LABEL required}"
: "${SWEEP_OUT_TAG:=fair-container}"
: "${SWEEP_LOGDIR:=/data/fair_results}"
: "${SWEEP_RUNS:=1}"
: "${SWEEP_CONCURRENCY:=1}"
: "${SWEEP_BROWSER_CONCURRENCY:=1}"
: "${CLAWBENCH_PER_RUN_BUDGET_SECONDS:=300}"
: "${CLAWBENCH_PER_TURN_TIMEOUT_SECONDS:=180}"
: "${HERMES_MAX_ITERATIONS:=90}"
: "${HERMES_STEP_TIMEOUT_SECONDS:=60}"
: "${OPENCLAW_EXEC_HOST:=gateway}"
cd /home/node/app
mkdir -p "$SWEEP_LOGDIR" /data/run_cache
export OPENCLAW_GATEWAY_TOKEN="${OPENCLAW_GATEWAY_TOKEN:-local-dev-token-for-testing}"
export OPENCLAW_GATEWAY_URL="${OPENCLAW_GATEWAY_URL:-ws://127.0.0.1:18789}"
export OPENCLAW_SKIP_GMAIL_WATCHER=1
export OPENCLAW_SKIP_CANVAS_HOST=1
export OPENCLAW_NO_RESPAWN=1
export CLAWBENCH_DISABLE_GATEWAY_DEVICE_IDENTITY=1
export NODE_OPTIONS="${NODE_OPTIONS:-"--max-old-space-size=4096"}"
if command -v npm >/dev/null 2>&1; then
export NODE_PATH="${NODE_PATH:-$(npm root -g 2>/dev/null || true)}"
fi
export CLAWBENCH_PER_RUN_BUDGET_SECONDS
export CLAWBENCH_PER_TURN_TIMEOUT_SECONDS
export HERMES_AGENT_REPO="${HERMES_AGENT_REPO:-/opt/hermes-agent}"
export HERMES_DRIVER="${HERMES_DRIVER:-ai_agent}"
export HERMES_TOOLSETS="${HERMES_TOOLSETS:-hermes-api-server}"
export HERMES_MAX_ITERATIONS
export HERMES_STEP_TIMEOUT_SECONDS
export TERMINAL_ENV="${TERMINAL_ENV:-local}"
safe_model="${SWEEP_MODEL//\//_}"
safe_model="${safe_model//:/_}"
safe_label="${SWEEP_LABEL//\//_}"
safe_label="${safe_label//:/_}"
export CLAWBENCH_RUN_CACHE_DIR="/data/run_cache/$safe_label"
mkdir -p "$CLAWBENCH_RUN_CACHE_DIR"
cache_sub="${SWEEP_ADAPTER}-${safe_model}"
cache_paths=("$CLAWBENCH_RUN_CACHE_DIR/$cache_sub")
if [ "$SWEEP_ADAPTER" = "openclaw" ]; then
cache_paths+=("$CLAWBENCH_RUN_CACHE_DIR/$safe_model")
fi
SRC_STATE="${OPENCLAW_CONFIG_SOURCE:-/config/openclaw}"
if [ ! -d "$SRC_STATE" ]; then
SRC_STATE="/home/node/.openclaw"
fi
FRESH_HOME="/tmp/openclaw-home-${SWEEP_LABEL}-$$"
FRESH_STATE="$FRESH_HOME/.openclaw"
rm -rf "$FRESH_HOME"
mkdir -p "$FRESH_STATE" "$FRESH_HOME/.config"
if [ -f "$SRC_STATE/openclaw.json" ]; then
cp "$SRC_STATE/openclaw.json" "$FRESH_STATE/openclaw.json"
fi
mkdir -p \
"$FRESH_STATE/agents" \
"$FRESH_STATE/workspace" \
"$FRESH_STATE/logs" \
"$FRESH_STATE/memory" \
"$FRESH_STATE/cache" \
"$FRESH_STATE/identity" \
"$FRESH_STATE/devices" \
"$FRESH_STATE/tasks" \
"$FRESH_STATE/subagents" \
"$FRESH_STATE/flows" \
"$FRESH_STATE/cron"
chmod -R 777 "$FRESH_STATE" 2>/dev/null || true
export HOME="$FRESH_HOME"
export OPENCLAW_HOME="$FRESH_HOME"
export OPENCLAW_STATE_DIR="$FRESH_STATE"
export OPENCLAW_CONFIG_PATH="$FRESH_STATE/openclaw.json"
export OPENCLAW_REPO="${OPENCLAW_REPO:-/app}"
export XDG_CONFIG_HOME="$FRESH_HOME/.config"
export HERMES_HOME_BASE="${HERMES_HOME_BASE:-$FRESH_HOME/.hermes}"
export HERMES_HOME="$HERMES_HOME_BASE"
mkdir -p "$HERMES_HOME"
if [ "$SWEEP_ADAPTER" = "hermes" ]; then
unset HERMES_PROVIDER
case "$SWEEP_MODEL" in
openai/*)
if [ -z "${OPENAI_API_KEY:-}" ] && [ -n "${HERMES_API_KEY:-}" ]; then
export OPENAI_API_KEY="$HERMES_API_KEY"
fi
export HERMES_BASE_URL="${HERMES_BASE_URL:-${OPENAI_BASE_URL:-https://api.openai.com/v1}}"
export OPENAI_BASE_URL="$HERMES_BASE_URL"
if [ -n "${OPENAI_API_KEY:-}" ]; then
export HERMES_API_KEY="$OPENAI_API_KEY"
fi
unset ANTHROPIC_API_KEY ANTHROPIC_TOKEN CLAUDE_CODE_OAUTH_TOKEN OPENROUTER_API_KEY
;;
anthropic/*)
unset OPENAI_API_KEY OPENAI_BASE_URL HERMES_API_KEY HERMES_BASE_URL OPENROUTER_API_KEY
;;
*)
if [ -n "${HERMES_BASE_URL:-}" ]; then
export OPENAI_BASE_URL="$HERMES_BASE_URL"
elif [ -z "${OPENAI_BASE_URL:-}" ] && [ -n "${OPENAI_API_KEY:-}" ]; then
export OPENAI_BASE_URL="https://api.openai.com/v1"
fi
if [ -n "${HERMES_API_KEY:-}" ] && [ -z "${OPENAI_API_KEY:-}" ]; then
export OPENAI_API_KEY="$HERMES_API_KEY"
fi
;;
esac
fi
# Rewrite the copied openclaw.json for an isolated eval run. The embedded
# script reports fatal problems through its exit status, so stop the lane here
# instead of evaluating against a half-prepared config.
python - <<'PY' || { echo "ERROR: failed to prepare $OPENCLAW_CONFIG_PATH" >&2; rm -rf "$FRESH_HOME" 2>/dev/null; exit 1; }
import json
import os
from pathlib import Path

cfg_path = Path(os.environ["OPENCLAW_CONFIG_PATH"])
if not cfg_path.exists():
    # Nothing was copied from the source state dir; nothing to rewrite.
    raise SystemExit(0)
data = json.loads(cfg_path.read_text(encoding="utf-8"))

agents = data.get("agents")
if isinstance(agents, dict):
    # Keep static defaults, but never seed eval containers with old session-specific
    # agent records from the developer machine.
    agents["list"] = []

# Disable every configured channel and its exec-approval prompts.
channels = data.get("channels")
if isinstance(channels, dict):
    for channel in channels.values():
        if isinstance(channel, dict):
            channel["enabled"] = False
            exec_approvals = channel.get("execApprovals")
            if not isinstance(exec_approvals, dict):
                exec_approvals = {}
                channel["execApprovals"] = exec_approvals
            exec_approvals["enabled"] = False

# Remove the plugins named in `stale` from both the allow-list and the entries.
plugins = data.get("plugins")
if isinstance(plugins, dict):
    stale = {"marxbiotech-git-tools", "lab"}
    allow = plugins.get("allow")
    if isinstance(allow, list):
        plugins["allow"] = [item for item in allow if item not in stale]
    entries = plugins.get("entries")
    if isinstance(entries, dict):
        for item in stale:
            entries.pop(item, None)
def set_nested(root, dotted, value):
    """Assign *value* at the dot-separated path *dotted* inside dict *root*.

    Intermediate keys that are missing, or that hold something other than a
    dict, are replaced with fresh dicts on the way down. *root* is modified
    in place; nothing is returned.
    """
    *parents, leaf = dotted.split(".")
    node = root
    for name in parents:
        nxt = node.get(name)
        if not isinstance(nxt, dict):
            nxt = node[name] = {}
        node = nxt
    node[leaf] = value
set_nested(data, "browser.headless", True)
set_nested(data, "browser.noSandbox", True)
set_nested(data, "gateway.reload.mode", "off")
set_nested(data, "agents.defaults.skipBootstrap", True)
set_nested(data, "agents.defaults.sandbox.mode", "off")
exec_host = os.environ.get("OPENCLAW_EXEC_HOST", "gateway").strip().lower()
if exec_host not in {"auto", "gateway", "sandbox", "node"}:
raise SystemExit(f"invalid OPENCLAW_EXEC_HOST={exec_host!r}")
set_nested(data, "tools.exec.host", exec_host)
set_nested(data, "tools.exec.security", "full")
set_nested(data, "tools.exec.ask", "off")
set_nested(data, "approvals.exec.enabled", False)
model = os.environ.get("SWEEP_MODEL", "").strip()
if model:
set_nested(data, "agents.defaults.model.primary", model)
set_nested(data, "agents.defaults.subagents.model.primary", model)
tmp_path = cfg_path.with_suffix(".json.tmp")
tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
tmp_path.replace(cfg_path)
approvals_path = cfg_path.with_name("exec-approvals.json")
approvals = {
"version": 1,
"socket": {
"path": str(approvals_path.with_suffix(".sock")),
"token": "container-eval-token",
},
"defaults": {
"security": "full",
"ask": "off",
"askFallback": "full",
},
"agents": {
"*": {
"security": "full",
"ask": "off",
"askFallback": "full",
}
},
}
approvals_path.write_text(json.dumps(approvals, indent=2), encoding="utf-8")
PY
if [ "$SWEEP_ADAPTER" = "hermes" ]; then
python - <<'PY'
import os
from pathlib import Path
from urllib.parse import urlparse
model = os.environ["SWEEP_MODEL"].strip()
base_url = (os.environ.get("HERMES_BASE_URL") or os.environ.get("OPENAI_BASE_URL") or "").strip()
provider = "custom"
effective_model = model
aux_base_url = ""
aux_api_mode = ""
if model.startswith("anthropic/"):
provider = "anthropic"
elif urlparse(base_url).hostname == "api.openai.com" and model.startswith("openai/"):
effective_model = model.split("/", 1)[1]
aux_base_url = base_url
if effective_model.lower().startswith("gpt-5"):
aux_api_mode = "codex_responses"
elif base_url:
aux_base_url = base_url
tasks = [
"vision",
"web_extract",
"compression",
"session_search",
"skills_hub",
"approval",
"mcp",
"title_generation",
]
lines = [
"model:",
f" provider: {provider}",
f" default: {effective_model}",
]
if aux_base_url:
lines.append(f" base_url: {aux_base_url}")
if aux_api_mode:
lines.append(f" api_mode: {aux_api_mode}")
lines.append("auxiliary:")
for task in tasks:
timeout = 360 if task == "web_extract" else 120 if task in {"vision", "compression"} else 30
lines.extend([
f" {task}:",
" provider: main",
f" model: {effective_model}",
f" timeout: {timeout}",
])
if aux_base_url:
lines.append(f" base_url: {aux_base_url}")
if aux_api_mode:
lines.append(f" api_mode: {aux_api_mode}")
if task == "session_search":
lines.append(" max_concurrency: 1")
path = Path(os.environ["HERMES_HOME"]) / "config.yaml"
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text("\n".join(lines) + "\n", encoding="utf-8")
PY
fi
OUT="$SWEEP_LOGDIR/${SWEEP_LABEL}_${SWEEP_ADAPTER}_${safe_model}_${SWEEP_OUT_TAG}.json"
LOG="$SWEEP_LOGDIR/${SWEEP_LABEL}_${SWEEP_ADAPTER}_${safe_model}_${SWEEP_OUT_TAG}.log"
GWLOG="$SWEEP_LOGDIR/gateway_${SWEEP_LABEL}_${SWEEP_OUT_TAG}.log"
HERMES_AGENT_LOG="$SWEEP_LOGDIR/hermes_agent_${SWEEP_LABEL}_${SWEEP_OUT_TAG}.log"
HERMES_ERROR_LOG="$SWEEP_LOGDIR/hermes_errors_${SWEEP_LABEL}_${SWEEP_OUT_TAG}.log"
echo "===== CONTAINER ADAPTER EVAL START $(date '+%Y-%m-%d %H:%M:%S') ====="
echo "uid: $(id -u) ($(id -un 2>/dev/null || true))"
echo "adapter: $SWEEP_ADAPTER"
echo "model: $SWEEP_MODEL"
echo "runs: $SWEEP_RUNS"
echo "execHost: $OPENCLAW_EXEC_HOST"
echo "out: $OUT"
echo "cache: ${cache_paths[*]}"
echo "home: $HOME"
echo "state: $OPENCLAW_STATE_DIR"
echo "hermes: ${HERMES_HOME:-}"
openclaw --version 2>/dev/null || true
python - <<'PY' 2>/dev/null || true
import os, subprocess
repo = os.environ.get("HERMES_AGENT_REPO", "")
if repo:
try:
sha = subprocess.check_output(["git", "-C", repo, "rev-parse", "HEAD"], text=True).strip()
print(f"Hermes git: {sha}")
except Exception:
print(f"Hermes repo: {repo}")
PY
rm -rf "${cache_paths[@]}"
rm -f "$OUT" "$LOG"
GATEWAY_PID=""
# Copy the Hermes agent and error logs, when present, out of the throwaway
# HERMES_HOME into the sweep log directory. Copy failures are ignored.
preserve_hermes_logs() {
  local logs_dir="${HERMES_HOME:-}/logs"
  local name dest
  for name in agent errors; do
    case "$name" in
      agent) dest="$HERMES_AGENT_LOG" ;;
      errors) dest="$HERMES_ERROR_LOG" ;;
    esac
    if [ -f "$logs_dir/$name.log" ]; then
      cp "$logs_dir/$name.log" "$dest" 2>/dev/null || true
    fi
  done
}
# EXIT handler: save the Hermes logs, stop the gateway if one was started,
# and remove the per-run home directory. Every step is best-effort.
cleanup() {
  preserve_hermes_logs
  local gateway="${GATEWAY_PID:-}"
  if [ -n "$gateway" ]; then
    kill "$gateway" 2>/dev/null || true
    # Reap the gateway so it is gone before its home directory is deleted.
    wait "$gateway" 2>/dev/null || true
  fi
  rm -rf "${FRESH_HOME:-}" 2>/dev/null || true
}
trap cleanup EXIT
# OpenClaw adapter only: start an isolated gateway in the background, wait for
# its health endpoint, verify that it really runs against the per-run
# home/state directories, then wait until a session can be created and deleted.
if [ "$SWEEP_ADAPTER" = "openclaw" ]; then
echo "Starting OpenClaw gateway on :18789 ..."
# Every home/config lookup of the gateway is pointed at the fresh per-run dirs.
HOME="$FRESH_HOME" \
OPENCLAW_HOME="$FRESH_HOME" \
OPENCLAW_STATE_DIR="$FRESH_STATE" \
OPENCLAW_CONFIG_PATH="$FRESH_STATE/openclaw.json" \
XDG_CONFIG_HOME="$FRESH_HOME/.config" \
openclaw gateway run \
--allow-unconfigured \
--dev \
--bind loopback \
--port 18789 \
--auth token \
--token "$OPENCLAW_GATEWAY_TOKEN" \
--compact \
> "$GWLOG" 2>&1 &
# Remember the background PID so the exit handler can stop the gateway.
GATEWAY_PID=$!
# Poll /healthz once per second, for at most 180 attempts.
ready=0
for i in $(seq 1 180); do
if curl -sf -H "Authorization: Bearer $OPENCLAW_GATEWAY_TOKEN" http://127.0.0.1:18789/healthz > /dev/null 2>&1; then
echo "Gateway healthy after ${i}s"
ready=1
break
fi
sleep 1
done
if [ "$ready" -ne 1 ]; then
echo "ERROR: gateway failed to become healthy"
tail -80 "$GWLOG" 2>/dev/null || true
exit 1
fi
# Where /proc is readable, confirm the gateway process inherited
# HOME=$FRESH_HOME; a different value means the run is not isolated.
if [ -r "/proc/$GATEWAY_PID/environ" ]; then
actual_home="$(tr '\0' '\n' < "/proc/$GATEWAY_PID/environ" | awk -F= '$1 == "HOME" { print $2; exit }')"
if [ "$actual_home" != "$FRESH_HOME" ]; then
echo "ERROR: gateway HOME escaped container eval home: ${actual_home:-<unset>} != $FRESH_HOME"
tail -120 "$GWLOG" 2>/dev/null || true
exit 1
fi
fi
# exec-approvals.json must exist in the fresh state dir and must not mention
# the default /home/node/.openclaw location.
if [ ! -f "$FRESH_STATE/exec-approvals.json" ] || grep -q '/home/node/.openclaw' "$FRESH_STATE/exec-approvals.json"; then
echo "ERROR: exec approvals are not isolated in $FRESH_STATE"
exit 1
fi
echo "Waiting for OpenClaw session control plane ..."
# A passing /healthz is not treated as sufficient: the probe below retries a
# create-session + delete-session round trip every 5 seconds until it succeeds
# or its 240-second deadline passes. It reads OPENCLAW_GATEWAY_URL,
# OPENCLAW_GATEWAY_TOKEN and SWEEP_MODEL from the environment.
python - <<'PY'
import asyncio
import os
import sys
import time
from clawbench.client import GatewayClient, GatewayConfig
async def probe_once(attempt: int) -> None:
config = GatewayConfig(
url=os.environ["OPENCLAW_GATEWAY_URL"],
token=os.environ["OPENCLAW_GATEWAY_TOKEN"],
connect_timeout=30.0,
request_timeout=30.0,
)
async with GatewayClient(config) as client:
key = await client.create_session(
model=os.environ["SWEEP_MODEL"],
label=f"clawbench-readiness-probe-{os.getpid()}-{attempt}",
)
await client.delete_session(key)
async def main() -> int:
deadline = time.monotonic() + 240
attempt = 0
last_error = ""
while time.monotonic() < deadline:
attempt += 1
try:
await probe_once(attempt)
print(f"Gateway session control plane ready after {attempt} attempt(s)")
return 0
except Exception as exc:
last_error = f"{type(exc).__name__}: {exc}"
print(f"Gateway control probe {attempt} not ready: {last_error}")
await asyncio.sleep(5)
print(f"ERROR: gateway session control plane did not become ready: {last_error}", file=sys.stderr)
return 1
raise SystemExit(asyncio.run(main()))
PY
# $? here is the exit status of the probe script above.
if [ "$?" -ne 0 ]; then
tail -120 "$GWLOG" 2>/dev/null || true
exit 1
fi
fi
# Build repeated `--task <id>` arguments from the comma-separated CHERRY_TASKS
# list. Whitespace around each id is trimmed and empty entries are dropped, so
# "a, b," selects exactly the tasks `a` and `b` instead of passing " b" or an
# empty id to the CLI.
TASK_ARGS=()
if [ -n "${CHERRY_TASKS:-}" ]; then
  IFS=',' read -ra TASK_ARR <<< "$CHERRY_TASKS"
  for task_id in "${TASK_ARR[@]}"; do
    # Trim leading, then trailing, whitespace using parameter expansion only.
    task_id="${task_id#"${task_id%%[![:space:]]*}"}"
    task_id="${task_id%"${task_id##*[![:space:]]}"}"
    if [ -n "$task_id" ]; then
      TASK_ARGS+=("--task" "$task_id")
    fi
  done
fi

# Run the benchmark; all CLI output goes to $LOG.
# The ${arr[@]+...} form expands to nothing for an empty array, which avoids
# an "unbound variable" error under `set -u` on bash older than 4.4.
clawbench run \
  --adapter "$SWEEP_ADAPTER" \
  --model "$SWEEP_MODEL" \
  --runs "$SWEEP_RUNS" \
  --concurrency "$SWEEP_CONCURRENCY" \
  --browser-concurrency "$SWEEP_BROWSER_CONCURRENCY" \
  --no-randomize \
  ${TASK_ARGS[@]+"${TASK_ARGS[@]}"} \
  --output "$OUT" \
  > "$LOG" 2>&1
status=$?

# Save the Hermes logs right away, show the end of the run log, and exit with
# the benchmark's own status.
preserve_hermes_logs
echo "===== clawbench exit=$status $(date '+%Y-%m-%d %H:%M:%S') ====="
tail -80 "$LOG" 2>/dev/null || true
exit "$status"

220
scripts/container_lane_eval.sh Executable file
View File

@ -0,0 +1,220 @@
#!/bin/bash
# Run one OpenClaw model/profile through the HF-style isolated lane worker.
#
# Required environment: SWEEP_MODEL, SWEEP_LABEL.
# Optional environment (defaults are assigned below): SWEEP_OUT_TAG,
# SWEEP_LANES, SWEEP_RUNS, SWEEP_LOGDIR, CLAWBENCH_PER_RUN_BUDGET_SECONDS,
# CLAWBENCH_PER_TURN_TIMEOUT_SECONDS, OPENCLAW_EXEC_HOST.
# Abort on command failure, unset variables and failed pipeline stages.
set -Eeuo pipefail
: "${SWEEP_MODEL:?SWEEP_MODEL required}"
: "${SWEEP_LABEL:?SWEEP_LABEL required}"
: "${SWEEP_OUT_TAG:=lane-container}"
: "${SWEEP_LANES:=3}"
: "${SWEEP_RUNS:=1}"
: "${SWEEP_LOGDIR:=/data/results}"
: "${CLAWBENCH_PER_RUN_BUDGET_SECONDS:=900}"
: "${CLAWBENCH_PER_TURN_TIMEOUT_SECONDS:=300}"
: "${OPENCLAW_EXEC_HOST:=gateway}"
cd /home/node/app
# The local job queue lives in a per-label directory unless overridden.
export CLAWBENCH_LOCAL_QUEUE_DIR="${CLAWBENCH_LOCAL_QUEUE_DIR:-/data/queue/$SWEEP_LABEL}"
mkdir -p "$SWEEP_LOGDIR" /data/results "$CLAWBENCH_LOCAL_QUEUE_DIR" /data/run_cache /data/lane_runtime
# HF_TOKEN is deliberately blanked for this run.
# NOTE(review): presumably this keeps the worker from using the Hugging Face
# hub; confirm against clawbench.queue / clawbench.worker.
export HF_TOKEN=""
export OPENCLAW_GATEWAY_TOKEN="${OPENCLAW_GATEWAY_TOKEN:-local-dev-token-for-testing}"
export OPENCLAW_SKIP_GMAIL_WATCHER=1
export OPENCLAW_SKIP_CANVAS_HOST=1
export OPENCLAW_NO_RESPAWN=1
export CLAWBENCH_DISABLE_GATEWAY_DEVICE_IDENTITY=1
# These two were given defaults above; export them for the Python worker.
export CLAWBENCH_PER_RUN_BUDGET_SECONDS
export CLAWBENCH_PER_TURN_TIMEOUT_SECONDS
# Timeouts and lane tuning; each keeps a caller-provided value if one is set.
export CLAWBENCH_CONNECT_TIMEOUT="${CLAWBENCH_CONNECT_TIMEOUT:-180}"
export CLAWBENCH_REQUEST_TIMEOUT="${CLAWBENCH_REQUEST_TIMEOUT:-300}"
export CLAWBENCH_GATEWAY_HEALTH_TIMEOUT_SECONDS="${CLAWBENCH_GATEWAY_HEALTH_TIMEOUT_SECONDS:-240}"
export CLAWBENCH_LANE_STARTUP_STAGGER_SECONDS="${CLAWBENCH_LANE_STARTUP_STAGGER_SECONDS:-90}"
export CLAWBENCH_GATEWAY_READY_MARKER_GRACE_SECONDS="${CLAWBENCH_GATEWAY_READY_MARKER_GRACE_SECONDS:-90}"
export CLAWBENCH_KEEP_PARALLEL_LANE_ROOT="${CLAWBENCH_KEEP_PARALLEL_LANE_ROOT:-0}"
export CLAWBENCH_PARALLEL_LANE_ROOT="/data/lane_runtime/$SWEEP_LABEL"
export CLAWBENCH_TOOL_PROFILE_NAME="${CLAWBENCH_TOOL_PROFILE_NAME:-$SWEEP_LABEL}"
# Default the Node.js old-space heap limit to 4096 MB unless NODE_OPTIONS is set.
export NODE_OPTIONS="${NODE_OPTIONS:-"--max-old-space-size=4096"}"
# Default NODE_PATH to npm's global module root when npm is available.
if command -v npm >/dev/null 2>&1; then
export NODE_PATH="${NODE_PATH:-$(npm root -g 2>/dev/null || true)}"
fi
# Source of the OpenClaw config: OPENCLAW_CONFIG_SOURCE (default
# /config/openclaw), falling back to /home/node/.openclaw if that directory
# does not exist.
SRC_STATE="${OPENCLAW_CONFIG_SOURCE:-/config/openclaw}"
if [ ! -d "$SRC_STATE" ]; then
SRC_STATE="/home/node/.openclaw"
fi
# Make the model id filename-safe: "/" and ":" both become "_".
safe_model="${SWEEP_MODEL//\//_}"
safe_model="${safe_model//:/_}"
OUT="$SWEEP_LOGDIR/${SWEEP_LABEL}_openclaw_${safe_model}_${SWEEP_OUT_TAG}.json"
LOG="$SWEEP_LOGDIR/${SWEEP_LABEL}_openclaw_${safe_model}_${SWEEP_OUT_TAG}.log"
export SWEEP_OUTPUT_PATH="$OUT"
# Per-run home directory; the shell PID ($$) keeps concurrent runs apart.
FRESH_HOME="/tmp/openclaw-home-${SWEEP_LABEL}-$$"
FRESH_STATE="$FRESH_HOME/.openclaw"
# Remove leftovers from an earlier run with the same label/PID, plus the old
# lane root for this label.
rm -rf "$FRESH_HOME" "$CLAWBENCH_PARALLEL_LANE_ROOT"
mkdir -p "$FRESH_STATE" "$FRESH_HOME/.config"
# Only openclaw.json and the plugins directory are carried over from the source.
if [ -f "$SRC_STATE/openclaw.json" ]; then
cp "$SRC_STATE/openclaw.json" "$FRESH_STATE/openclaw.json"
fi
if [ -d "$SRC_STATE/plugins" ]; then
mkdir -p "$FRESH_STATE/plugins"
cp -R "$SRC_STATE/plugins/." "$FRESH_STATE/plugins/" 2>/dev/null || true
fi
# All other state directories start out empty.
mkdir -p \
"$FRESH_STATE/agents" \
"$FRESH_STATE/workspace" \
"$FRESH_STATE/logs" \
"$FRESH_STATE/memory" \
"$FRESH_STATE/cache" \
"$FRESH_STATE/identity" \
"$FRESH_STATE/devices" \
"$FRESH_STATE/tasks" \
"$FRESH_STATE/subagents" \
"$FRESH_STATE/flows" \
"$FRESH_STATE/cron"
# From here on, every child process sees the fresh home and state directories.
export HOME="$FRESH_HOME"
export OPENCLAW_HOME="$FRESH_HOME"
export OPENCLAW_STATE_DIR="$FRESH_STATE"
export OPENCLAW_CONFIG_PATH="$FRESH_STATE/openclaw.json"
export XDG_CONFIG_HOME="$FRESH_HOME/.config"
# Rewrite the copied openclaw.json for an unattended eval run (no agents, all
# channels disabled, stale plugins dropped, exec approvals off, model pinned to
# SWEEP_MODEL) and write a permissive exec-approvals.json next to it.
python - <<'PY'
import json
import os
from pathlib import Path

cfg_path = Path(os.environ["OPENCLAW_CONFIG_PATH"])
if not cfg_path.exists():
    raise SystemExit("missing openclaw.json")
data = json.loads(cfg_path.read_text(encoding="utf-8"))
if not isinstance(data, dict):
    # Every edit below assumes a JSON object at the top level.
    raise SystemExit("openclaw.json must contain a JSON object")


def set_nested(root, dotted, value):
    """Set root[a][b]...[z] = value for dotted path "a.b...z".

    Missing or non-dict intermediate values are replaced with new dicts.
    """
    cursor = root
    parts = dotted.split(".")
    for part in parts[:-1]:
        child = cursor.get(part)
        if not isinstance(child, dict):
            child = {}
            cursor[part] = child
        cursor = child
    cursor[parts[-1]] = value


# Start with no configured agents.
agents = data.setdefault("agents", {})
if isinstance(agents, dict):
    agents["list"] = []

# Disable every channel together with its exec-approval prompts.
channels = data.get("channels")
if isinstance(channels, dict):
    for channel in channels.values():
        if isinstance(channel, dict):
            channel["enabled"] = False
            exec_approvals = channel.get("execApprovals")
            if not isinstance(exec_approvals, dict):
                exec_approvals = {}
                channel["execApprovals"] = exec_approvals
            exec_approvals["enabled"] = False

# Drop plugins listed as stale. A missing or non-object "plugins" value
# (for example null) is replaced with an empty object instead of crashing.
plugins = data.get("plugins")
if not isinstance(plugins, dict):
    plugins = {}
    data["plugins"] = plugins
stale = {"marxbiotech-git-tools", "lab"}
allow = plugins.get("allow")
if isinstance(allow, list):
    plugins["allow"] = [item for item in allow if item not in stale]
entries = plugins.get("entries")
if isinstance(entries, dict):
    for item in stale:
        entries.pop(item, None)

set_nested(data, "browser.headless", True)
set_nested(data, "browser.noSandbox", True)
set_nested(data, "gateway.reload.mode", "off")
set_nested(data, "agents.defaults.skipBootstrap", True)
set_nested(data, "agents.defaults.sandbox.mode", "off")
set_nested(data, "agents.defaults.model.primary", os.environ["SWEEP_MODEL"])
set_nested(data, "agents.defaults.subagents.model.primary", os.environ["SWEEP_MODEL"])
set_nested(data, "tools.exec.host", os.environ.get("OPENCLAW_EXEC_HOST", "gateway"))
set_nested(data, "tools.exec.security", "full")
set_nested(data, "tools.exec.ask", "off")
set_nested(data, "approvals.exec.enabled", False)
cfg_path.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")

# exec-approvals.json lives beside openclaw.json; its socket path is the same
# file name with a .sock suffix.
approvals_path = cfg_path.with_name("exec-approvals.json")
approvals = {
    "version": 1,
    "socket": {
        "path": str(approvals_path.with_suffix(".sock")),
        "token": "container-lane-eval-token",
    },
    "defaults": {"security": "full", "ask": "off", "askFallback": "full"},
    "agents": {"*": {"security": "full", "ask": "off", "askFallback": "full"}},
}
approvals_path.write_text(json.dumps(approvals, indent=2) + "\n", encoding="utf-8")
PY
# Optional gbrain runtime preparation: runs only when
# CLAWBENCH_ENABLE_GBRAIN is exactly "1".
case "${CLAWBENCH_ENABLE_GBRAIN:-0}" in
  1)
    export CLAWBENCH_LANE_PREPARE_CMD="${CLAWBENCH_LANE_PREPARE_CMD:-/home/node/app/scripts/setup_gbrain_runtime.sh}"
    "$CLAWBENCH_LANE_PREPARE_CMD"
    ;;
esac

# Print the run parameters before the worker starts.
printf '===== CONTAINER LANE EVAL START %s =====\n' "$(date '+%Y-%m-%d %H:%M:%S')"
printf 'label: %s\n' "$SWEEP_LABEL"
printf 'model: %s\n' "$SWEEP_MODEL"
printf 'runs: %s\n' "$SWEEP_RUNS"
printf 'lanes: %s\n' "$SWEEP_LANES"
printf 'tasks: %s\n' "${SWEEP_TASKS:-${CHERRY_TASKS:-all}}"
printf 'out: %s\n' "$OUT"
printf 'log: %s\n' "$LOG"
printf 'home: %s\n' "$HOME"
printf 'state: %s\n' "$OPENCLAW_STATE_DIR"
# The version line is informational only; a failure here is ignored.
openclaw --version 2>/dev/null || true
# Errexit is suspended so the worker's exit status can be captured and the log
# tail printed before the script exits.
set +e
# Submit one job to a cleared local queue and process it in-process with
# EvalWorker. All output of the script below goes to $LOG. On success the
# result JSON is copied to $SWEEP_OUTPUT_PATH; the script exits 1 if the job
# did not finish or has no result id.
python - <<'PY' > "$LOG" 2>&1
import asyncio
import json
import logging
import os
import shutil
from pathlib import Path
from clawbench.queue import JobQueue, JobStatus, SubmissionRequest
from clawbench.worker import EvalWorker, RESULTS_DIR
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
async def main() -> int:
queue = JobQueue()
queue._jobs.clear()
queue._save_local()
task_ids_raw = os.environ.get("SWEEP_TASKS") or os.environ.get("CHERRY_TASKS") or ""
task_ids = [item.strip() for item in task_ids_raw.split(",") if item.strip()]
request = SubmissionRequest(
model=os.environ["SWEEP_MODEL"],
runs_per_task=int(os.environ["SWEEP_RUNS"]),
max_parallel_lanes=int(os.environ["SWEEP_LANES"]),
task_ids=task_ids,
prompt_variant=os.environ.get("SWEEP_PROMPT_VARIANT", "clear"),
judge_model=os.environ.get("CLAWBENCH_JUDGE_MODEL", ""),
notes=os.environ.get("SWEEP_LABEL", ""),
)
job = await queue.submit(request)
worker = EvalWorker(queue)
await worker._process_job(job)
final = await queue.get_status(job.job_id)
print(json.dumps(final.model_dump() if final else {}, indent=2), flush=True)
if final is None or final.status != JobStatus.FINISHED or not final.result_id:
return 1
result_path = RESULTS_DIR / f"{final.result_id}.json"
output_path = Path(os.environ["SWEEP_OUTPUT_PATH"])
output_path.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(result_path, output_path)
return 0
raise SystemExit(asyncio.run(main()))
PY
# Exit status of the worker script above.
status=$?
set -e
echo "===== lane eval exit=$status $(date '+%Y-%m-%d %H:%M:%S') ====="
# Show the end of the worker log, then propagate the worker's status.
tail -120 "$LOG" 2>/dev/null || true
exit "$status"

View File

@ -1,98 +0,0 @@
#!/bin/bash
# Minimal single-model sweep — 1 run per task (not 3) for fast validation.
# Used to quickly test if an openrouter-stream fix actually works without
# committing to a full 60-minute 3-run sweep.
#
# Invocation (from host):
# docker run -d --name clawbench-<LABEL> \
# -e SWEEP_LABEL=<label> -e SWEEP_MODEL=<routed-model> \
# -e SWEEP_PROFILE=<abs-profile-path> \
# -e SWEEP_LOGDIR=<output-dir-in-container> \
# -e SWEEP_OUT_TAG=<tag> \
# -v .../scripts:/home/node/app/scripts:ro \
# -v .../data:/data \
# -v .../data/container-home-openclaw:/home/node/.openclaw \
# -v .../profiles:/home/node/app/profiles:ro \
# --memory 8g \
# <image> \
# bash /home/node/app/scripts/container_sweep_minimal.sh
set -u

: "${SWEEP_LABEL:?SWEEP_LABEL required}"
: "${SWEEP_MODEL:?SWEEP_MODEL required}"
: "${SWEEP_PROFILE:?SWEEP_PROFILE required}"
: "${SWEEP_LOGDIR:?SWEEP_LOGDIR required}"
: "${SWEEP_OUT_TAG:?SWEEP_OUT_TAG required}"

cd /data
mkdir -p "$SWEEP_LOGDIR"

export OPENCLAW_GATEWAY_TOKEN="local-dev-token-for-testing"
export CLAWBENCH_RUN_CACHE_DIR="/data/run_cache"
mkdir -p "$CLAWBENCH_RUN_CACHE_DIR"
export NODE_OPTIONS="--max-old-space-size=4096"

# Clear cache for target model (only the models listed here are cleared).
case "$SWEEP_MODEL" in
  openrouter/z-ai/glm-5.1) CACHE_SUB="openrouter_z-ai_glm-5.1" ;;
  openrouter/minimax/minimax-m2.7) CACHE_SUB="openrouter_minimax_minimax-m2.7" ;;
  openrouter/moonshotai/kimi-k2.5) CACHE_SUB="openrouter_moonshotai_kimi-k2.5" ;;
  *) CACHE_SUB="" ;;
esac
if [ -n "$CACHE_SUB" ] && [ -d "$CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB" ]; then
  echo "clearing cache: $CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB"
  rm -rf "$CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB"
fi

OUT="$SWEEP_LOGDIR/docker_${SWEEP_LABEL}_${SWEEP_OUT_TAG}.json"
LOG="$SWEEP_LOGDIR/docker_${SWEEP_LABEL}_${SWEEP_OUT_TAG}.log"
GWLOG="$SWEEP_LOGDIR/gateway_${SWEEP_LABEL}.log"
rm -f "$OUT"

echo "===== MINIMAL SWEEP START $(date '+%Y-%m-%d %H:%M:%S') ====="
echo "label: $SWEEP_LABEL"
echo "model: $SWEEP_MODEL"
echo "profile: $SWEEP_PROFILE"
echo "out: $OUT"
echo "runs: 1 per task (MINIMAL)"

# Stop the background gateway on every exit path, including the early
# `exit 1` taken when the gateway never becomes healthy.
GATEWAY_PID=""
cleanup() {
  if [ -n "${GATEWAY_PID:-}" ]; then
    kill "$GATEWAY_PID" 2>/dev/null
    wait "$GATEWAY_PID" 2>/dev/null
  fi
}
trap cleanup EXIT

echo "Starting gateway on :18789 (heap=4GB) ..."
openclaw gateway --port 18789 > "$GWLOG" 2>&1 &
GATEWAY_PID=$!

# Poll the health endpoint once per second, for at most 120 attempts.
ready=0
for i in $(seq 1 120); do
  if curl -sf -H "Authorization: Bearer $OPENCLAW_GATEWAY_TOKEN" http://127.0.0.1:18789/health > /dev/null 2>&1; then
    echo "Gateway healthy after ${i}s"
    ready=1
    break
  fi
  sleep 1
done
if [ "$ready" -ne 1 ]; then
  echo "ERROR: gateway failed to come up"
  # Show why the gateway did not start before giving up.
  tail -30 "$GWLOG" 2>/dev/null || true
  exit 1
fi

echo "===== $(date '+%H:%M:%S') starting $SWEEP_LABEL ($SWEEP_MODEL) ====="
clawbench run \
  --model "$SWEEP_MODEL" \
  --runs 1 \
  --concurrency 4 \
  --profile "$SWEEP_PROFILE" \
  --judge-model "anthropic/claude-sonnet-4-6" \
  -o "$OUT" \
  > "$LOG" 2>&1
status=$?
echo "===== $(date '+%H:%M:%S') done $SWEEP_LABEL (exit $status) ====="

# Archive the cache for future audits
# shellcheck disable=SC1091
source "$(dirname "$0")/_archive_cache.sh" 2>/dev/null && archive_run_cache || echo "[archive] helper missing, skipping"

# The EXIT trap stops the gateway; the sweep's own status is preserved.
exit "$status"

View File

@ -1,175 +0,0 @@
#!/bin/bash
# Single-model sweep with fresh gateway + bumped Node heap to prevent OOM.
#
# Invocation (from host):
# docker run -d --name clawbench-sweep-<LABEL> \
# -e SWEEP_LABEL=<label> -e SWEEP_MODEL=<routed-model> -e SWEEP_PROFILE=<abs-profile-path> \
# -v .../scripts:/home/node/app/scripts:ro \
# -v .../data:/data \
# -v .../data/container-home-openclaw:/home/node/.openclaw \
# -v .../profiles:/home/node/app/profiles:ro \
# --memory 8g \
# clawbench-clawbench:latest \
# bash /home/node/app/scripts/container_sweep_single.sh
#
# Differences vs container_sweep.sh:
# - Bumps gateway Node.js heap via NODE_OPTIONS=--max-old-space-size=4096 (prevents 2GB OOM we saw at ~4h)
# - One model per container (no shared-gateway drift between models)
# - Force-clears run_cache for THIS model before running (prevents cache-replay masking)
# - Writes to the same $LOGDIR/docker_${label}_${SWEEP_OUT_TAG}.json as the original sweep
# so generate_drift_report.py picks it up without changes
set -u

: "${SWEEP_LABEL:?SWEEP_LABEL required (e.g. glm, minimax, kimi)}"
: "${SWEEP_MODEL:?SWEEP_MODEL required (e.g. openrouter/z-ai/glm-5.1)}"
: "${SWEEP_PROFILE:?SWEEP_PROFILE required (absolute path in container)}"
# Optional overrides (defaults target the v4.14 drift sweep):
# SWEEP_LOGDIR — where JSONs and logs go (default /data/drift_2026-04-14)
# SWEEP_OUT_TAG — tag embedded in output filename (default v2026-4-14)
: "${SWEEP_LOGDIR:=/data/drift_2026-04-14}"
: "${SWEEP_OUT_TAG:=v2026-4-14}"

cd /data
LOGDIR="$SWEEP_LOGDIR"
mkdir -p "$LOGDIR"

export OPENCLAW_GATEWAY_TOKEN="local-dev-token-for-testing"
export CLAWBENCH_RUN_CACHE_DIR="/data/run_cache"
mkdir -p "$CLAWBENCH_RUN_CACHE_DIR"
# OOM fix: give the gateway Node process a 4GB old-space ceiling instead of the default ~2GB.
# Scoped via env so we don't stomp on other Node processes (clawbench itself is python).
export NODE_OPTIONS="--max-old-space-size=4096"

# State-dir isolation: the shared /home/node/.openclaw mount accumulates cruft
# across sweeps (agents/, workspace/, logs/, memory/, stale openclaw.json.*.tmp)
# which triggers gateway hot-reload churn and cascading `RPC agents.create timed
# out after 60s` failures. Give each sweep a pristine state dir that carries
# over only the config (openclaw.json, identity/, devices/, exec-approvals.json,
# tasks/, subagents/, flows/, cron/) and leaves runtime state empty.
SRC_STATE="/home/node/.openclaw"
FRESH_STATE="/tmp/openclaw-state-${SWEEP_LABEL}-$$"
GATEWAY_PID=""

# EXIT handler: stop the gateway (if it was started) and remove the isolated
# state dir on every exit path, including the early `exit 1` taken when the
# gateway never becomes healthy. The script's exit status is not changed.
cleanup() {
  if [ -n "${GATEWAY_PID:-}" ]; then
    kill "$GATEWAY_PID" 2>/dev/null
    wait "$GATEWAY_PID" 2>/dev/null
    echo "gateway stopped"
  fi
  # Clean up the isolated state dir (don't accumulate /tmp cruft across sweeps).
  if [ -n "${FRESH_STATE:-}" ] && [ -d "$FRESH_STATE" ]; then
    echo "[state-isolate] removing $FRESH_STATE"
    rm -rf "$FRESH_STATE"
  fi
}
trap cleanup EXIT

echo "[state-isolate] cloning config from $SRC_STATE to $FRESH_STATE"
mkdir -p "$FRESH_STATE"
# Copy the main config (skip the .tmp/.bak/.clobbered/.pre-* cruft that can
# confuse the loader — only the canonical openclaw.json is needed).
if [ -f "$SRC_STATE/openclaw.json" ]; then
  cp "$SRC_STATE/openclaw.json" "$FRESH_STATE/openclaw.json"
fi
if [ -f "$SRC_STATE/exec-approvals.json" ]; then
  cp "$SRC_STATE/exec-approvals.json" "$FRESH_STATE/exec-approvals.json"
fi
# Carry over static config dirs — these are read-mostly and don't accumulate
# per-run cruft. SKIP: agents/ workspace*/ logs/ memory/ cache/ browser/ canvas/
# which all grow unboundedly across sweeps.
for d in identity devices tasks subagents flows cron; do
  if [ -d "$SRC_STATE/$d" ]; then
    cp -r "$SRC_STATE/$d" "$FRESH_STATE/$d"
  fi
done
# Ensure runtime dirs exist but are empty
mkdir -p "$FRESH_STATE/agents" "$FRESH_STATE/workspace" "$FRESH_STATE/logs" "$FRESH_STATE/memory" "$FRESH_STATE/cache"
export OPENCLAW_STATE_DIR="$FRESH_STATE"
echo "[state-isolate] OPENCLAW_STATE_DIR=$OPENCLAW_STATE_DIR"
du -sh "$FRESH_STATE" 2>/dev/null | sed 's/^/[state-isolate] size: /'

# Map label -> cache subdir (matches what clawbench writes)
case "$SWEEP_MODEL" in
  anthropic/claude-opus-4-7) CACHE_SUB="anthropic_claude-opus-4-7" ;;
  anthropic/claude-sonnet-4-7) CACHE_SUB="anthropic_claude-sonnet-4-7" ;;
  anthropic/claude-opus-4-6) CACHE_SUB="anthropic_claude-opus-4-6" ;;
  anthropic/claude-sonnet-4-6) CACHE_SUB="anthropic_claude-sonnet-4-6" ;;
  openai/gpt-5.4) CACHE_SUB="openai_gpt-5.4" ;;
  openai/gpt-5.2) CACHE_SUB="openai_gpt-5.2" ;;
  google/gemini-3.1-pro-preview) CACHE_SUB="google_gemini-3.1-pro-preview" ;;
  openrouter/z-ai/glm-5.1) CACHE_SUB="openrouter_z-ai_glm-5.1" ;;
  openrouter/qwen/qwen3.6-plus) CACHE_SUB="openrouter_qwen_qwen3.6-plus" ;;
  openrouter/minimax/minimax-m2.7) CACHE_SUB="openrouter_minimax_minimax-m2.7" ;;
  openrouter/moonshotai/kimi-k2.5) CACHE_SUB="openrouter_moonshotai_kimi-k2.5" ;;
  # kimi-k2.6 is not yet supported in the openclaw version under test — skip.
  *) CACHE_SUB="" ;;
esac

OUT="$LOGDIR/docker_${SWEEP_LABEL}_${SWEEP_OUT_TAG}.json"
LOG="$LOGDIR/docker_${SWEEP_LABEL}_${SWEEP_OUT_TAG}.log"
GWLOG="$LOGDIR/gateway_${SWEEP_LABEL}.log"

echo "===== SINGLE-MODEL SWEEP START $(date '+%Y-%m-%d %H:%M:%S') ====="
echo "label: $SWEEP_LABEL"
echo "model: $SWEEP_MODEL"
echo "profile: $SWEEP_PROFILE"
echo "out: $OUT"
echo "gwlog: $GWLOG"
echo "NODE_OPTIONS: $NODE_OPTIONS"

# Force-clear this model's run_cache so we actually re-run (no replays)
if [ -n "$CACHE_SUB" ] && [ -d "$CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB" ]; then
  echo "clearing cache: $CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB"
  rm -rf "$CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB"
fi
# Also remove any stale result JSON so we don't skip-on-idempotence
if [ -f "$OUT" ]; then
  echo "removing stale result: $OUT"
  rm -f "$OUT"
fi

# Start gateway with bumped heap
echo "Starting gateway on :18789 (heap=4GB) ..."
openclaw gateway --port 18789 > "$GWLOG" 2>&1 &
GATEWAY_PID=$!
echo "gateway pid=$GATEWAY_PID"

# Poll the health endpoint once per second, for at most 120 attempts.
ready=0
for i in $(seq 1 120); do
  if curl -sf -H "Authorization: Bearer $OPENCLAW_GATEWAY_TOKEN" http://127.0.0.1:18789/health > /dev/null 2>&1; then
    echo "Gateway healthy after ${i}s"
    ready=1
    break
  fi
  sleep 1
done
if [ "$ready" -ne 1 ]; then
  echo "ERROR: gateway failed to come up within 120s"
  tail -30 "$GWLOG"
  exit 1
fi

echo "===== $(date '+%H:%M:%S') starting $SWEEP_LABEL ($SWEEP_MODEL) ====="
clawbench run \
  --model "$SWEEP_MODEL" \
  --runs 3 \
  --concurrency 4 \
  --profile "$SWEEP_PROFILE" \
  --judge-model "anthropic/claude-sonnet-4-6" \
  -o "$OUT" \
  > "$LOG" 2>&1
status=$?
if [ "$status" -eq 0 ]; then
  echo "===== $(date '+%H:%M:%S') done $SWEEP_LABEL (exit 0) ====="
else
  echo "===== $(date '+%H:%M:%S') FAILED $SWEEP_LABEL (exit $status) ====="
  tail -20 "$LOG"
fi

# Archive the cache for future audits (preserves transcripts per sweep tag)
# shellcheck disable=SC1091
source "$(dirname "$0")/_archive_cache.sh" 2>/dev/null && archive_run_cache || echo "[archive] helper missing, skipping"

echo ""
echo "===== SINGLE-MODEL SWEEP END $(date '+%Y-%m-%d %H:%M:%S') ====="
# Gateway shutdown and state-dir removal happen in the EXIT trap.
exit "$status"

View File

@ -1,254 +0,0 @@
"""Fair 9-model comparison report generator for the v2026-4-19 full sweep.
Reads the per-run archive at data/run_cache_archive/<tag>/<cache_sub>/<task>/runN.json
and computes, per model:
- Coverage % (archived runs / 120)
- Overall mean, clean mean (excl. infra-zeros), coverage-normalized score
- Per-tier mean (tier1-5)
- Judge-infra failures remaining (should be 0 after rejudge pass)
Writes markdown to reports/EVAL_REPORT_9MODEL_FAIR_<tag>.md.
Usage:
python3 scripts/generate_fair_report.py \\
--tag v2026-4-19-full \\
[--out reports/EVAL_REPORT_9MODEL_FAIR_v2026-4-19-full.md]
"""
from __future__ import annotations
import argparse
import json
import re
from collections import defaultdict
from pathlib import Path
from statistics import mean
ROOT = Path(__file__).resolve().parent.parent
# Short model label -> (cache sub-directory name, display name).
# main() calls summarize() once per entry; the cache sub-directory is looked up
# under data/run_cache_archive/<tag>/ and the display name is used in the report.
MODEL_MAP = {
"opus47": ("anthropic_claude-opus-4-7", "Claude Opus 4.7"),
"opus46": ("anthropic_claude-opus-4-6", "Claude Opus 4.6"),
"sonnet46": ("anthropic_claude-sonnet-4-6", "Claude Sonnet 4.6"),
"gpt54": ("openai_gpt-5.4", "GPT 5.4"),
"gemini": ("google_gemini-3.1-pro-preview", "Gemini 3.1 Pro"),
"glm": ("openrouter_z-ai_glm-5.1", "GLM 5.1"),
"minimax": ("openrouter_minimax_minimax-m2.7", "MiniMax M2.7"),
"kimi25": ("openrouter_moonshotai_kimi-k2.5", "Kimi K2.5"),
"qwen": ("openrouter_qwen_qwen3.6-plus", "Qwen 3.6 Plus"),
}
# Lower-case substrings that mark a judge result as an infrastructure failure.
# scan_archive() matches them against the lower-cased judge "reason" text.
JUDGE_INFRA_PHRASES = [
"gateway is restarting", "judge execution failed", "judge failed to run",
"judge call failed", "judge timed out",
]
def tier_of(task_id: str) -> str:
    """Map a task id to its tier name.

    Ids that start with ``t<digit>-`` yield ``"tier<digit>"``; every other id
    yields ``"other"``.
    """
    matched = re.match(r"t(\d)-", task_id)
    if matched is None:
        return "other"
    return f"tier{matched.group(1)}"
def _section_score(section) -> float:
    """Return ``section["score"]``, treating a missing/null section or score as 0."""
    if not isinstance(section, dict):
        return 0
    return section.get("score") or 0


def scan_archive(cache_dir: Path) -> list[dict]:
    """Collect one summary row per archived run under *cache_dir*.

    The expected layout is ``<cache_dir>/<task>/run*.json``. Files that cannot
    be read, are not valid JSON, or do not contain a JSON object are skipped.
    JSON ``null`` values for the per-verifier result sections or their scores
    are treated like missing values (score 0) instead of raising.

    Returns:
        A list of row dicts with keys ``task``, ``tier``, ``run_score``, ``c``,
        ``t``, ``b``, ``j`` (``None`` when the judge was not enabled),
        ``judge_infra``, ``rejudged`` and ``is_infra_zero``. The list is empty
        when *cache_dir* is not an existing directory.
    """
    rows: list[dict] = []
    if not cache_dir.is_dir():
        return rows
    for tdir in sorted(cache_dir.iterdir()):
        if not tdir.is_dir():
            continue
        for rf in sorted(tdir.glob("run*.json")):
            try:
                d = json.loads(rf.read_text(encoding="utf-8"))
            except (OSError, ValueError):
                # Unreadable or malformed run file: skip it, keep scanning.
                continue
            if not isinstance(d, dict):
                continue
            jr = d.get("judge_result") or {}
            reason = (jr.get("reason") or "").lower()
            # An enabled, not-yet-rejudged judge result counts as an infra
            # failure when its reason matches a known phrase, it carries an
            # error, or it has an empty reason together with a zero score.
            judge_infra = (
                jr.get("enabled")
                and "rejudged_at" not in jr
                and (
                    any(p in reason for p in JUDGE_INFRA_PHRASES)
                    or jr.get("error")
                    or (not reason.strip() and (jr.get("score") or 0) == 0)
                )
            )
            run_score = d.get("run_score") or 0
            rows.append({
                "task": tdir.name,
                "tier": tier_of(tdir.name),
                "run_score": run_score,
                "c": _section_score(d.get("completion_result")),
                "t": _section_score(d.get("trajectory_result")),
                "b": _section_score(d.get("behavior_result")),
                "j": _section_score(jr) if jr.get("enabled") else None,
                "judge_infra": bool(judge_infra),
                "rejudged": "rejudged_at" in jr,
                # Runs scoring (almost) zero are treated as infra failures.
                "is_infra_zero": run_score < 0.01,
            })
    return rows
def summarize(label: str, cache_sub: str, pretty: str, tag: str,
              expected_runs: int = 120) -> dict:
    """Aggregate the archived runs of one model into a summary dict.

    Args:
        label: Short model label (copied into the result).
        cache_sub: Model sub-directory below ``data/run_cache_archive/<tag>/``.
        pretty: Display name (copied into the result).
        tag: Sweep tag selecting the archive directory.
        expected_runs: Number of runs a complete sweep produces per model.
            It is the denominator for coverage and the coverage-normalized
            score. Defaults to 120, the previously hard-coded value.

    Returns:
        When no runs are archived, a dict with only ``label``, ``pretty``,
        ``n`` (0) and ``missing``. Otherwise a dict that also holds the overall
        and clean means, coverage figures, per-tier means, the judge mean and
        the judge-infra / rejudged counters.

    Raises:
        ValueError: If *expected_runs* is not positive.
    """
    if expected_runs <= 0:
        raise ValueError("expected_runs must be positive")

    cache_dir = ROOT / "data" / "run_cache_archive" / tag / cache_sub
    rows = scan_archive(cache_dir)
    n = len(rows)
    if n == 0:
        return {"label": label, "pretty": pretty, "n": 0, "missing": expected_runs}

    all_scores = [r["run_score"] for r in rows]
    # "Clean" rows exclude runs whose score is (almost) zero.
    clean_rows = [r for r in rows if not r["is_infra_zero"]]
    clean_scores = [r["run_score"] for r in clean_rows]
    overall = mean(all_scores) if all_scores else 0
    clean = mean(clean_scores) if clean_scores else 0
    # Missing and infra-zero runs contribute 0 to the normalized score.
    cov_norm = sum(clean_scores) / expected_runs
    coverage_pct = 100.0 * len(clean_rows) / expected_runs

    per_tier = defaultdict(list)
    for r in rows:
        per_tier[r["tier"]].append(r["run_score"])
    tier_means = {t: mean(v) for t, v in per_tier.items() if v}

    # Judge-only score (how well model does purely on LLM judgment)
    judge_scores = [r["j"] for r in rows if r["j"] is not None]
    judge_mean = mean(judge_scores) if judge_scores else None
    # C=1.0 pass count
    c_pass_count = sum(1 for r in rows if r["c"] >= 0.9999)

    return {
        "label": label,
        "pretty": pretty,
        "n": n,
        "missing": max(0, expected_runs - n),
        "n_clean": len(clean_rows),
        "coverage_pct": coverage_pct,
        "overall": overall,
        "clean": clean,
        "cov_norm": cov_norm,
        "tier_means": tier_means,
        "judge_mean": judge_mean,
        "c_pass_count": c_pass_count,
        "judge_infra_remaining": sum(1 for r in rows if r["judge_infra"]),
        "rejudged": sum(1 for r in rows if r["rejudged"]),
    }
def build_markdown(summaries: list[dict], tag: str) -> str:
    """Render the comparison report as Markdown.

    Summaries with ``n == 0`` are dropped; the rest are ranked by descending
    ``clean`` mean. The caller's list is not modified.

    Args:
        summaries: Per-model summary dicts as produced by ``summarize``.
        tag: Sweep tag shown in the report title.

    Returns:
        The complete report text, ending with a newline.
    """
    ranked = sorted(
        (s for s in summaries if s["n"] > 0),
        key=lambda s: -s.get("clean", 0),
    )
    out = [
        f"# ClawBench Fair 9-Model Comparison — {tag}",
        "",
        "All 9 models at 120/120 coverage after gap-fill. Rankings use",
        "**clean mean run_score** — mean across all 120 archived runs per model.",
        "",
        # Fixed: the heading used to read "01 scale".
        "## Ranking (clean mean run_score, 0-1 scale)",
        "",
        "| Rank | Model | Clean | Judge-only | C=1.0 tasks | Coverage |",
        "|---:|---|---:|---:|---:|---:|",
    ]
    for rank, s in enumerate(ranked, 1):
        # The judge column stays empty when no run had the judge enabled.
        jm = f"{s['judge_mean']:.3f}" if s.get("judge_mean") is not None else ""
        cpct = s.get("c_pass_count", 0)
        out.append(
            f"| {rank} | **{s['pretty']}** | **{s['clean']:.4f}** | "
            f"{jm} | {cpct}/{s['n']} | {s['n']}/120 |"
        )
    out += [
        "",
        "## Fairness audit — passed",
        "",
        "All 9 models subjected to **identical** evaluation conditions:",
        "",
        "- **Same 40 tasks × 3 runs = 120 expected runs per model** (all from v4-19-full sweep)",
        "- **Same completion/trajectory/behavior verifiers** for every model",
        "- **Same Docker image** (openclaw 2026-04-16 baseline)",
        "- **Same judge model** (Claude Sonnet 4.6)",
        "- **Judge infra failures all rejudged** via direct Anthropic API (0 left)",
        "- **Coverage parity**: 97-99% across all models (within ~3 runs)",
        "",
        # Coverage table
        "### Coverage detail",
        "",
        "| Model | Archived | Missing | Rejudged via API |",
        "|---|---:|---:|---:|",
    ]
    for s in ranked:
        out.append(f"| {s['pretty']} | {s['n']}/120 | {s['missing']} | {s['rejudged']} |")
    out += [
        "",
        # Per-tier
        "## Per-tier mean run_score",
        "",
        "| Model | Tier 1 | Tier 2 | Tier 3 | Tier 4 | Tier 5 |",
        "|---|---:|---:|---:|---:|---:|",
    ]
    for s in ranked:
        tm = s.get("tier_means", {})
        cells = [s["pretty"]]
        for t in ("tier1", "tier2", "tier3", "tier4", "tier5"):
            cells.append(f"{tm[t]:.3f}" if t in tm else "")
        out.append("| " + " | ".join(cells) + " |")
    out += [
        "",
        # Legend
        "## Glossary",
        "",
        "- **Cov-norm**: `clean_sum / 120`. Missing runs count as 0.",
        " This is the single fair comparison number — it penalizes both",
        " low scores AND infra-related missing runs.",
        "- **Clean**: Mean run_score across archived runs (excludes infra-zeros).",
        " Shows capability ceiling ignoring infra flakiness.",
        "- **Judge-only**: Mean LLM-judge score (0-1 from Claude Sonnet 4.6).",
        " Independent second opinion on quality, used when deterministic",
        " verifiers can't capture nuance.",
        "- **Cov%**: Fraction of 120 runs that produced a non-infra outcome.",
        "- **run_score**: Weighted combination — when deterministic verifiers",
        " pass (C≥0.9999): `0.4·C + 0.3·T + 0.2·B + 0.1·J`. Else, judge excluded,",
        " renormalized over C/T/B.",
        "",
        # Caveats
        "## Caveats",
        "",
        "- **Missing runs** (1-3 per model) were infra failures that never",
        " wrote to cache. Treated as 0 in cov-norm (penalizes the model).",
        "- **Some tasks have strict verifiers** that require specific file",
        " artifacts. All models face the same verifier, so the comparison",
        " is internally fair even where individual verifier scores feel low.",
        "- **Judge scores come from a single judge model** (Sonnet 4.6). Judge",
        " bias toward its own family is possible but small at 10% weight.",
        "- **Ranking gaps of <0.02 cov-norm are within run-to-run noise**.",
        " Treat models within the top cluster as roughly equivalent.",
        "",
    ]
    return "\n".join(out) + "\n"
def main():
    """CLI entry point: summarize every model, write the report, print a table.

    Options:
        --tag      sweep tag selecting the archive directory (required)
        --out      report path (default: reports/EVAL_REPORT_9MODEL_FAIR_<tag>.md)
        --exclude  comma-separated model labels to leave out
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--tag", required=True)
    ap.add_argument("--out", type=Path, default=None)
    ap.add_argument("--exclude", default="", help="comma-separated model labels to exclude")
    args = ap.parse_args()

    excluded = {x.strip() for x in args.exclude.split(",") if x.strip()}
    summaries = [summarize(label, sub, pretty, args.tag)
                 for label, (sub, pretty) in MODEL_MAP.items()
                 if label not in excluded]

    out_path = args.out or (ROOT / "reports" / f"EVAL_REPORT_9MODEL_FAIR_{args.tag}.md")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # The report contains non-ASCII characters, so the encoding is pinned
    # rather than left to the platform default.
    out_path.write_text(build_markdown(summaries, args.tag), encoding="utf-8")
    print(f"Wrote: {out_path}")

    # The console table is ordered by coverage-normalized score.
    present = [s for s in summaries if s["n"] > 0]
    present.sort(key=lambda s: -s.get("cov_norm", 0))
    print()
    print(f"{'Rank':>4} {'Model':<20} {'Runs':>7} {'Cov%':>5} {'CovNorm':>8} {'Clean':>7} {'Judge':>6}")
    print("-" * 66)
    for i, s in enumerate(present, 1):
        jm = f"{s['judge_mean']:.3f}" if s.get("judge_mean") is not None else ""
        print(
            f"{i:>4} {s['pretty']:<20} {s['n']}/120 {s['coverage_pct']:>4.0f}% "
            f"{s['cov_norm']:>8.4f} {s['clean']:>7.4f} {jm:>6}"
        )
if __name__ == "__main__":
main()

22
scripts/infra_log_gate.sh Executable file
View File

@ -0,0 +1,22 @@
#!/bin/bash
# Fail if a ClawBench/OpenClaw run directory contains infra-level failures.
#
# Usage: infra_log_gate.sh <log-dir>
# Exit status: 0 = no signature found, 1 = signatures found (the first 80
# matching lines are printed to stderr), 2 = log directory missing.
set -u

dir="${1:?usage: infra_log_gate.sh <log-dir>}"
if [ ! -d "$dir" ]; then
  echo "[infra-gate] missing log directory: $dir" >&2
  exit 2
fi

# One alternation of all known infra-failure signatures (extended regex).
pattern="no longer exists|env_unavailable|environment_unavailable|REJECTED|Traceback|model_not_allowed|model not allowed|not allowed|WebSocket closed|API key|billing|Insufficient|sessions.create.*✗|Gateway .*timed out|control-plane.*timed out|connect.*timed out|RPC .*timed out|agents.create timed out|sessions.create.*timed out"

# Prefer ripgrep, but fall back to grep when it is not installed. Without the
# fallback a missing `rg` produced no matches and the gate reported "clean".
if command -v rg >/dev/null 2>&1; then
  matches="$(rg -n -e "$pattern" "$dir" 2>/dev/null || true)"
else
  matches="$(grep -rnE -e "$pattern" "$dir" 2>/dev/null || true)"
fi

if [ -n "$matches" ]; then
  echo "[infra-gate] infra-level signatures found in $dir" >&2
  printf '%s\n' "$matches" | head -80 >&2
  exit 1
fi

echo "[infra-gate] clean: $dir"

View File

@ -1,289 +0,0 @@
"""Re-judge ALL judge-infra-failure runs across all models in a drift sweep dir.
Fixes: 'Gateway is restarting', 'Judge execution failed', empty-reason 0-score
judge results by re-running the judge via direct Anthropic API calls (bypassing
the gateway that was failing in the first place).
Updates:
- data/run_cache_archive/<sweep_tag>/<model>/<task>/runN.json (in place)
- data/drift_*/docker_<label>_<tag>.json (aggregates)
Usage:
python3 scripts/rejudge_all.py \
--drift-dir data/drift_2026-04-19-full \
--archive-dir data/run_cache_archive/v2026-4-19-full \
[--dry-run]
"""
from __future__ import annotations
import argparse
import asyncio
import json
import os
import re
import sys
import time
from pathlib import Path
from typing import Optional
import anthropic
import yaml
ROOT = Path(__file__).resolve().parent.parent
TASK_DIRS = [ROOT / "tasks" / f"tier{i}" for i in range(1, 6)]
# Lower-case substrings that identify a judge result as an infrastructure
# failure; is_judge_infra_fail() matches them against the lower-cased reason.
FAILURE_PHRASES = [
"gateway is restarting",
"judge execution failed",
"judge failed to run",
"judge call failed",
"judge timed out",
]
# Weights copied from clawbench/scorer.py
# NOTE(review): WEIGHTS_DETERMINISTIC sums to 0.9 while the other two tables
# sum to 1.0, so its consumer presumably renormalizes. The code applying these
# tables is not visible in this part of the file; confirm against the scorer.
WEIGHTS_DETERMINISTIC = {"completion": 0.40, "trajectory": 0.30, "behavior": 0.20}
WEIGHTS_WITH_JUDGE = {"completion": 0.40, "trajectory": 0.30, "behavior": 0.20, "judge": 0.10}
WEIGHTS_SEMANTIC_ONLY = {"completion": 0.20, "trajectory": 0.20, "behavior": 0.10, "judge": 0.50}
# NOTE(review): looks like the completion score at or above which a run counts
# as deterministically passed; confirm where it is used.
DETERMINISTIC_FLOOR = 0.9999
# Cache-sub → model label (for result JSON lookup)
# Keys are run-cache sub-directory names: the model id with "/" replaced by "_".
# NOTE(review): kimi-k2.5 maps to "kimi" here, whereas generate_fair_report.py
# labels the same cache directory "kimi25"; confirm which label the result
# JSON file names actually use.
CACHE_TO_LABEL = {
"openrouter_z-ai_glm-5.1": "glm",
"openrouter_minimax_minimax-m2.7": "minimax",
"openrouter_moonshotai_kimi-k2.5": "kimi",
"openrouter_qwen_qwen3.6-plus": "qwen",
"anthropic_claude-opus-4-6": "opus46",
"anthropic_claude-opus-4-7": "opus47",
"anthropic_claude-sonnet-4-6": "sonnet46",
"openai_gpt-5.4": "gpt54",
"openai_gpt-5.2": "gpt52",
"google_gemini-3.1-pro-preview": "gemini",
}
def get_api_key() -> str:
    """Return the Anthropic API key.

    Lookup order:
      1. the ``ANTHROPIC_API_KEY`` environment variable;
      2. ``env.ANTHROPIC_API_KEY`` in ``~/.openclaw/openclaw.json``.

    A missing config file is not an error. An unreadable or malformed one is
    still tolerated, but its error is attached as the cause of the final
    ``RuntimeError`` so the reason stays visible.

    Raises:
        RuntimeError: If neither source provides a non-empty key.
    """
    key = os.environ.get("ANTHROPIC_API_KEY")
    if key:
        return key

    cfg = Path.home() / ".openclaw" / "openclaw.json"
    cfg_error = None
    data = None
    try:
        data = json.loads(cfg.read_text(encoding="utf-8"))
    except FileNotFoundError:
        pass  # no config file: fall through to the error below
    except (OSError, ValueError) as err:
        cfg_error = err

    # Only descend into values that really are JSON objects.
    env = data.get("env") if isinstance(data, dict) else None
    value = env.get("ANTHROPIC_API_KEY") if isinstance(env, dict) else None
    if value:
        return value
    raise RuntimeError(
        "No ANTHROPIC_API_KEY found (set env var or openclaw.json)"
    ) from cfg_error
def load_tasks() -> dict[str, dict]:
    """Load every task definition found in ``TASK_DIRS``.

    Each ``*.yaml`` file in an existing task directory is parsed with
    ``yaml.safe_load``. Only documents that are mappings containing an ``id``
    key are kept; empty files and non-mapping documents (lists, scalars) are
    ignored.

    Returns:
        A dict mapping task id to the parsed task mapping. If an id occurs
        more than once, the definition loaded last wins.
    """
    out: dict[str, dict] = {}
    for td in TASK_DIRS:
        if not td.exists():
            continue
        for yf in sorted(td.glob("*.yaml")):
            t = yaml.safe_load(yf.read_text(encoding="utf-8"))
            # A bare `"id" in t` is also true for a scalar string such as
            # "idle", which would then fail on t["id"].
            if isinstance(t, dict) and "id" in t:
                out[t["id"]] = t
    return out
def is_judge_infra_fail(jr: dict) -> bool:
    """Tell whether a judge result looks like an infrastructure failure.

    A missing or disabled judge result is never a failure. An enabled one is a
    failure when its reason contains one of ``FAILURE_PHRASES``, when it
    carries an ``error`` value, or when it has an empty reason together with a
    zero score.
    """
    if not (jr and jr.get("enabled")):
        return False
    reason_lc = (jr.get("reason") or "").lower()
    phrase_hit = any(phrase in reason_lc for phrase in FAILURE_PHRASES)
    if phrase_hit or jr.get("error"):
        return True
    # Empty reason + score 0 is likely an unreported failure
    return not reason_lc.strip() and jr.get("score", 0) == 0
def render_transcript_excerpt(transcript: dict, max_chars: int = 4000) -> str:
    """Render a transcript as a compact, size-bounded text excerpt for the judge.

    Each message contributes up to three kinds of lines: its text (first 500
    characters), one line per tool call (name plus the first 120 characters of
    the JSON-encoded arguments), and, for tool-result messages, the first 300
    characters of the result content.

    Args:
        transcript: Mapping with an optional ``messages`` list; may be empty
            or None.
        max_chars: Maximum length of the excerpt before the truncation marker
            is appended.

    Returns:
        The newline-joined excerpt, truncated to ``max_chars`` characters plus
        a ``... (truncated)`` marker when it was longer.
    """
    # `messages` may be absent or explicitly null in archived runs.
    messages = (transcript.get("messages") or []) if transcript else []
    lines: list[str] = []
    for msg in messages:
        role = msg.get("role", "?")
        text = (msg.get("text") or "").strip()
        if text:
            lines.append(f"[{role}] {text[:500]}")
        for call in (msg.get("tool_calls") or []):
            lines.append(f"[{role}/tool] {call.get('name','?')}({json.dumps(call.get('arguments',{}))[:120]})")
        if msg.get("tool_result_for"):
            result = msg.get("tool_result_content") or ""
            if not isinstance(result, str):
                # Structured results (dict/list) cannot be sliced as text;
                # serialise them so the 300-character cap applies uniformly.
                result = json.dumps(result)
            lines.append(f"[tool_result] {result[:300]}")
    excerpt = "\n".join(lines)
    if len(excerpt) > max_chars:
        excerpt = excerpt[:max_chars] + "\n... (truncated)"
    return excerpt
def build_judge_prompt(task: dict, run: dict) -> str:
    """Assemble the judge prompt for one archived run.

    The prompt is the task's judge rubric, followed by a one-line summary of
    the completion verifier (score and passed/total assertions), up to five
    failed assertions, and an excerpt of the run transcript.
    """
    rubric_text = task.get("judge", {}).get("rubric", "").strip()
    excerpt = render_transcript_excerpt(run.get("transcript", {}))
    completion = run.get("completion_result", {})
    summary = "score={:.3f} passed={}/{}".format(
        completion.get("score", 0),
        completion.get("passed_assertions", 0),
        completion.get("total_assertions", 0),
    )
    failed = completion.get("failed_assertions", [])
    if failed:
        # Only the first five failures are shown to keep the prompt short.
        feedback = "\n".join(f"- {item}" for item in failed[:5])
    else:
        feedback = "(none)"
    sections = [
        f"{rubric_text}\n\n",
        f"=== Completion verifier summary ===\n{summary}\n",
        f"Failed assertions:\n{feedback}\n\n",
        f"=== Transcript excerpt ===\n{excerpt}\n",
    ]
    return "".join(sections)
# NOTE(review): parse_judge_response() below does not use this pattern (it
# relies on JSONDecoder.raw_decode); kept in case other code references it.
JSON_RE = re.compile(r"\{.*\}", re.DOTALL)


def parse_judge_response(raw: str, threshold: float) -> dict:
    """Parse the judge model's reply into a judge-result dict.

    The reply is expected to contain a JSON object with ``score`` and,
    optionally, ``confidence``, ``reason``, ``rubric_hits`` and
    ``rubric_misses``. Text after the object is tolerated. If the text at the
    first ``{`` is not valid JSON (for example a brace used in a prose
    preamble), later ``{`` positions are tried and the first object that
    carries a ``score`` key is used.

    Args:
        raw: Raw text returned by the judge model.
        threshold: Score at or above which the run counts as passed.

    Returns:
        A dict with ``enabled``, ``score``, ``confidence``, ``reason``,
        ``rubric_hits``, ``rubric_misses``, ``passing_threshold``, ``passed``
        and ``error``. On any parse problem ``score`` is 0.0, ``passed`` is
        False and ``error`` holds the exception text; this function does not
        raise.
    """

    def _unit_interval(value: float) -> float:
        # min()/max() silently turn NaN into 1.0, which would award full
        # marks for garbage; NaN is the only float unequal to itself.
        if value != value:
            raise ValueError("non-finite value in judge response")
        return max(0.0, min(1.0, value))

    try:
        start = raw.find("{")
        if start < 0:
            raise ValueError("no JSON in response")
        text = raw[start:]
        decoder = json.JSONDecoder()
        try:
            # raw_decode tolerates trailing text after the object.
            obj, _end = decoder.raw_decode(text)
        except json.JSONDecodeError as first_error:
            # The first brace was not the start of the payload; look for a
            # later object, but only accept one that actually has a score so
            # that a nested fragment of broken JSON is never mistaken for it.
            obj = None
            pos = text.find("{", 1)
            while pos >= 0:
                try:
                    candidate, _end = decoder.raw_decode(text, pos)
                except json.JSONDecodeError:
                    candidate = None
                if isinstance(candidate, dict) and "score" in candidate:
                    obj = candidate
                    break
                pos = text.find("{", pos + 1)
            if obj is None:
                raise first_error
        score = _unit_interval(float(obj.get("score", 0)))
        confidence = _unit_interval(float(obj.get("confidence", 0.5)))
        reason = str(obj.get("reason", ""))
        return {
            "enabled": True,
            "score": round(score, 4),
            "confidence": round(confidence, 4),
            "reason": reason,
            "rubric_hits": obj.get("rubric_hits") or [],
            "rubric_misses": obj.get("rubric_misses") or [],
            "passing_threshold": threshold,
            # Decide on the clamped score so `passed` agrees with `score`.
            "passed": score >= threshold,
            "error": None,
        }
    except Exception as exc:
        return {
            "enabled": True, "score": 0.0, "confidence": 0.0,
            "reason": f"parse failed: {exc}", "rubric_hits": [], "rubric_misses": [],
            "passing_threshold": threshold, "passed": False, "error": str(exc),
        }
def combine_run_score(c: float, t: float, b: float, j: Optional[float], has_det: bool) -> float:
    """Blend completion, trajectory, behavior and (optionally) judge scores.

    Args:
        c: Completion score.
        t: Trajectory score.
        b: Behavior score.
        j: Judge score, or None when no usable judge score exists.
        has_det: Whether the run has deterministic assertions.

    Returns:
        The weighted score clamped to [0, 1] and rounded to four decimals.
    """

    def _clamped(value: float) -> float:
        return round(min(1.0, max(0.0, value)), 4)

    def _deterministic_part(weights: dict) -> float:
        return weights["completion"]*c + weights["trajectory"]*t + weights["behavior"]*b

    # The judge is left out when there is no judge score, or when deterministic
    # checks exist and completion fell below the floor.
    judge_ignored = j is None or (has_det and c < DETERMINISTIC_FLOOR)
    if judge_ignored:
        table = WEIGHTS_DETERMINISTIC
        return _clamped(_deterministic_part(table)/sum(table.values()))
    table = WEIGHTS_WITH_JUDGE if has_det else WEIGHTS_SEMANTIC_ONLY
    return _clamped(_deterministic_part(table) + table["judge"]*j)
def main() -> None:
    """Re-run the LLM judge for archived runs whose judge call failed.

    Scans ``--archive-dir/<cache_sub>/<task_id>/run*.json`` for runs flagged
    by is_judge_infra_fail(), asks the judge model for a fresh verdict,
    recomputes ``run_score`` and rewrites each run file via a temporary file
    and rename. With ``--dry-run`` only the per-model count of affected runs
    is printed and nothing is modified.

    Exits with status 1 when the archive directory does not exist.
    """
    ap = argparse.ArgumentParser()
    # NOTE(review): --drift-dir is required but is not read anywhere in this
    # function; kept so existing invocations keep working.
    ap.add_argument("--drift-dir", required=True, type=Path)
    ap.add_argument("--archive-dir", required=True, type=Path)
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()
    if not args.archive_dir.exists():
        print(f"Archive dir missing: {args.archive_dir}")
        sys.exit(1)
    tasks = load_tasks()
    print(f"Loaded {len(tasks)} task definitions")
    # Gather all affected runs: (cache_sub, task_id, run_path, run_data)
    affected: list = []
    for model_dir in sorted(args.archive_dir.iterdir()):
        if not model_dir.is_dir():
            continue
        if model_dir.name not in CACHE_TO_LABEL:
            continue
        for task_dir in model_dir.iterdir():
            if not task_dir.is_dir():
                continue
            for rf in sorted(task_dir.glob("run*.json")):
                try:
                    run = json.loads(rf.read_text(encoding="utf-8"))
                except (OSError, ValueError):
                    # Unreadable or malformed run files are skipped;
                    # JSONDecodeError/UnicodeDecodeError are ValueErrors.
                    continue
                if not isinstance(run, dict):
                    # A run file must be a JSON object to carry a judge result.
                    continue
                if is_judge_infra_fail(run.get("judge_result", {})):
                    affected.append((model_dir.name, task_dir.name, rf, run))
    print(f"Found {len(affected)} runs with judge infra failures")
    if args.dry_run:
        from collections import Counter
        by_model = Counter(a[0] for a in affected)
        for m, n in by_model.most_common():
            print(f" {m}: {n}")
        return
    if not affected:
        return
    api_key = get_api_key()
    client = anthropic.Anthropic(api_key=api_key)
    # Re-judge each
    succ = 0
    fail = 0
    skipped = 0
    for i, (cache_sub, task_id, rp, run) in enumerate(affected):
        task = tasks.get(task_id)
        if not task or not task.get("judge"):
            # No task definition or no judge section: nothing to re-judge.
            # Count it so the final summary adds up to len(affected).
            skipped += 1
            continue
        prompt = build_judge_prompt(task, run)
        threshold = task["judge"].get("passing_threshold", 0.7)
        print(f"[{i+1}/{len(affected)}] {cache_sub}/{task_id}/{rp.name} ... ", end="", flush=True)
        try:
            t0 = time.monotonic()
            resp = client.messages.create(
                model="claude-sonnet-4-6", max_tokens=1024,
                messages=[{"role": "user", "content": prompt}],
            )
            raw = resp.content[0].text
            dur_ms = int((time.monotonic() - t0) * 1000)
            parsed = parse_judge_response(raw, threshold)
            parsed["model"] = "anthropic/claude-sonnet-4-6"
            parsed["duration_ms"] = dur_ms
            parsed["token_usage"] = {
                "input_tokens": resp.usage.input_tokens,
                "output_tokens": resp.usage.output_tokens,
            }
            parsed["rejudged_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
            run["judge_result"] = parsed
            # Recompute run_score
            cr = run.get("completion_result", {})
            tr = run.get("trajectory_result", {})
            br = run.get("behavior_result", {})
            has_det = cr.get("total_assertions", 0) > 0
            # A reply that failed to parse contributes no judge score.
            j = parsed["score"] if parsed["enabled"] and not parsed.get("error") else None
            # `run_score` may be missing or null; treat both as 0 so the delta
            # below cannot raise after the file has already been rewritten.
            old_rs = run.get("run_score") or 0
            new_rs = combine_run_score(cr.get("score", 0), tr.get("score", 0), br.get("score", 0), j, has_det)
            run["run_score"] = new_rs
            # Write to a sibling temp file and rename over the original so a
            # crash mid-write cannot leave a truncated run file.
            tmp = rp.with_suffix(".json.tmp")
            tmp.write_text(json.dumps(run, indent=2))
            tmp.replace(rp)
            print(f"J={parsed['score']:.2f} ΔRS={new_rs - old_rs:+.3f}")
            succ += 1
        except Exception as exc:
            # Per-run boundary: report the problem and carry on with the rest.
            print(f"ERROR: {exc}")
            fail += 1
    print(f"\nRe-judging complete: {succ} succeeded, {fail} failed, {skipped} skipped")
if __name__ == "__main__":
main()

136
scripts/setup_gbrain_runtime.sh Executable file
View File

@ -0,0 +1,136 @@
#!/usr/bin/env bash
# Prepare a lane-local GBrain install for OpenClaw benchmark runs.
#
# The image supplies /opt/gbrain and this script keeps secrets runtime-only:
# keys are read from the lane's openclaw.json env block or existing process env,
# never baked into Docker layers.
# Fail fast: exit on errors, unset variables and failed pipeline stages.
set -Eeuo pipefail
# GBrain is opt-in; without CLAWBENCH_ENABLE_GBRAIN=1 this script is a no-op.
if [ "${CLAWBENCH_ENABLE_GBRAIN:-0}" != "1" ]; then
exit 0
fi
# Everything below is rooted at $HOME, so refuse to run without it.
: "${HOME:?HOME is required}"
GBRAIN_ROOT="${GBRAIN_ROOT:-/opt/gbrain}"
if [ ! -d "$GBRAIN_ROOT" ]; then
echo "[gbrain] missing $GBRAIN_ROOT" >&2
exit 1
fi
# Put the gbrain binaries (and bun) ahead of the existing PATH.
export PATH="$GBRAIN_ROOT/bin:/usr/local/bun/bin:$PATH"
export GBRAIN_ALLOW_SHELL_JOBS="${GBRAIN_ALLOW_SHELL_JOBS:-1}"
# State, config and log locations; each can be overridden via its env var.
STATE_DIR="${OPENCLAW_STATE_DIR:-$HOME/.openclaw}"
CONFIG_PATH="${OPENCLAW_CONFIG_PATH:-$STATE_DIR/openclaw.json}"
LOG_DIR="${CLAWBENCH_GBRAIN_LOG_DIR:-$STATE_DIR/logs}"
mkdir -p "$HOME/.gbrain" "$LOG_DIR"
LOG_PATH="$LOG_DIR/gbrain-runtime.log"
# Only touch the OpenClaw config when it already exists.
if [ -f "$CONFIG_PATH" ]; then
# Step 1: export OPENAI_API_KEY / ANTHROPIC_API_KEY into this shell. The
# embedded Python prints shell-quoted `export` lines, preferring a value
# already present in the process environment over the config's "env" block.
eval "$(python3 - "$CONFIG_PATH" <<'PY'
import json
import os
import shlex
import sys
config_path = sys.argv[1]
try:
data = json.load(open(config_path, encoding="utf-8"))
except Exception:
data = {}
env = data.get("env") if isinstance(data, dict) else {}
if not isinstance(env, dict):
env = {}
for key in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY"):
value = os.environ.get(key) or env.get(key)
if value:
print(f"export {key}={shlex.quote(str(value))}")
PY
)"
# Step 2: rewrite the config in place so that "gbrain" is in plugins.allow,
# plugins.entries.gbrain.enabled is true, and GBRAIN_ROOT is listed in
# plugins.load.paths. Values of the wrong type are replaced with fresh ones.
# NOTE(review): if the config cannot be parsed, the script starts from an
# empty object and overwrites the file with plugin settings only - confirm
# that discarding an unparseable config is intended.
python3 - "$CONFIG_PATH" "$GBRAIN_ROOT" <<'PY'
import json
import sys
config_path = sys.argv[1]
gbrain_root = sys.argv[2]
try:
with open(config_path, encoding="utf-8") as handle:
data = json.load(handle)
except Exception:
data = {}
if not isinstance(data, dict):
data = {}
plugins = data.setdefault("plugins", {})
if not isinstance(plugins, dict):
plugins = {}
data["plugins"] = plugins
allow = plugins.get("allow")
if not isinstance(allow, list):
allow = []
plugins["allow"] = allow
if "gbrain" not in allow:
allow.append("gbrain")
entries = plugins.get("entries")
if not isinstance(entries, dict):
entries = {}
plugins["entries"] = entries
entry = entries.get("gbrain")
if not isinstance(entry, dict):
entry = {}
entries["gbrain"] = entry
entry["enabled"] = True
load = plugins.get("load")
if not isinstance(load, dict):
load = {}
plugins["load"] = load
paths = load.get("paths")
if not isinstance(paths, list):
paths = []
load["paths"] = paths
if gbrain_root not in paths:
paths.append(gbrain_root)
with open(config_path, "w", encoding="utf-8") as handle:
json.dump(data, handle, indent=2)
handle.write("\n")
PY
fi
# The first write uses ">" and therefore truncates any previous log;
# everything after it appends.
echo "[gbrain] preparing HOME=$HOME" > "$LOG_PATH"
echo "[gbrain] version: $(gbrain --version 2>/dev/null || true)" >> "$LOG_PATH"
echo "[gbrain] plugin path enabled in $CONFIG_PATH" >> "$LOG_PATH"
# First run initialises GBrain (a failure aborts the script under `set -e`);
# later runs apply migrations on a best-effort basis.
if [ ! -f "$HOME/.gbrain/config.json" ]; then
gbrain init >> "$LOG_PATH" 2>&1
else
gbrain apply-migrations --yes --non-interactive >> "$LOG_PATH" 2>&1 || true
fi
BRAIN_REPO="${GBRAIN_BRAIN_REPO:-$HOME/brain}"
mkdir -p "$BRAIN_REPO"
# Seed a generic smoke page unless disabled or the repo already holds a
# Markdown file (`find ... -print -quit | grep -q .` tests for at least one).
if [ "${CLAWBENCH_GBRAIN_SEED_SMOKE:-1}" = "1" ] && ! find "$BRAIN_REPO" -type f -name '*.md' -print -quit | grep -q .; then
cat > "$BRAIN_REPO/gbrain-smoke.md" <<'EOF'
# GBrain smoke page
This page verifies that the benchmark image can initialize, import, and query a
lane-local GBrain database. It is intentionally generic and not task-specific.
EOF
fi
# Import whatever Markdown exists; embeddings are only attempted when an
# OpenAI key is available. Import, embed and doctor failures are tolerated.
if find "$BRAIN_REPO" -type f -name '*.md' -print -quit | grep -q .; then
gbrain import "$BRAIN_REPO" --no-embed >> "$LOG_PATH" 2>&1 || true
if [ -n "${OPENAI_API_KEY:-}" ]; then
gbrain embed --stale >> "$LOG_PATH" 2>&1 || true
else
echo "[gbrain] OPENAI_API_KEY not available; semantic embeddings skipped" >> "$LOG_PATH"
fi
fi
gbrain doctor --json >> "$LOG_PATH" 2>&1 || true
echo "[gbrain] ready" >> "$LOG_PATH"

View File

@ -57,12 +57,10 @@ tasks-public/
docker build -t clawbench .
```
The repo `Dockerfile` pins an OpenClaw image digest so public Space
builds do not silently drift. Override `OPENCLAW_IMAGE` only when you
intend to measure a different platform build. Note that platform
upgrades can shift scores (we observed +0.13 to +0.29 per model going
from 4.9 → 4.15-beta.1) — when comparing two model runs, build them
against the same OpenClaw release.
The repo `Dockerfile` layers ClawBench on the configured OpenClaw base
image. Platform upgrades can shift scores, so record the OpenClaw
version for every published comparison and build both sides of a
comparison against the same OpenClaw release.
## How to run Core v1
@ -107,10 +105,8 @@ your ClawBench config. See MANIFEST.yaml for a programmatic list.
- **OpenClaw platform version matters.** Upgrading from 4.9 → 4.15-beta.1
shifted scores by +0.13 to +0.29 across models. Build both sides of
any comparison from the same OpenClaw release.
- **Judge scores** come from Claude Sonnet 4.6 via direct Anthropic
API (with a fallback from the gateway judge). Scores assume the
judge is working correctly; re-judging broken runs may be required
(see `scripts/rejudge_all.py` in the main repo).
- **Judge scores** are advisory and depend on the configured judge model.
They are reported separately and cannot replace deterministic checks.
## What's NOT in Core v1
@ -120,9 +116,9 @@ your ClawBench config. See MANIFEST.yaml for a programmatic list.
- **9 noise tasks** (cross-model SNR < 0.5) — either broken verifiers
or genuinely ambiguous prompts. Scheduled for redesign.
- **3 ranking-breaker tasks** — tasks where the cross-model ordering
conflicts with the reference ranking (e.g. `t2-node-search-patch`,
`t5-contradictory-requirements`). Not broken per se; just
inconsistent with the headline.
conflicts with the reference ranking. Not broken per se; just
inconsistent with the headline. Their task IDs and contents remain
private with the rest of the holdout.
Also missing entirely from Core v1:
- **Tier 6 long-horizon (100+ turn) tasks** — planned for v2.

122
tests/test_ablation.py Normal file
View File

@ -0,0 +1,122 @@
from clawbench.ablation import (
common_compatible_task_set,
compare_results,
default_tool_profile,
)
from clawbench.adapters.hermes import HermesAdapterConfig
from clawbench.schemas import (
BenchmarkResult,
CompletionSpec,
FileState,
SimulatedUser,
TaskDefinition,
TaskFamily,
TaskStats,
Tier,
UserTurn,
)
def _task(task_id: str) -> TaskDefinition:
    """Build a minimal tier-1 coding task whose id and name are both `task_id`."""
    one_turn_user = SimulatedUser(turns=[UserTurn(message="write out.txt")])
    expects_out_file = CompletionSpec(files=[FileState(path="out.txt")])
    fields = {
        "id": task_id,
        "name": task_id,
        "tier": Tier.TIER1,
        "family": TaskFamily.CODING,
        "surface": "coding",
        "user": one_turn_user,
        "completion": expects_out_file,
    }
    return TaskDefinition(**fields)
def test_tool_profile_fingerprint_is_stable() -> None:
    """Two profiles built from identical inputs share one fingerprint."""
    config = HermesAdapterConfig(driver_mode="ai_agent", enabled_toolsets=["hermes-api-server"])
    # Build both profiles from fresh (but equal) toolset lists.
    first, second = (
        default_tool_profile(adapter="hermes", config=config, enabled_toolsets=["hermes-api-server"])
        for _ in range(2)
    )
    assert first.fingerprint == second.fingerprint
    for interface in ("browser", "multi_turn"):
        assert interface in first.interfaces
def test_common_compatible_task_set_uses_effective_adapter_config() -> None:
    """Both adapters accept both tasks, so nothing is skipped."""
    tasks = [_task(task_id) for task_id in ("a", "b")]
    adapters = {
        "openclaw": ("openclaw", None),
        "hermes": ("hermes", HermesAdapterConfig(driver_mode="ai_agent")),
    }
    plan = common_compatible_task_set(tasks, adapters)
    assert plan.task_ids == ["a", "b"]
    assert plan.skipped == {}
def _result(label: str, model: str, task_ids: list[str], score: float) -> BenchmarkResult:
    """Build a BenchmarkResult with one single-run TaskStats per task id.

    Every score-like field is either `score` or a perfect 1.0, and the spread
    fields describe a single run (stddev 0, min == max == score).
    """

    def _stats(task_id: str) -> TaskStats:
        return TaskStats(
            task_id=task_id,
            tier="tier1",
            family="coding",
            runs=1,
            mean_completion_score=1.0,
            mean_trajectory_score=1.0,
            mean_behavior_score=1.0,
            mean_run_score=score,
            reliability_score=1.0,
            variance_score=1.0,
            mean_task_score=score,
            stddev=0.0,
            min_score=score,
            max_score=score,
            pass_at_1=True,
            pass_rate=1.0,
            pass_hat_k=True,
        )

    per_task = [_stats(task_id) for task_id in task_ids]
    return BenchmarkResult(
        submission_id=label,
        model=model,
        provider="test",
        timestamp="2026-04-25T00:00:00Z",
        overall_score=score,
        overall_completion=1.0,
        overall_trajectory=1.0,
        overall_behavior=1.0,
        overall_reliability=1.0,
        overall_ci_lower=score,
        overall_ci_upper=score,
        overall_pass_hat_k=1.0,
        task_results=per_task,
    )
def test_compare_results_rejects_different_task_sets() -> None:
    """Same model but different task sets must not count as a fair comparison."""
    results = {
        "a": _result("a", "m", ["t1", "t2"], 0.8),
        "b": _result("b", "m", ["t1"], 0.9),
    }
    comparison = compare_results(results)
    expected_flags = {
        "fair": False,
        "task_verifier_fair": False,
        "controlled_ablation": False,
        "same_model": True,
        "same_task_set": False,
    }
    for flag, expected in expected_flags.items():
        assert comparison[flag] is expected
def test_compare_results_allows_cross_model_same_task_leaderboard() -> None:
    """Different models on the same task snapshot are fair, but not an ablation."""
    results = {
        "a": _result("a", "model-a", ["t1", "t2"], 0.8),
        "b": _result("b", "model-b", ["t1", "t2"], 0.9),
    }
    for result in results.values():
        result.task_snapshot_fingerprint = "snapshot-1"
    comparison = compare_results(results)
    expected_flags = {
        "fair": True,
        "task_verifier_fair": True,
        "controlled_ablation": False,
        "same_model": False,
    }
    for flag, expected in expected_flags.items():
        assert comparison[flag] is expected

222
tests/test_adapter_base.py Normal file
View File

@ -0,0 +1,222 @@
"""Tests for `clawbench.adapters.base` + registry.
Keeps the adapter ABC and registration helpers honest before any
concrete adapter lands. A parametrized contract test in
`test_adapter_contract.py` will exercise the ABC against every shipped
adapter later.
"""
from __future__ import annotations
from pathlib import Path
import pytest
from clawbench.adapters import (
ADAPTERS,
AdapterContext,
AgentAdapter,
PhaseResult,
StateQueryResult,
get_adapter,
register_adapter,
)
from clawbench.canonical import (
AdapterCapability,
CanonicalPhase,
CanonicalTask,
StateQuery,
)
from clawbench.canonical.convert import from_task_definition
from clawbench.schemas import (
CompletionSpec,
ExecutionCheck,
FileState,
SimulatedUser,
TaskDefinition,
TaskFamily,
TaskSetup,
Tier,
Transcript,
UserTurn,
)
# ---------------------------------------------------------------------------
# Minimal adapter for contract verification.
# ---------------------------------------------------------------------------
class _EchoAdapter(AgentAdapter):
    """Smallest possible adapter: offers FILES and EXECUTION and echoes phase names."""

    name = "echo-test-adapter"
    capabilities = {AdapterCapability.FILES, AdapterCapability.EXECUTION}

    async def setup(self, ctx: AdapterContext) -> None:  # pragma: no cover - trivial
        return None

    async def run_phase(
        self, phase: CanonicalPhase, ctx: AdapterContext
    ) -> PhaseResult:
        # No messages are produced; the phase name is echoed back as metadata.
        metadata = {"phase": phase.name}
        return PhaseResult(messages=[], adapter_metadata=metadata)

    async def verify_state_query(
        self, query: StateQuery, ctx: AdapterContext
    ) -> StateQueryResult:
        # Queries outside the declared capabilities are reported as missing.
        if query.required_capability not in self.capabilities:
            return StateQueryResult(
                ok=False,
                detail=f"echo adapter does not provide {query.required_capability.value}",
                capability_missing=True,
            )
        return StateQueryResult(ok=True, detail="echo-adapter-always-ok")

    async def teardown(self, ctx: AdapterContext) -> None:  # pragma: no cover - trivial
        return None
# ---------------------------------------------------------------------------
# Registry
# ---------------------------------------------------------------------------
def test_register_adapter_adds_to_registry_and_get_adapter_resolves() -> None:
    """Registering an adapter makes it reachable via ADAPTERS and get_adapter."""
    snapshot = dict(ADAPTERS)
    try:
        register_adapter(_EchoAdapter)
        registered = ADAPTERS["echo-test-adapter"]
        assert registered is _EchoAdapter
        resolved = get_adapter("echo-test-adapter")
        assert resolved is _EchoAdapter
    finally:
        # Restore the registry so other tests see it untouched.
        ADAPTERS.clear()
        ADAPTERS.update(snapshot)
def test_register_adapter_rejects_duplicate_name() -> None:
    """A second adapter class reusing a registered name raises ValueError."""

    class _OtherEcho(AgentAdapter):
        # Same name as _EchoAdapter on purpose.
        name = "echo-test-adapter"
        capabilities = {AdapterCapability.FILES}

        async def setup(self, ctx: AdapterContext) -> None:  # pragma: no cover
            return None

        async def run_phase(self, phase, ctx) -> PhaseResult:  # pragma: no cover
            return PhaseResult()

        async def verify_state_query(self, query, ctx) -> StateQueryResult:  # pragma: no cover
            return StateQueryResult(ok=False, capability_missing=True)

        async def teardown(self, ctx: AdapterContext) -> None:  # pragma: no cover
            return None

    snapshot = dict(ADAPTERS)
    try:
        register_adapter(_EchoAdapter)
        with pytest.raises(ValueError):
            register_adapter(_OtherEcho)
    finally:
        # Restore the registry so other tests see it untouched.
        ADAPTERS.clear()
        ADAPTERS.update(snapshot)
def test_register_adapter_requires_name() -> None:
    """An adapter class that declares no `name` cannot be registered."""

    class _Nameless(AgentAdapter):
        # Deliberately declares capabilities but no `name`.
        capabilities = {AdapterCapability.FILES}

        async def setup(self, ctx: AdapterContext) -> None:  # pragma: no cover
            return None

        async def run_phase(self, phase, ctx) -> PhaseResult:  # pragma: no cover
            return PhaseResult()

        async def verify_state_query(self, query, ctx) -> StateQueryResult:  # pragma: no cover
            return StateQueryResult(ok=False, capability_missing=True)

        async def teardown(self, ctx: AdapterContext) -> None:  # pragma: no cover
            return None

    with pytest.raises(ValueError):
        register_adapter(_Nameless)
def test_get_adapter_raises_for_unknown_name() -> None:
    """Looking up a name that was never registered raises KeyError."""
    unknown_name = "no-such-adapter-exists"
    with pytest.raises(KeyError):
        get_adapter(unknown_name)
# ---------------------------------------------------------------------------
# Capability gating helpers
# ---------------------------------------------------------------------------
def _file_task() -> CanonicalTask:
    """Return a canonical single-turn task with one file check and one execution check."""
    user = SimulatedUser(max_turns=1, turns=[UserTurn(message="Do a thing.")])
    completion = CompletionSpec(
        files=[FileState(path="out.txt", exists=True)],
        execution_checks=[ExecutionCheck(name="ok", command="true")],
    )
    definition = TaskDefinition(
        id="capability-test",
        name="capability test",
        tier=Tier.TIER1,
        family=TaskFamily.CODING,
        surface="coding",
        setup=TaskSetup(),
        user=user,
        completion=completion,
    )
    return from_task_definition(definition)
def test_supports_is_true_when_capabilities_cover_task() -> None:
    """The echo adapter covers the file task, so nothing is reported missing."""
    canonical = _file_task()
    assert _EchoAdapter.supports(canonical)
    missing = _EchoAdapter.missing_capabilities_for(canonical)
    assert missing == set()
def test_supports_is_false_when_task_needs_more() -> None:
    """Adding MEMORY to the requirements makes the echo adapter insufficient."""
    base = _file_task()
    widened = base.required_adapter_capabilities | {AdapterCapability.MEMORY}
    demanding = base.model_copy(update={"required_adapter_capabilities": widened})
    assert not _EchoAdapter.supports(demanding)
    missing = _EchoAdapter.missing_capabilities_for(demanding)
    assert missing == {AdapterCapability.MEMORY}
# ---------------------------------------------------------------------------
# Context roundtrip (sanity: adapter methods can build and return
# PhaseResult / StateQueryResult without tripping dataclass defaults)
# ---------------------------------------------------------------------------
def test_adapter_phase_result_round_trip(tmp_path: Path) -> None:
    """Drive setup → run_phase → verify_state_query → teardown on the echo adapter."""
    canonical = _file_task()
    echo = _EchoAdapter()
    context = AdapterContext(
        task=canonical,
        workspace=tmp_path,
        runtime_values={},
        run_index=0,
        model="test-model",
        transcript=Transcript(),
    )
    import asyncio

    async def _exercise() -> None:
        await echo.setup(context)
        phase_result = await echo.run_phase(canonical.phases[0], context)
        assert isinstance(phase_result, PhaseResult)
        assert phase_result.adapter_metadata == {"phase": canonical.phases[0].name}
        # MEMORY is not among the echo adapter's capabilities.
        memory_query = StateQuery(
            kind="memory",
            required_capability=AdapterCapability.MEMORY,
            selector={"key_pattern": "x"},
        )
        outcome = await echo.verify_state_query(memory_query, context)
        assert outcome.capability_missing is True
        await echo.teardown(context)

    asyncio.run(_exercise())

View File

@ -0,0 +1,268 @@
"""Tests for `clawbench.canonical.convert.from_task_definition`.
Covers the three representative task shapes:
1. A files + execution-only task (tier-1 bugfix) must produce
`required_adapter_capabilities == {FILES, EXECUTION}`.
2. A memory-using, multi-phase task (tier-2 memory roundtrip) must
include `MEMORY` and MULTI_TURN_INJECTION is NOT set since each
phase's user has exactly one static turn.
3. A synthetic task exercising gateway_assertions, session, cron, and
browser must surface each capability.
The tests also round-trip the real task corpus through the converter
to make sure every live YAML file produces a valid `CanonicalTask`
(no missing-field or validation errors), since the converter is how
every downstream adapter will see tasks.
"""
from __future__ import annotations
from clawbench.canonical import (
AdapterCapability,
CanonicalTask,
from_task_definition,
)
from clawbench.schemas import (
BackgroundService,
CompletionSpec,
CronState,
ExecutionCheck,
FileState,
GatewayAssertion,
MemoryState,
SessionState,
SimulatedUser,
TaskDefinition,
TaskFamily,
TaskSetup,
Tier,
UserTurn,
)
from clawbench.tasks import load_all_tasks
# ---------------------------------------------------------------------------
# Fixture builders
# ---------------------------------------------------------------------------
def _files_only_task() -> TaskDefinition:
    """Tier-1 coding task that only checks one file and one pytest command."""
    user = SimulatedUser(
        max_turns=2,
        turns=[UserTurn(message="Fix the bug and run the tests.")],
    )
    completion = CompletionSpec(
        files=[FileState(path="src/main.py", exists=True)],
        execution_checks=[ExecutionCheck(name="tests", command="pytest -q")],
    )
    return TaskDefinition(
        id="test-files-only",
        name="Files-only task",
        tier=Tier.TIER1,
        family=TaskFamily.CODING,
        surface="coding",
        setup=TaskSetup(asset_packs=["pack_a"]),
        user=user,
        completion=completion,
    )
def _memory_task() -> TaskDefinition:
    """Tier-2 two-phase task: store a fact, then recall it from memory."""

    def _single_turn_phase(name: str, message: str) -> dict:
        # Each phase has exactly one static user turn.
        return {
            "name": name,
            "user": SimulatedUser(max_turns=1, turns=[UserTurn(message=message)]),
        }

    seeded_setup = TaskSetup(
        memory_seed=[{"key": "existing_key", "value": "existing_value"}],
    )
    phases = [
        _single_turn_phase("store", "Remember: stack = React, Node, Postgres."),
        _single_turn_phase("recall", "What's my stack?"),
    ]
    completion = CompletionSpec(
        memory=[MemoryState(key_pattern="stack", exists=True, value_contains=["React"])],
    )
    return TaskDefinition(
        id="test-memory-roundtrip",
        name="Memory roundtrip",
        tier=Tier.TIER2,
        family=TaskFamily.MULTI_TOOL,
        surface="tools",
        setup=seeded_setup,
        phases=phases,
        completion=completion,
    )
def _full_surface_task() -> TaskDefinition:
    """Synthetic tier-3 browser task touching session, cron, gateway and a dynamic turn."""
    no_agents_yet = GatewayAssertion(
        method="agents.list",
        assert_path="$.count",
        assert_equals=0,
    )
    echo_service = BackgroundService(
        name="echo-service",
        command="python3 -m http.server",
        port=0,
        ready_path="/",
    )
    # The second turn only fires after a failed browser tool call.
    retry_turn = UserTurn(
        message="Try again.",
        when_tool_family="browser",
        when_last_tool_failed=True,
    )
    user = SimulatedUser(
        max_turns=4,
        turns=[UserTurn(message="Start the task."), retry_turn],
    )
    one_memory_entry = GatewayAssertion(
        method="memory.list",
        assert_path="$.count",
        assert_equals=1,
    )
    completion = CompletionSpec(
        session=SessionState(should_exist=True, model_should_be="claude-opus-4"),
        cron=[CronState(exists=True, description_contains="daily")],
        gateway_assertions=[one_memory_entry],
    )
    return TaskDefinition(
        id="test-full-surface",
        name="Full surface",
        tier=Tier.TIER3,
        family=TaskFamily.BROWSER,
        surface="browser",
        setup=TaskSetup(
            pre_check_gateway=[no_agents_yet],
            background_services=[echo_service],
        ),
        user=user,
        completion=completion,
    )
# ---------------------------------------------------------------------------
# Capability inference
# ---------------------------------------------------------------------------
def test_files_only_task_requires_only_files_and_execution() -> None:
    """A files+execution task needs exactly those capabilities and keeps its axes."""
    axes = {
        "category": "software_engineering",
        "domain": "devtools",
        "functionality": ["bugfix", "test_verification"],
        "trace_distribution": ["read_heavy", "edit_heavy", "execute_heavy"],
        "tool_surface": ["filesystem", "shell"],
        "risk_tags": ["code_regression"],
    }
    task = _files_only_task()
    for field, value in axes.items():
        # Hand the task its own copy so the expectations stay independent.
        setattr(task, field, list(value) if isinstance(value, list) else value)
    canonical = from_task_definition(task)
    assert isinstance(canonical, CanonicalTask)
    assert canonical.required_adapter_capabilities == {
        AdapterCapability.FILES,
        AdapterCapability.EXECUTION,
    }
    for field, value in axes.items():
        assert getattr(canonical, field) == value
    # Seed state should carry the asset pack through.
    seeds = canonical.assets.seed_state
    assert len(seeds) == 1
    assert seeds[0].kind == "file"
    assert seeds[0].asset_pack == "pack_a"
    # File + execution checks carry over.
    verifier = canonical.verifier
    assert len(verifier.file_states) == 1
    assert len(verifier.execution_checks) == 1
    assert verifier.state_queries == []
    # One non-dynamic phase → no dynamic-trigger capability.
    assert canonical.interaction.uses_dynamic_user_triggers is False
def test_memory_task_requires_memory_capability() -> None:
    """The memory round-trip task needs MEMORY but not multi-turn injection."""
    canonical = from_task_definition(_memory_task())
    required = canonical.required_adapter_capabilities
    assert AdapterCapability.MEMORY in required
    # Two phases with a single static turn each → dynamic-trigger is NOT
    # required (the simulated user just sends one message per phase).
    assert AdapterCapability.MULTI_TURN_INJECTION not in required
    assert canonical.interaction.allow_multi_phase is True
    assert len(canonical.phases) == 2
    # Memory seed lifted to SeedEntry.
    memory_seeds = [seed for seed in canonical.assets.seed_state if seed.kind == "memory"]
    assert len(memory_seeds) == 1
    assert memory_seeds[0].key == "existing_key"
    # Memory completion check → StateQuery with MEMORY capability.
    memory_queries = [query for query in canonical.verifier.state_queries if query.kind == "memory"]
    assert len(memory_queries) == 1
    only_query = memory_queries[0]
    assert only_query.required_capability is AdapterCapability.MEMORY
    assert only_query.selector == {"key_pattern": "stack"}
    assert only_query.expected == {"value_contains": ["React"]}
def test_full_surface_task_surfaces_every_capability() -> None:
    """Each surface used by the synthetic task shows up as a required capability."""
    canonical = from_task_definition(_full_surface_task())
    caps = canonical.required_adapter_capabilities
    expected_caps = (
        AdapterCapability.FILES,
        AdapterCapability.EXECUTION,
        AdapterCapability.SESSION,
        AdapterCapability.CRON,
        AdapterCapability.GATEWAY_RPC,
        AdapterCapability.BROWSER,
        # Dynamic turn (when_tool_family + when_last_tool_failed) flags MTI.
        AdapterCapability.MULTI_TURN_INJECTION,
    )
    for capability in expected_caps:
        assert capability in caps
    verifier = canonical.verifier
    # pre_check_gateway survives as a pre-run query.
    assert len(verifier.pre_run_queries) == 1
    assert verifier.pre_run_queries[0].required_capability is AdapterCapability.GATEWAY_RPC
    # gateway_assertions route through the verifier state_queries.
    custom_queries = [query for query in verifier.state_queries if query.kind == "custom"]
    assert len(custom_queries) == 1
    assert custom_queries[0].selector["method"] == "memory.list"
    # Session state with model constraint surfaces in expected.
    session_queries = [query for query in verifier.state_queries if query.kind == "session"]
    assert len(session_queries) == 1
    assert session_queries[0].expected == {"model": "claude-opus-4"}
def test_background_services_pass_through_unchanged() -> None:
    """The single background service keeps its name and command after conversion."""
    services = from_task_definition(_full_surface_task()).assets.background_services
    assert len(services) == 1
    echo = services[0]
    assert echo.name == "echo-service"
    assert echo.command == "python3 -m http.server"
# ---------------------------------------------------------------------------
# Whole-corpus smoke
# ---------------------------------------------------------------------------
def test_every_task_in_corpus_converts() -> None:
    """Convert every shipped task and check the invariants all of them share.

    This is a regression gate: a task whose completion shape the converter
    cannot translate raises here, while TaskDefinition fields the converter
    simply ignores are not expected to break conversion.
    """
    corpus = load_all_tasks()
    assert corpus, "expected at least one task in the corpus"
    always_required = (AdapterCapability.FILES, AdapterCapability.EXECUTION)
    for task in corpus:
        canonical = from_task_definition(task)
        # Every canonical task must declare FILES + EXECUTION capability.
        for capability in always_required:
            assert capability in canonical.required_adapter_capabilities
        # Phases always have at least one entry (normalized_phases fills
        # one from `user` when `phases` is absent).
        assert canonical.phases, f"{task.id}: canonical phases empty"
        # Budgets honour the source timeout.
        assert canonical.budgets.timeout_seconds == task.timeout_seconds

View File

@ -37,6 +37,42 @@ def test_gateway_config_invalid_env_falls_back_to_default(monkeypatch, caplog, r
assert any("CLAWBENCH_CONNECT_TIMEOUT" in r.getMessage() for r in caplog.records)
@pytest.mark.asyncio
async def test_gateway_client_disables_websocket_keepalive_for_long_rpc(
    monkeypatch: pytest.MonkeyPatch,
):
    """connect() must pass ping_interval=None and ping_timeout=None to websockets."""
    captured_kwargs: dict[str, object] = {}

    class FakeWebSocket:
        async def close(self) -> None:
            return None

    async def fake_connect(*args, **kwargs):
        # Record the keyword arguments the client hands to websockets.connect.
        captured_kwargs.update(kwargs)
        return FakeWebSocket()

    async def fake_wait_event(self, event_name: str, *, timeout: float):
        return {"payload": {"nonce": ""}}

    async def fake_rpc(self, method: str, params=None, **kwargs):
        return {"payload": {"type": "hello-ok", "protocol": 3}}

    async def fake_listener(self):
        await asyncio.sleep(60)

    monkeypatch.setattr("clawbench.client.websockets.connect", fake_connect)
    client_patches = {
        "_wait_event": fake_wait_event,
        "_rpc": fake_rpc,
        "_listener": fake_listener,
    }
    for attribute, replacement in client_patches.items():
        monkeypatch.setattr(GatewayClient, attribute, replacement)

    client = GatewayClient(GatewayConfig(connect_timeout=2))
    await client.connect()
    await client.close()

    for option in ("ping_interval", "ping_timeout"):
        assert captured_kwargs[option] is None
def test_tool_results_are_correlated_back_to_tool_calls():
tool_message = _parse_single_message(
{
@ -106,7 +142,7 @@ async def test_gateway_client_retries_transient_drain_errors(monkeypatch: pytest
async def fake_wait_event(self, event_name: str, *, timeout: float):
return {"payload": {"nonce": ""}}
async def fake_rpc(self, method: str, params=None):
async def fake_rpc(self, method: str, params=None, **kwargs):
return {"payload": {"type": "hello-ok", "protocol": 3}}
async def fake_listener(self):
@ -143,7 +179,7 @@ async def test_gateway_client_retries_half_closed_handshake_errors(
async def fake_wait_event(self, event_name: str, *, timeout: float):
return {"payload": {"nonce": ""}}
async def fake_rpc(self, method: str, params=None):
async def fake_rpc(self, method: str, params=None, **kwargs):
return {"payload": {"type": "hello-ok", "protocol": 3}}
async def fake_listener(self):
@ -192,3 +228,71 @@ async def test_send_and_wait_collects_messages_that_arrive_after_final_state():
transcript = await client.send_and_wait(session_key, "hello", timeout=1.0)
assert [message.text for message in transcript.assistant_messages] == ["Late but valid."]
@pytest.mark.asyncio
async def test_send_and_wait_passes_gateway_timeout_and_waits_for_run():
    """`send_and_wait` forwards the timeout to the gateway and waits on the run.

    The client's `_rpc` is replaced by a recorder that answers
    `sessions.send`, `agent.wait` and `sessions.get` with canned payloads.
    The test then checks the parameters sent for the first two methods and
    the assistant text returned in the transcript.
    """
    gateway = GatewayClient(GatewayConfig(request_timeout=1))
    session_key = "session-1"
    recorded: list[tuple[str, dict | None, dict]] = []

    async def fake_rpc(method: str, params=None, **kwargs):
        recorded.append((method, params, kwargs))
        # Rebuilt on every call so each caller receives fresh payload objects.
        payloads = {
            "sessions.send": {"runId": "run-1"},
            "agent.wait": {"runId": "run-1", "status": "completed"},
            "sessions.get": {
                "messages": [
                    {
                        "role": "assistant",
                        "content": [{"type": "text", "text": "Done."}],
                    }
                ]
            },
        }
        return {"ok": True, "payload": payloads.get(method, {})}

    gateway._rpc = fake_rpc  # type: ignore[method-assign]

    transcript = await gateway.send_and_wait(session_key, "hello", timeout=1.5)

    _, send_params, _ = next(entry for entry in recorded if entry[0] == "sessions.send")
    # The idempotency key is generated by the client, so it is compared to itself.
    assert send_params == {
        "key": session_key,
        "message": "hello",
        "idempotencyKey": send_params["idempotencyKey"],
        "timeoutMs": 1500,
    }

    _, wait_params, wait_kwargs = next(entry for entry in recorded if entry[0] == "agent.wait")
    assert wait_params == {"runId": "run-1", "timeoutMs": 1500}
    assert wait_kwargs["timeout"] == 11.5

    assert [message.text for message in transcript.assistant_messages] == ["Done."]
@pytest.mark.asyncio
async def test_send_and_wait_aborts_run_when_no_terminal_state_arrives():
    """A run whose `agent.wait` never returns in time triggers `sessions.abort`.

    The fake `agent.wait` sleeps for 60 seconds while the caller's timeout is
    0.01 seconds. The test asserts that an abort RPC carrying the session key
    and run id was recorded.
    """
    gateway = GatewayClient(GatewayConfig(request_timeout=1))
    session_key = "session-1"
    recorded: list[tuple[str, dict | None, dict]] = []

    async def fake_rpc(method: str, params=None, **kwargs):
        recorded.append((method, params, kwargs))
        if method == "agent.wait":
            # Simulate a run that never reaches a terminal state in time.
            await asyncio.sleep(60)
            return {"ok": True, "payload": {}}
        payloads = {
            "sessions.send": {"runId": "run-timeout"},
            "sessions.abort": {"status": "aborted"},
            "sessions.get": {"messages": []},
        }
        return {"ok": True, "payload": payloads.get(method, {})}

    gateway._rpc = fake_rpc  # type: ignore[method-assign]

    await gateway.send_and_wait(session_key, "hello", timeout=0.01)

    expected_abort = (
        "sessions.abort",
        {"key": session_key, "runId": "run-timeout"},
        {"timeout": 1},
    )
    assert expected_abort in recorded

View File

@ -3,8 +3,24 @@ from pathlib import Path
import pytest
from clawbench.client import GatewayConfig
from clawbench.adapters.base import AdapterContext, AgentAdapter, PhaseResult, StateQueryResult
from clawbench.canonical import AdapterCapability, CanonicalPhase, StateQuery
from clawbench.harness import BenchmarkHarness
from clawbench.schemas import CompletionResult, JudgeResult, TaskRunResult
from clawbench.schemas import (
CompletionResult,
CompletionSpec,
FileState,
JudgeExpectations,
JudgeResult,
SimulatedUser,
TaskDefinition,
TaskFamily,
TaskRunResult,
Tier,
Transcript,
TranscriptMessage,
UserTurn,
)
from clawbench.tasks import load_all_tasks
@ -118,7 +134,13 @@ def test_aggregate_reports_advisory_judge_metrics():
def test_compose_result_from_task_stats_supports_parallel_environment_metadata():
task = next(task for task in load_all_tasks() if task.id == "t1-bugfix-discount")
task = next(task for task in load_all_tasks() if task.id == "t1-bugfix-discount").model_copy(deep=True)
task.category = "software_engineering"
task.domain = "devtools"
task.functionality = ["bugfix", "regression_repair", "test_verification"]
task.trace_distribution = ["read_heavy", "edit_heavy", "execute_heavy", "recovery_heavy"]
task.tool_surface = ["filesystem", "shell"]
task.risk_tags = ["code_change"]
harness = BenchmarkHarness(
gateway_config=GatewayConfig(),
model="test-model",
@ -163,6 +185,29 @@ def test_compose_result_from_task_stats_supports_parallel_environment_metadata()
assert merged_result.environment["parallel_lanes"] == 2
assert merged_result.environment["requested_parallel_lanes"] == 3
assert merged_result.environment["browser_tasks_serialized"] is False
assert merged_result.environment["dimension_coverage"] == {
"category": 1,
"domain": 1,
"functionality": 3,
"trace_distribution": 4,
"tool_surface": 2,
"risk_tag": 1,
}
assert merged_result.task_results[0].category == "software_engineering"
assert merged_result.task_results[0].domain == "devtools"
category = {item.value: item for item in merged_result.category_results}
assert category["software_engineering"].task_ids == [task.id]
assert category["software_engineering"].weighted_score == pytest.approx(
base_result.overall_weighted_query_score
)
functionality_values = {item.value for item in merged_result.functionality_results}
assert {"bugfix", "regression_repair", "test_verification"}.issubset(functionality_values)
trace_values = {item.value for item in merged_result.trace_distribution_results}
assert {"read_heavy", "edit_heavy", "execute_heavy", "recovery_heavy"}.issubset(trace_values)
assert "category" in merged_result.dimension_results
assert merged_result.dimension_results["category"] == merged_result.category_results
@pytest.mark.asyncio
@ -206,7 +251,7 @@ async def test_run_rejects_registered_but_unwired_adapter(monkeypatch):
harness = BenchmarkHarness(
gateway_config=GatewayConfig(),
model="test-model",
adapter="hermes",
adapter="codex",
runs_per_task=1,
randomize_order=False,
print_report=False,
@ -215,3 +260,182 @@ async def test_run_rejects_registered_but_unwired_adapter(monkeypatch):
with pytest.raises(ValueError, match="not yet wired"):
await harness.run()
def _files_only_definition(judge: JudgeExpectations | None = None) -> TaskDefinition:
    """Build a single-turn tier-1 coding task whose only completion check is a file.

    The task asks for `answer.txt` and passes when that file exists and
    contains "done". `judge` is forwarded unchanged to the task definition.
    """
    single_turn_user = SimulatedUser(
        max_turns=1,
        turns=[UserTurn(message="Create answer.txt")],
    )
    answer_file = FileState(path="answer.txt", exists=True, content_contains=["done"])
    return TaskDefinition(
        id="adapter-files-only",
        name="Adapter files only",
        tier=Tier.TIER1,
        family=TaskFamily.CODING,
        surface="coding",
        user=single_turn_user,
        completion=CompletionSpec(files=[answer_file]),
        judge=judge,
    )
class FakeAgentAdapter(AgentAdapter):
    """Adapter test double that "solves" the files-only task without any agent.

    It is named "hermes" and declares the FILES and EXECUTION capabilities.
    """

    name = "hermes"
    capabilities = {AdapterCapability.FILES, AdapterCapability.EXECUTION}

    async def setup(self, ctx: AdapterContext) -> None:
        """No preparation is required for the fake."""

    async def run_phase(self, phase: CanonicalPhase, ctx: AdapterContext) -> PhaseResult:
        """Write `answer.txt` into the workspace and record one assistant message."""
        answer_path = ctx.workspace / "answer.txt"
        answer_path.write_text("done\n", encoding="utf-8")
        reply = TranscriptMessage(role="assistant", text="Created answer.txt and verified it.")
        ctx.transcript.messages.append(reply)
        return PhaseResult(messages=[reply], completed_normally=True)

    async def verify_state_query(self, query: StateQuery, ctx: AdapterContext) -> StateQueryResult:
        """Report every state query as unsupported by this adapter."""
        return StateQueryResult(ok=False, capability_missing=True)

    async def teardown(self, ctx: AdapterContext) -> None:
        """Nothing to clean up."""
@pytest.mark.asyncio
async def test_hermes_adapter_runs_through_scoring_harness(monkeypatch, tmp_path: Path):
    """A harness run with adapter="hermes" scores the fake adapter's output as a pass."""
    files_task = _files_only_definition()
    # Replace task loading and adapter lookup so only the fake is exercised.
    monkeypatch.setattr("clawbench.harness.load_all_tasks", lambda **_: [files_task])
    monkeypatch.setattr("clawbench.harness.get_adapter", lambda name: FakeAgentAdapter)
    monkeypatch.setenv("OPENCLAW_STATE_DIR", str(tmp_path))
    monkeypatch.setenv("CLAWBENCH_RUN_CACHE_DIR", "")

    harness = BenchmarkHarness(
        gateway_config=GatewayConfig(),
        model="openai/gpt-5.5",
        adapter="hermes",
        runs_per_task=1,
        randomize_order=False,
        print_report=False,
        quiet=True,
    )
    result = await harness.run()
    first_run = harness.last_task_runs[files_task.id][0]

    environment = result.environment
    assert environment["adapter"] == "hermes"
    assert environment["executable_adapters"] == ["hermes", "openclaw"]
    assert first_run.error is None
    assert first_run.completion_result.score == 1.0
    assert first_run.delivery_outcome.value == "pass"
@pytest.mark.asyncio
async def test_openclaw_uses_shared_adapter_scoring_path(monkeypatch, tmp_path: Path):
    """A harness run with adapter="openclaw" is scored through the same adapter path."""
    files_task = _files_only_definition()
    # Replace task loading and adapter lookup so only the fake is exercised.
    monkeypatch.setattr("clawbench.harness.load_all_tasks", lambda **_: [files_task])
    monkeypatch.setattr("clawbench.harness.get_adapter", lambda name: FakeAgentAdapter)
    monkeypatch.setenv("OPENCLAW_STATE_DIR", str(tmp_path))
    monkeypatch.setenv("CLAWBENCH_RUN_CACHE_DIR", "")

    harness = BenchmarkHarness(
        gateway_config=GatewayConfig(),
        model="openai/gpt-5.5",
        adapter="openclaw",
        runs_per_task=1,
        randomize_order=False,
        print_report=False,
        quiet=True,
    )
    result = await harness.run()
    first_run = harness.last_task_runs[files_task.id][0]

    assert result.environment["adapter"] == "openclaw"
    assert first_run.error is None
    assert first_run.completion_result.score == 1.0
    assert first_run.delivery_outcome.value == "pass"
@pytest.mark.asyncio
async def test_adapter_scoring_uses_advisory_judge(monkeypatch, tmp_path: Path):
    """With a judge configured, the adapter run records the judge's score.

    The gateway client used by the harness is replaced by a fake that checks
    the arguments of each call and replies with a fixed JSON verdict.
    """
    judged_task = _files_only_definition(
        JudgeExpectations(
            rubric="Reward the answer when it is concise.",
            artifact_paths=["answer.txt"],
            passing_threshold=0.4,
        )
    )
    monkeypatch.setattr("clawbench.harness.load_all_tasks", lambda **_: [judged_task])
    monkeypatch.setattr("clawbench.harness.get_adapter", lambda name: FakeAgentAdapter)
    monkeypatch.setenv("OPENCLAW_STATE_DIR", str(tmp_path))
    monkeypatch.setenv("CLAWBENCH_RUN_CACHE_DIR", "")

    class FakeJudgeGateway:
        """Stand-in gateway that validates each judge call and returns a canned verdict."""

        async def __aenter__(self):
            return self

        async def __aexit__(self, *exc):
            return None

        async def create_session(self, *, model: str, label: str) -> str:
            assert model == "judge-model"
            assert label.startswith("clawbench-judge-")
            return "judge-session"

        async def subscribe(self, session_key: str) -> None:
            assert session_key == "judge-session"

        async def send_and_wait(self, session_key: str, message: str):
            assert session_key == "judge-session"
            # The prompt must include the artifact content written by the fake adapter.
            assert "done" in message
            verdict = TranscriptMessage(
                role="assistant",
                text='{"score": 0.5, "confidence": 0.8, "reason": "OK", "rubric_hits": [], "rubric_misses": []}',
            )
            return Transcript(messages=[verdict])

        async def delete_session(self, session_key: str) -> None:
            assert session_key == "judge-session"

    monkeypatch.setattr("clawbench.harness.GatewayClient", lambda config: FakeJudgeGateway())

    harness = BenchmarkHarness(
        gateway_config=GatewayConfig(),
        model="openai/gpt-5.5",
        adapter="hermes",
        judge_model="judge-model",
        runs_per_task=1,
        randomize_order=False,
        print_report=False,
        quiet=True,
    )
    result = await harness.run()
    first_run = harness.last_task_runs[judged_task.id][0]

    assert first_run.judge_result.enabled is True
    assert first_run.judge_result.score == pytest.approx(0.5)
    assert first_run.run_score == pytest.approx(0.95)
    assert result.overall_judge_score == pytest.approx(0.5)
@pytest.mark.asyncio
async def test_hermes_adapter_filters_incompatible_tasks(monkeypatch):
    """Running only a task the fake adapter cannot serve raises a ValueError."""
    wanted_id = "t4-memory-recall-continuation"
    memory_task = next(candidate for candidate in load_all_tasks() if candidate.id == wanted_id)
    monkeypatch.setattr("clawbench.harness.load_all_tasks", lambda **_: [memory_task])
    monkeypatch.setattr("clawbench.harness.get_adapter", lambda name: FakeAgentAdapter)

    harness = BenchmarkHarness(
        gateway_config=GatewayConfig(),
        model="openai/gpt-5.5",
        adapter="hermes",
        runs_per_task=1,
        randomize_order=False,
        print_report=False,
        quiet=True,
    )

    with pytest.raises(ValueError, match="No selected tasks are compatible"):
        await harness.run()

View File

@ -0,0 +1,463 @@
"""Tests for `HermesAdapter` against a stub `MiniSWERunner`.
We don't pull in the real `hermes-agent` package — the adapter is
driven through its `runner_factory` hook, which lets tests plug in a
fixed conversation without any network / subprocess activity.
What's covered:
- The adapter registers under the `"hermes"` name.
- `capabilities` is the minimal `{FILES, EXECUTION}` set.
- `setup` realises memory seed entries as workspace files.
- `run_phase` renders the user turn, calls the stub runner, and
appends the parsed conversation into the shared transcript.
- `verify_state_query` falls back to workspace memory scanning for
memory queries, and returns `capability_missing=True` for other
kinds.
- Task gating: a task that requires MEMORY / SESSION / CRON is NOT
supported by HermesAdapter; a files-only task is.
"""
from __future__ import annotations
import asyncio
from pathlib import Path
from clawbench.adapters import get_adapter
from clawbench.adapters.base import AdapterContext, StateQueryResult
from clawbench.adapters.hermes import HermesAdapter, HermesAdapterConfig
from clawbench.canonical import (
AdapterCapability,
CanonicalTask,
StateQuery,
)
from clawbench.canonical.convert import from_task_definition
from clawbench.schemas import (
CompletionSpec,
ExecutionCheck,
FileState,
MemoryState,
SimulatedUser,
TaskDefinition,
TaskFamily,
TaskSetup,
Tier,
Transcript,
UserTurn,
)
# ---------------------------------------------------------------------------
# Stub MiniSWERunner
# ---------------------------------------------------------------------------
class _StubRunner:
    """Stand-in for `MiniSWERunner` that replays a fixed conversation.

    Records the last prompt and the number of `run_task` calls so tests can
    inspect what the adapter sent.
    """

    def __init__(self, *, model: str, cwd: str, **_: object) -> None:
        # Extra keyword arguments are accepted and ignored.
        self.model = model
        self.cwd = cwd
        self.last_prompt: str | None = None
        self.calls = 0

        user_entry = {"from": "user", "value": "placeholder — filled per-test"}
        assistant_entry = {
            "from": "assistant",
            "value": (
                "Running `ls`.\n"
                '<tool_call>{"name":"bash","arguments":{"cmd":"ls"}}</tool_call>'
            ),
        }
        tool_entry = {
            "from": "tool",
            "value": '<tool_response>{"stdout":"main.py"}</tool_response>',
        }
        self.conversation = {
            "conversations": [user_entry, assistant_entry, tool_entry],
            "completed": True,
            "api_calls": 3,
            "metadata": {"model": "stub", "env_type": "local"},
        }

    def run_task(self, prompt: str) -> dict:
        """Return the canned conversation with user entries carrying `prompt`.

        The stored template is left untouched; a shallow copy is returned
        with a freshly built "conversations" list.
        """
        self.last_prompt = prompt
        self.calls += 1

        # Substitute the real prompt so the conversation reflects what the
        # adapter actually sent.
        rewritten = []
        for entry in self.conversation["conversations"]:
            if entry.get("from") == "user":
                rewritten.append({"from": "user", "value": prompt})
            else:
                rewritten.append(entry)

        result = dict(self.conversation)
        result["conversations"] = rewritten
        return result
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
def _files_only_task(memory_seed: bool = False) -> CanonicalTask:
    """Build a canonical single-turn coding task checked by a file and a command.

    When `memory_seed` is true the task setup carries one seed entry
    (`stack` = "React, Node"); otherwise the setup is empty.
    """
    if memory_seed:
        setup = TaskSetup(memory_seed=[{"key": "stack", "value": "React, Node"}])
    else:
        setup = TaskSetup()

    definition = TaskDefinition(
        id="hermes-files-only",
        name="Hermes files-only",
        tier=Tier.TIER1,
        family=TaskFamily.CODING,
        surface="coding",
        setup=setup,
        user=SimulatedUser(
            max_turns=1,
            turns=[UserTurn(message="List the workspace files.")],
        ),
        completion=CompletionSpec(
            files=[FileState(path="main.py", exists=True)],
            execution_checks=[ExecutionCheck(name="noop", command="true")],
        ),
    )
    return from_task_definition(definition)
def _memory_task() -> CanonicalTask:
    """Build a canonical tier-2 task whose completion depends on a memory entry.

    The completion spec expects a memory key matching "stack" to exist and to
    contain "React".
    """
    stack_memory = MemoryState(key_pattern="stack", exists=True, value_contains=["React"])
    definition = TaskDefinition(
        id="hermes-memory",
        name="Hermes memory",
        tier=Tier.TIER2,
        family=TaskFamily.MULTI_TOOL,
        surface="tools",
        setup=TaskSetup(),
        user=SimulatedUser(max_turns=1, turns=[UserTurn(message="remember stack=X")]),
        completion=CompletionSpec(memory=[stack_memory]),
    )
    return from_task_definition(definition)
def _make_adapter() -> tuple[HermesAdapter, list[_StubRunner]]:
    """Create a `HermesAdapter` wired to stub runners.

    Returns the adapter together with a list that collects every
    `_StubRunner` the adapter's factory creates, in creation order.
    """
    created: list[_StubRunner] = []

    def _factory(**kwargs):
        stub = _StubRunner(**kwargs)
        created.append(stub)
        return stub

    config = HermesAdapterConfig(model="stub-model", runner_factory=_factory)
    return HermesAdapter(config), created
def _make_ctx(task: CanonicalTask, workspace: Path) -> AdapterContext:
    """Build an `AdapterContext` for `task` rooted at `workspace`.

    The context uses run index 0, the "stub-model" model name, no runtime
    values and a fresh empty transcript.
    """
    fresh_transcript = Transcript()
    return AdapterContext(
        task=task,
        workspace=workspace,
        runtime_values={},
        run_index=0,
        model="stub-model",
        transcript=fresh_transcript,
    )
# ---------------------------------------------------------------------------
# Registration + capability shape
# ---------------------------------------------------------------------------
def test_hermes_adapter_is_registered() -> None:
    """Looking up "hermes" in the adapter registry yields `HermesAdapter` itself."""
    assert get_adapter("hermes") is HermesAdapter
def test_hermes_capabilities_are_files_and_execution_only() -> None:
    """The class-level capability set is exactly FILES plus EXECUTION."""
    expected = {AdapterCapability.FILES, AdapterCapability.EXECUTION}
    assert HermesAdapter.capabilities == expected
def test_hermes_supports_files_only_task() -> None:
    """A task that only needs files and execution is accepted by the adapter."""
    files_task = _files_only_task()
    assert HermesAdapter.supports(files_task)
def test_hermes_does_not_support_memory_task() -> None:
    """A memory-backed task is rejected and MEMORY is reported as missing."""
    memory_task = _memory_task()
    assert not HermesAdapter.supports(memory_task)
    absent = HermesAdapter.missing_capabilities_for(memory_task)
    assert AdapterCapability.MEMORY in absent
def test_hermes_full_agent_capabilities_cover_memory_and_dynamic_tasks() -> None:
    """With `driver_mode="ai_agent"` the adapter accepts the memory task.

    The config-dependent capability set must also include MEMORY, CRON,
    BROWSER and MULTI_TURN_INJECTION.
    """
    memory_task = _memory_task()
    agent_config = HermesAdapterConfig(model="stub-model", driver_mode="ai_agent")
    assert HermesAdapter.supports(memory_task, agent_config)

    caps = HermesAdapter.supported_capabilities(agent_config)
    for required in (
        AdapterCapability.MEMORY,
        AdapterCapability.CRON,
        AdapterCapability.BROWSER,
        AdapterCapability.MULTI_TURN_INJECTION,
    ):
        assert required in caps
# ---------------------------------------------------------------------------
# Lifecycle
# ---------------------------------------------------------------------------
def test_setup_realizes_memory_seed_as_workspace_files(tmp_path: Path) -> None:
    """`setup` writes the seeded memory entry to `memory/stack.md` in the workspace."""
    seeded_task = _files_only_task(memory_seed=True)
    adapter, _ = _make_adapter()

    async def _run_setup() -> None:
        async with adapter:
            await adapter.setup(_make_ctx(seeded_task, tmp_path))

    asyncio.run(_run_setup())

    seed_file = tmp_path / "memory" / "stack.md"
    assert seed_file.is_file()
    assert "React" in seed_file.read_text(encoding="utf-8")
def test_run_phase_sends_rendered_prompt_and_parses_conversation(tmp_path: Path) -> None:
    """`run_phase` passes the user turn to the runner and parses its reply.

    The stub's canned conversation contains one `bash` tool call and its
    response; both must appear in the shared transcript afterwards.
    """
    files_task = _files_only_task()
    adapter, runners = _make_adapter()

    async def _run_first_phase():
        async with adapter:
            context = _make_ctx(files_task, tmp_path)
            await adapter.setup(context)
            phase_result = await adapter.run_phase(files_task.phases[0], context)
            return context, phase_result

    ctx, result = asyncio.run(_run_first_phase())

    # The runner received the rendered user message.
    assert runners
    assert runners[0].last_prompt == "List the workspace files."

    # The conversation was parsed into the shared transcript.
    assert result.error is None
    assert ctx.transcript.tool_call_sequence, "expected tool calls parsed out of Hermes conversation"
    bash_call = ctx.transcript.tool_call_sequence[0]
    assert bash_call.name == "bash"
    assert bash_call.input == {"cmd": "ls"}
    assert "main.py" in bash_call.output
    assert result.adapter_metadata.get("api_calls") == 3
    assert result.completed_normally is True
def test_runner_factory_uses_explicit_provider_instead_of_api_key(tmp_path: Path) -> None:
    """With an explicit provider, the runner factory gets no base URL or API key."""
    files_task = _files_only_task()
    factory_kwargs: list[dict] = []

    def _factory(**kwargs):
        factory_kwargs.append(kwargs)
        return _StubRunner(model=kwargs["model"], cwd=kwargs["cwd"])

    config = HermesAdapterConfig(
        model="stub-model",
        provider="openai-codex",
        base_url="https://example.invalid/v1",
        api_key="secret",
        runner_factory=_factory,
    )
    adapter = HermesAdapter(config)

    async def _run_setup() -> None:
        async with adapter:
            await adapter.setup(_make_ctx(files_task, tmp_path))

    asyncio.run(_run_setup())

    assert factory_kwargs
    first_call = factory_kwargs[0]
    assert first_call["base_url"] is None
    assert first_call["api_key"] is None
def test_direct_openai_endpoint_strips_provider_prefix_for_hermes(tmp_path: Path) -> None:
    """Against the OpenAI endpoint, "openai/gpt-5.4" reaches the runner as "gpt-5.4"."""
    files_task = _files_only_task()
    factory_kwargs: list[dict] = []

    def _factory(**kwargs):
        factory_kwargs.append(kwargs)
        return _StubRunner(model=kwargs["model"], cwd=kwargs["cwd"])

    config = HermesAdapterConfig(
        model="openai/gpt-5.4",
        base_url="https://api.openai.com/v1",
        api_key="secret",
        runner_factory=_factory,
    )
    adapter = HermesAdapter(config)

    async def _run_setup() -> None:
        async with adapter:
            # Built by hand because the context must carry the prefixed model name.
            context = AdapterContext(
                task=files_task,
                workspace=tmp_path,
                runtime_values={},
                run_index=0,
                model="openai/gpt-5.4",
                transcript=Transcript(),
            )
            await adapter.setup(context)
            assert context.adapter_state["effective_model"] == "gpt-5.4"

    asyncio.run(_run_setup())

    assert factory_kwargs
    assert factory_kwargs[0]["model"] == "gpt-5.4"
def test_ai_agent_direct_endpoint_reports_custom_provider(tmp_path: Path) -> None:
    """In `ai_agent` mode the agent factory gets the endpoint, key and provider "custom"."""
    files_task = _files_only_task()
    factory_kwargs: list[dict] = []

    class _StubAgent:
        """Empty placeholder returned by the agent factory."""

    def _factory(**kwargs):
        factory_kwargs.append(kwargs)
        return _StubAgent()

    config = HermesAdapterConfig(
        model="openai/gpt-5.4",
        base_url="https://api.openai.com/v1",
        api_key="secret",
        driver_mode="ai_agent",
        agent_factory=_factory,
    )
    adapter = HermesAdapter(config)

    async def _run_setup() -> None:
        async with adapter:
            context = AdapterContext(
                task=files_task,
                workspace=tmp_path,
                runtime_values={},
                run_index=0,
                model="openai/gpt-5.4",
                transcript=Transcript(),
            )
            await adapter.setup(context)
            assert context.adapter_state["effective_model"] == "gpt-5.4"

    asyncio.run(_run_setup())

    assert factory_kwargs
    first_call = factory_kwargs[0]
    assert first_call["model"] == "gpt-5.4"
    assert first_call["base_url"] == "https://api.openai.com/v1"
    assert first_call["api_key"] == "secret"
    assert first_call["provider"] == "custom"
# ---------------------------------------------------------------------------
# State queries
# ---------------------------------------------------------------------------
def test_memory_query_uses_workspace_fallback(tmp_path: Path) -> None:
    """A memory query succeeds when the workspace holds a matching `MEMORY.md`."""
    memory_task = _memory_task()
    adapter, _ = _make_adapter()

    # Stand in for an earlier run that left a MEMORY.md behind.
    memory_file = tmp_path / "MEMORY.md"
    memory_file.write_text("stack: React, Node, Postgres", encoding="utf-8")

    memory_query = StateQuery(
        kind="memory",
        predicate="exists",
        selector={"key_pattern": "stack"},
        expected={"value_contains": ["React"]},
        required_capability=AdapterCapability.MEMORY,
    )

    async def _verify() -> StateQueryResult:
        async with adapter:
            context = _make_ctx(memory_task, tmp_path)
            await adapter.setup(context)
            return await adapter.verify_state_query(memory_query, context)

    outcome = asyncio.run(_verify())
    assert outcome.ok is True
    assert outcome.capability_missing is False
def test_session_query_is_reported_as_capability_missing(tmp_path: Path) -> None:
    """A session query comes back not-ok and flagged as a missing capability."""
    memory_task = _memory_task()
    adapter, _ = _make_adapter()

    session_query = StateQuery(
        kind="session",
        predicate="exists",
        selector={},
        expected={},
        required_capability=AdapterCapability.SESSION,
    )

    async def _verify() -> StateQueryResult:
        async with adapter:
            context = _make_ctx(memory_task, tmp_path)
            await adapter.setup(context)
            return await adapter.verify_state_query(session_query, context)

    outcome = asyncio.run(_verify())
    assert outcome.capability_missing is True
    assert outcome.ok is False
# ---------------------------------------------------------------------------
# Timeouts
# ---------------------------------------------------------------------------
def test_run_phase_surfaces_runner_timeout(tmp_path: Path) -> None:
    """A runner that outlives the phase timeout yields an error result.

    The runner sleeps for 5 seconds while the phase timeout is set to
    1 second. The result must carry an error mentioning "exceeded" and must
    not be marked as completed normally.
    """
    base_task = _files_only_task()

    class _SlowRunner:
        """Runner whose `run_task` blocks longer than the configured timeout."""

        def __init__(self, **_: object) -> None:
            pass

        def run_task(self, prompt: str) -> dict:
            import time

            time.sleep(5)  # longer than the 1-second phase timeout set below
            return {"conversations": [], "completed": False, "api_calls": 0}

    adapter = HermesAdapter(
        HermesAdapterConfig(
            model="stub-model",
            runner_factory=lambda **kw: _SlowRunner(**kw),
        )
    )

    # Shorten the phase timeout so the test stays fast.
    short_phase = base_task.phases[0].model_copy(update={"timeout_seconds": 1})
    short_task = base_task.model_copy(update={"phases": [short_phase]})

    async def _run_phase():
        async with adapter:
            context = _make_ctx(short_task, tmp_path)
            await adapter.setup(context)
            return await adapter.run_phase(short_task.phases[0], context)

    result = asyncio.run(_run_phase())
    assert result.error is not None
    assert "exceeded" in result.error
    assert result.completed_normally is False

193
tests/test_hermes_xml.py Normal file
View File

@ -0,0 +1,193 @@
"""Tests for `clawbench.adapters.hermes_xml.parse_conversation`.
Covers the Hermes conversation shapes we expect from the wild:
- Plain assistant turn with a single tool call + a following tool_response.
- Multiple tool calls in one assistant turn.
- Assistant turn with free-form text + a tool call.
- A malformed tool_call payload: the parser must recover gracefully
(no raise; surface a best-effort call).
- Name-variant keys (`function`, `parameters`) Hermes-variant models emit.
"""
from __future__ import annotations
from clawbench.adapters.hermes_xml import (
iter_tool_calls_from_conversations,
parse_chat_messages,
parse_conversation,
)
from clawbench.trajectory import annotate_transcript_tool_calls
def _conv(*entries: dict[str, str]) -> dict:
    """Wrap `entries` in a conversation dict marked completed with one API call."""
    conversation = {
        "conversations": [*entries],
        "completed": True,
        "api_calls": 1,
    }
    return conversation
def test_single_tool_call_with_response() -> None:
    """One assistant tool call followed by one tool response is paired up.

    The assistant's free-form text must survive, with the `<tool_call>`
    markup removed from it.
    """
    system_turn = {"from": "system", "value": "You are a helpful coding agent."}
    user_turn = {"from": "user", "value": "List files."}
    assistant_turn = {
        "from": "assistant",
        "value": "I'll run `ls`.\n"
        '<tool_call>{"name":"bash","arguments":{"cmd":"ls"}}</tool_call>',
    }
    tool_turn = {
        "from": "tool",
        "value": '<tool_response>{"stdout":"main.py\\nREADME"}</tool_response>',
    }

    transcript = parse_conversation(_conv(system_turn, user_turn, assistant_turn, tool_turn))

    calls = transcript.tool_call_sequence
    assert len(calls) == 1
    only_call = calls[0]
    assert only_call.name == "bash"
    assert only_call.input == {"cmd": "ls"}
    assert "main.py" in only_call.output
    assert only_call.success is True

    # The assistant text is kept and the tool-call body is stripped from it.
    assistant = next(msg for msg in transcript.messages if msg.role == "assistant")
    assert "I'll run `ls`." in assistant.text
    assert "<tool_call>" not in assistant.text
def test_multiple_tool_calls_in_one_turn() -> None:
    """Two calls in one assistant turn are matched to the two responses in order."""
    assistant_turn = {
        "from": "assistant",
        "value": (
            '<tool_call>{"name":"bash","arguments":{"cmd":"ls"}}</tool_call>'
            '<tool_call>{"name":"bash","arguments":{"cmd":"pwd"}}</tool_call>'
        ),
    }
    first_response = {
        "from": "tool",
        "value": '<tool_response>{"stdout":"a"}</tool_response>',
    }
    second_response = {
        "from": "tool",
        "value": '<tool_response>{"stdout":"/tmp"}</tool_response>',
    }
    convo = _conv(assistant_turn, first_response, second_response)

    calls = iter_tool_calls_from_conversations(convo["conversations"])

    assert len(calls) == 2
    ls_call, pwd_call = calls[0], calls[1]
    assert ls_call.input == {"cmd": "ls"}
    assert pwd_call.input == {"cmd": "pwd"}
    assert ls_call.output == "a"
    assert pwd_call.output == "/tmp"
def test_malformed_json_falls_back_to_best_effort() -> None:
    """A broken tool-call payload does not stop the well-formed one being parsed."""
    broken_then_clean = (
        '<tool_call>{"name":"bash","arguments":{"cmd":"ls"} <-- stray text }</tool_call>'
        '<tool_call>{"name":"bash","arguments":{"cmd":"pwd"}}</tool_call>'
    )
    convo = _conv({"from": "assistant", "value": broken_then_clean})

    calls = iter_tool_calls_from_conversations(convo["conversations"])

    # The first payload is malformed; the parser may recover one or two
    # calls, but it must not raise and must always capture the clean one.
    assert len(calls) >= 1
    assert any(call.input == {"cmd": "pwd"} for call in calls)
def test_name_variants_are_accepted() -> None:
    """`function` / `parameters` keys are read as the call's name and input."""
    variant_payload = '<tool_call>{"function":"bash","parameters":{"cmd":"ls"}}</tool_call>'
    convo = _conv({"from": "assistant", "value": variant_payload})

    calls = iter_tool_calls_from_conversations(convo["conversations"])

    assert len(calls) == 1
    only_call = calls[0]
    assert only_call.name == "bash"
    assert only_call.input == {"cmd": "ls"}
def test_tool_error_marks_call_failed() -> None:
    """A response with an error status marks the call failed and keeps the stderr text."""
    assistant_turn = {
        "from": "assistant",
        "value": '<tool_call>{"name":"bash","arguments":{"cmd":"nonsense"}}</tool_call>',
    }
    error_response = {
        "from": "tool",
        "value": '<tool_response>{"stderr":"command not found","status":"error"}</tool_response>',
    }
    convo = _conv(assistant_turn, error_response)

    calls = iter_tool_calls_from_conversations(convo["conversations"])

    assert len(calls) == 1
    failed_call = calls[0]
    assert failed_call.success is False
    assert "command not found" in failed_call.error
def test_orphan_tool_response_not_silently_dropped() -> None:
    """A tool response with no preceding call still shows up as a tool message."""
    orphan_response = {
        "from": "tool",
        "value": '<tool_response>{"stdout":"nothing to pair with"}</tool_response>',
    }

    transcript = parse_conversation(_conv(orphan_response))

    # No calls are produced, but a tool-role message surfaces the output.
    assert transcript.tool_call_sequence == []
    tool_messages = [msg for msg in transcript.messages if msg.role == "tool"]
    assert tool_messages
    assert "nothing to pair" in tool_messages[0].tool_result_content
def test_parser_output_annotates_with_canonical_families() -> None:
    """Parsed calls can be classified: the edit-tool call gets the "edit" family."""
    edit_payload = (
        '<tool_call>{"name":"str_replace_based_edit_tool",'
        '"arguments":{"path":"main.py","old":"a","new":"b"}}</tool_call>'
    )
    transcript = parse_conversation(_conv({"from": "assistant", "value": edit_payload}))

    # Run the trajectory classifier over the parsed transcript; every call
    # should come back with a family tag.
    annotated = annotate_transcript_tool_calls(transcript)
    families = [call.family for call in annotated.tool_call_sequence]

    assert all(f for f in families), f"expected every call to get a family tag, got {families}"
    assert families == ["edit"]
def test_parse_chat_messages_pairs_tool_results() -> None:
    """Chat messages with `tool_calls` and a `tool` reply parse into one paired call."""
    chat_messages = [
        {"role": "user", "content": "List files"},
        {
            "role": "assistant",
            "content": "I'll inspect.",
            "tool_calls": [
                {
                    "id": "call-1",
                    "function": {
                        "name": "terminal",
                        "arguments": "{\"command\":\"ls\"}",
                    },
                }
            ],
        },
        {"role": "tool", "tool_call_id": "call-1", "content": "main.py"},
        {"role": "assistant", "content": "Found main.py"},
    ]

    transcript = parse_chat_messages(chat_messages)

    calls = transcript.tool_call_sequence
    assert len(calls) == 1
    terminal_call = calls[0]
    assert terminal_call.name == "terminal"
    assert terminal_call.input == {"command": "ls"}
    assert terminal_call.output == "main.py"
    assert transcript.assistant_messages[-1].text == "Found main.py"

View File

@ -4,6 +4,7 @@ from pathlib import Path
import pytest
import clawbench.tasks as tasks_module
from clawbench.client import GatewayConfig
from clawbench.environment import verify_completion
from clawbench.harness import BenchmarkHarness
@ -12,14 +13,8 @@ from clawbench.services import build_runtime_values, start_background_services,
from clawbench.tasks import load_all_tasks
from clawbench.trajectory import evaluate_trajectory
# The task set is moving to a private holdout; the public repo will ship a
# different task set soon. Until then, skip integration tests that need
# specific task ids when the tasks directory isn't present.
_TASKS_DIR = Path(__file__).resolve().parent.parent / "tasks"
pytestmark = pytest.mark.skipif(
not _TASKS_DIR.exists(),
reason="tasks/ directory not present (private holdout — public set TBD)",
)
PUBLIC_TASKS_DIR = Path(__file__).resolve().parent.parent / "tasks-public"
tasks_module.TASKS_DIR = PUBLIC_TASKS_DIR
class DummyClient:
@ -28,8 +23,13 @@ class DummyClient:
def _prepare_workspace(task_id: str, tmp_path: Path) -> tuple[Path, object]:
task = next(task for task in load_all_tasks() if task.id == task_id)
harness = BenchmarkHarness(gateway_config=GatewayConfig(), model="test-model", randomize_order=False)
task = next(task for task in load_all_tasks(tasks_dir=PUBLIC_TASKS_DIR) if task.id == task_id)
harness = BenchmarkHarness(
gateway_config=GatewayConfig(),
model="test-model",
randomize_order=False,
tasks_dir=PUBLIC_TASKS_DIR,
)
workspace = tmp_path / task_id
workspace.mkdir(parents=True, exist_ok=True)
harness._setup_workspace(task, workspace)
@ -57,50 +57,6 @@ async def test_python_completion_check_passes_after_fix(tmp_path: Path):
assert result.score == 1.0
@pytest.mark.asyncio
async def test_node_completion_check_passes_after_fix(tmp_path: Path):
    """Completion scores 1.0 once both Node sources hold the fixed code.

    The workspace for `t2-node-search-patch` is prepared, `src/render.js`
    and `src/search.js` are overwritten with corrected implementations, and
    the task's completion spec is verified against the result.
    """
    workspace, task = _prepare_workspace("t2-node-search-patch", tmp_path)
    src_dir = workspace / "src"

    # render.js also exports emptyNote() with a legitimately empty body; the
    # scoped fix only touches normalizeNote and must leave emptyNote alone.
    fixed_render = (
        "function normalizeNote(note) {\n"
        " return {\n"
        " title: note.title.trim(),\n"
        " body: note.body.trim(),\n"
        " };\n"
        "}\n\n"
        "function emptyNote() {\n"
        " return {\n"
        " title: \"\",\n"
        " body: \"\",\n"
        " };\n"
        "}\n\n"
        "module.exports = { normalizeNote, emptyNote };\n"
    )
    fixed_search = (
        "function filterNotes(notes, query) {\n"
        " const needle = query.trim().toLowerCase();\n"
        " return notes.filter((note) => note.title.toLowerCase().includes(needle) || note.body.toLowerCase().includes(needle));\n"
        "}\n\n"
        "module.exports = { filterNotes };\n"
    )
    (src_dir / "render.js").write_text(fixed_render, encoding="utf-8")
    (src_dir / "search.js").write_text(fixed_search, encoding="utf-8")

    runtime_values = build_runtime_values(workspace=workspace, repo_root=Path.cwd())
    outcome = await verify_completion(
        task.completion,
        workspace=workspace,
        client=DummyClient(),  # type: ignore[arg-type]
        session_key="",
        runtime_values=runtime_values,
    )
    assert outcome.score == 1.0
def _playwright_available() -> bool:
if not shutil.which("node"):
return False
@ -156,7 +112,10 @@ async def test_browser_completion_check_passes_after_fix(tmp_path: Path):
def test_memory_task_trajectory_requires_memory_tool():
task = next(task for task in load_all_tasks() if task.id == "t4-memory-recall-continuation")
task = next(
task for task in load_all_tasks(tasks_dir=PUBLIC_TASKS_DIR)
if task.id == "t4-memory-recall-continuation"
)
transcript = Transcript(
messages=[
TranscriptMessage(role="assistant", tool_calls=[ToolCall(name="exec", input={"command": "cat docs/release_notes.md"}, success=True)]),
@ -172,7 +131,10 @@ def test_memory_task_trajectory_requires_memory_tool():
def test_delegation_task_trajectory_requires_delegate_family():
task = next(task for task in load_all_tasks() if task.id == "t4-delegation-repair")
task = next(
task for task in load_all_tasks(tasks_dir=PUBLIC_TASKS_DIR)
if task.id == "t4-delegation-repair"
)
transcript = Transcript(
messages=[
TranscriptMessage(role="assistant", tool_calls=[ToolCall(name="exec", input={"command": "rg billing ."}, success=True)]),

View File

@ -0,0 +1,444 @@
"""Tests for `OpenClawAdapter` — exercised against a stub gateway.
This validates the adapter wiring (lifecycle + state-query resolution)
in isolation, before the harness is rewired through it. The stub
`GatewayClient` records every call and produces canned responses so
the adapter's branches are covered end-to-end without a real gateway.
"""
from __future__ import annotations
import asyncio
from pathlib import Path
from typing import Any
import pytest
from clawbench.adapters import get_adapter
from clawbench.adapters.base import AdapterContext, StateQueryResult
from clawbench.adapters.openclaw import OpenClawAdapter, OpenClawAdapterConfig
from clawbench.canonical import (
AdapterCapability,
CanonicalTask,
StateQuery,
)
from clawbench.canonical.convert import from_task_definition
from clawbench.schemas import (
CompletionSpec,
ExecutionCheck,
FileState,
GatewayAssertion,
MemoryState,
SessionState,
SimulatedUser,
TaskDefinition,
TaskFamily,
TaskSetup,
Tier,
Transcript,
UserTurn,
)
# ---------------------------------------------------------------------------
# Stub GatewayClient
# ---------------------------------------------------------------------------
class _StubGateway:
    """In-memory stand-in for `GatewayClient` used by the adapter tests.

    Every lifecycle call (`create_agent`, `create_session`, `subscribe`,
    `send_and_wait`, `delete_*`) is appended to `.calls` as a
    `(name, arguments)` tuple. Verification RPCs are answered from
    `.rpc_responses`, keyed by RPC method name.
    """

    def __init__(self) -> None:
        # Chronological log of (call name, recorded arguments).
        self.calls: list[tuple[str, dict[str, Any]]] = []
        # Canned replies keyed by RPC method name.
        self.rpc_responses: dict[str, dict[str, Any]] = {}
        # Transcript returned by every `send_and_wait` call.
        self.send_transcript = Transcript()

    def _record(self, name: str, details: dict[str, Any]) -> None:
        """Append one `(name, details)` entry to the call log."""
        self.calls.append((name, details))

    async def __aenter__(self) -> "_StubGateway":
        self._record("__aenter__", {})
        return self

    async def __aexit__(self, *exc: object) -> None:
        self._record("__aexit__", {})

    async def create_agent(self, *, name: str, workspace: str) -> str:
        """Log the request and return the fixed id ``"agent-stub"``."""
        self._record("create_agent", {"name": name, "workspace": workspace})
        return "agent-stub"

    async def create_session(self, *, model: str, agent_id: str, label: str) -> str:
        """Log the request and return a session key derived from `label`."""
        details = {"model": model, "agent_id": agent_id, "label": label}
        self._record("create_session", details)
        return f"session-{label}"

    async def subscribe(self, session_key: str) -> None:
        self._record("subscribe", {"session_key": session_key})

    async def send_and_wait(
        self,
        session_key: str,
        message: str,
        *,
        timeout: float,
    ) -> Transcript:
        """Log the outgoing message and return the canned transcript."""
        details = {"session_key": session_key, "message": message, "timeout": timeout}
        self._record("send_and_wait", details)
        return self.send_transcript

    async def delete_session(self, session_key: str) -> None:
        self._record("delete_session", {"session_key": session_key})

    async def delete_agent(self, agent_id: str, *, delete_files: bool) -> None:
        details = {"agent_id": agent_id, "delete_files": delete_files}
        self._record("delete_agent", details)

    async def get_effective_tools(self, session_key: str) -> dict[str, Any]:
        """Return the ``tools.effective`` canned reply, or a bash/browser default."""
        self._record("get_effective_tools", {"session_key": session_key})
        default_tools = {"groups": [{"tools": [{"id": "bash"}, {"id": "browser"}]}]}
        return self.rpc_responses.get("tools.effective", default_tools)

    async def _rpc(self, method: str, params: dict[str, Any]) -> dict[str, Any]:
        """Serve a canned reply for `method`.

        Raises:
            RuntimeError: when no reply has been registered for `method`.
        """
        # Log a copy so later mutation of `params` by the caller cannot
        # rewrite the recorded history.
        self._record(f"_rpc:{method}", dict(params))
        if method not in self.rpc_responses:
            raise RuntimeError(f"stub gateway: no response set for {method}")
        return self.rpc_responses[method]
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
def _coding_task() -> CanonicalTask:
    """Build a canonical tier-1 coding task with one user turn.

    Completion requires `out.txt` to exist and one execution check
    (command ``true``).
    """
    single_turn_user = SimulatedUser(
        max_turns=1,
        turns=[UserTurn(message="Do the task.")],
    )
    completion = CompletionSpec(
        files=[FileState(path="out.txt", exists=True)],
        execution_checks=[ExecutionCheck(name="ok", command="true")],
    )
    definition = TaskDefinition(
        id="oa-adapter-test",
        name="OA adapter test",
        tier=Tier.TIER1,
        family=TaskFamily.CODING,
        surface="coding",
        setup=TaskSetup(),
        user=single_turn_user,
        completion=completion,
    )
    return from_task_definition(definition)
def _mixed_state_task() -> CanonicalTask:
    """Build a canonical tier-2 multi-tool task that touches gateway state.

    Setup carries one pre-check gateway assertion (`agents.list`,
    ``$.count`` equals 0); completion declares one memory expectation
    and one session expectation.
    """
    agent_count_is_zero = GatewayAssertion(
        method="agents.list",
        assert_path="$.count",
        assert_equals=0,
    )
    setup = TaskSetup(pre_check_gateway=[agent_count_is_zero])
    user = SimulatedUser(max_turns=1, turns=[UserTurn(message="go")])
    completion = CompletionSpec(
        memory=[MemoryState(key_pattern="stack", exists=True, value_contains=["React"])],
        session=SessionState(should_exist=True, model_should_be="opus"),
    )
    definition = TaskDefinition(
        id="oa-adapter-state-test",
        name="OA state test",
        tier=Tier.TIER2,
        family=TaskFamily.MULTI_TOOL,
        surface="tools",
        setup=setup,
        user=user,
        completion=completion,
    )
    return from_task_definition(definition)
def _make_adapter_and_gateway() -> tuple[OpenClawAdapter, _StubGateway]:
    """Return an `OpenClawAdapter` whose client factory yields a fresh stub gateway."""
    stub = _StubGateway()
    config = OpenClawAdapterConfig(model="test-model")
    adapter = OpenClawAdapter(config)
    # Swap the client factory so the adapter gets the stub back.
    adapter._client_factory = lambda: stub  # type: ignore[assignment]
    return adapter, stub
def _make_ctx(task: CanonicalTask, workspace: Path) -> AdapterContext:
    """Create an `AdapterContext` for `task` rooted at `workspace`.

    The context starts with no runtime values, run index 0, the
    ``"test-model"`` model name and an empty transcript.
    """
    context_fields = {
        "task": task,
        "workspace": workspace,
        "runtime_values": {},
        "run_index": 0,
        "model": "test-model",
        "transcript": Transcript(),
    }
    return AdapterContext(**context_fields)
# ---------------------------------------------------------------------------
# Registration
# ---------------------------------------------------------------------------
def test_openclaw_adapter_is_registered() -> None:
    """Looking up ``"openclaw"`` in the adapter registry yields `OpenClawAdapter`."""
    registered = get_adapter("openclaw")
    assert registered is OpenClawAdapter
def test_openclaw_declares_full_capability_set() -> None:
    """`OpenClawAdapter.capabilities` contains every capability listed below."""
    expected_capabilities = (
        AdapterCapability.FILES,
        AdapterCapability.EXECUTION,
        AdapterCapability.MEMORY,
        AdapterCapability.SESSION,
        AdapterCapability.CRON,
        AdapterCapability.GATEWAY_RPC,
        AdapterCapability.BROWSER,
    )
    declared = OpenClawAdapter.capabilities
    for capability in expected_capabilities:
        assert capability in declared
# ---------------------------------------------------------------------------
# Lifecycle
# ---------------------------------------------------------------------------
def test_setup_realizes_memory_seed_files(tmp_path: Path) -> None:
    """`setup()` writes seeded memory into the workspace.

    After setup, `MEMORY.md` mentions the seeded key exactly once,
    `memory/event_profile.md` carries the seeded value, and the gateway
    has seen a `create_agent` call.
    """
    seed_entry = {
        "key": "event profile",
        "value": "Vegetarian food, quiet rooms, and no stairs.",
    }
    definition = TaskDefinition(
        id="oa-seeded-memory",
        name="OA seeded memory",
        tier=Tier.TIER2,
        family=TaskFamily.MULTI_TOOL,
        surface="tools",
        setup=TaskSetup(memory_seed=[seed_entry]),
        user=SimulatedUser(max_turns=1, turns=[UserTurn(message="go")]),
    )
    task = from_task_definition(definition)
    adapter, gateway = _make_adapter_and_gateway()

    async def _run_setup() -> None:
        async with adapter:
            ctx = _make_ctx(task, tmp_path)
            await adapter.setup(ctx)

    asyncio.run(_run_setup())

    memory_index = (tmp_path / "MEMORY.md").read_text(encoding="utf-8")
    seeded_note = (tmp_path / "memory" / "event_profile.md").read_text(encoding="utf-8")
    recorded_names = [entry[0] for entry in gateway.calls]

    assert memory_index.count("event profile") == 1
    assert "Vegetarian food" in seeded_note
    assert "create_agent" in recorded_names
def test_run_phase_creates_session_subscribes_and_drives_simulator(tmp_path: Path) -> None:
    """Drive setup, run_phase and teardown, then inspect the gateway traffic.

    Only the presence of each lifecycle call is asserted; the relative
    order of the calls is not checked.
    """
    task = _coding_task()
    adapter, gateway = _make_adapter_and_gateway()

    async def _lifecycle() -> None:
        async with adapter:
            ctx = _make_ctx(task, tmp_path)
            await adapter.setup(ctx)
            phase_result = await adapter.run_phase(task.phases[0], ctx)
            assert phase_result.error is None
            await adapter.teardown(ctx)

    asyncio.run(_lifecycle())

    seen = [name for name, _ in gateway.calls]
    # Each lifecycle call must have reached the gateway at least once.
    for expected_call in (
        "create_agent",
        "create_session",
        "subscribe",
        "send_and_wait",
        "delete_session",
        "delete_agent",
    ):
        assert expected_call in seen

    # The first send_and_wait call must carry the rendered user turn text.
    first_send = next(args for name, args in gateway.calls if name == "send_and_wait")
    assert first_send["message"] == "Do the task."
def test_run_phase_fails_fast_without_setup(tmp_path: Path) -> None:
    """`run_phase()` without a prior `setup()` reports an error result.

    The result must be flagged as not completed normally and its error
    text must mention ``agent_id``.
    """
    task = _coding_task()
    adapter, _unused_gateway = _make_adapter_and_gateway()

    async def _run_without_setup() -> None:
        async with adapter:
            ctx = _make_ctx(task, tmp_path)
            # setup() is intentionally skipped here.
            outcome = await adapter.run_phase(task.phases[0], ctx)
            assert outcome.completed_normally is False
            assert outcome.error and "agent_id" in outcome.error

    asyncio.run(_run_without_setup())
# ---------------------------------------------------------------------------
# State queries
# ---------------------------------------------------------------------------
def test_memory_query_uses_memory_search_primary_path(tmp_path: Path) -> None:
    """A memory query passes when the stubbed `memory.search` reply matches.

    The canned reply contains an entry mentioning "React"; the result
    must be OK with detail ``"OK"``.
    """
    task = _mixed_state_task()
    adapter, gateway = _make_adapter_and_gateway()
    canned_entries = [{"value": "stack = React, Node, Postgres"}]
    gateway.rpc_responses["memory.search"] = {"payload": {"entries": canned_entries}}

    memory_query = StateQuery(
        kind="memory",
        predicate="exists",
        selector={"key_pattern": "stack"},
        expected={"value_contains": ["React"]},
        required_capability=AdapterCapability.MEMORY,
    )

    async def _verify() -> StateQueryResult:
        async with adapter:
            ctx = _make_ctx(task, tmp_path)
            await adapter.setup(ctx)
            return await adapter.verify_state_query(memory_query, ctx)

    outcome = asyncio.run(_verify())
    assert outcome.ok is True
    assert outcome.detail == "OK"
def test_memory_query_falls_back_to_workspace_on_rpc_failure(tmp_path: Path) -> None:
    """A memory query still passes when `memory.search` has no canned reply.

    The stub raises for unregistered RPC methods, so the query can only
    succeed through the `MEMORY.md` file seeded in the workspace.
    """
    task = _mixed_state_task()
    adapter, gateway = _make_adapter_and_gateway()

    # Deliberately leave "memory.search" unregistered on the stub, and
    # give the workspace a MEMORY.md that contains the expected value.
    workspace_memory = tmp_path / "MEMORY.md"
    workspace_memory.write_text("stack: React, Node, Postgres", encoding="utf-8")

    memory_query = StateQuery(
        kind="memory",
        predicate="exists",
        selector={"key_pattern": "stack"},
        expected={"value_contains": ["React"]},
        required_capability=AdapterCapability.MEMORY,
    )

    async def _verify() -> StateQueryResult:
        async with adapter:
            ctx = _make_ctx(task, tmp_path)
            await adapter.setup(ctx)
            return await adapter.verify_state_query(memory_query, ctx)

    outcome = asyncio.run(_verify())
    assert outcome.ok is True
def test_session_query_uses_sessions_resolve(tmp_path: Path) -> None:
    """A session query passes against a stubbed `sessions.resolve` reply.

    The canned reply reports model ``"claude-opus-4"`` while the query
    expects ``"opus"``.
    """
    task = _mixed_state_task()
    adapter, gateway = _make_adapter_and_gateway()
    gateway.rpc_responses["sessions.resolve"] = {"payload": {"model": "claude-opus-4"}}

    session_query = StateQuery(
        kind="session",
        predicate="exists",
        selector={},
        expected={"model": "opus"},
        required_capability=AdapterCapability.SESSION,
    )

    async def _verify() -> StateQueryResult:
        async with adapter:
            ctx = _make_ctx(task, tmp_path)
            await adapter.setup(ctx)
            # No phase is run in this test, so set the session key by hand.
            ctx.adapter_state["last_session_key"] = "some-session"
            return await adapter.verify_state_query(session_query, ctx)

    outcome = asyncio.run(_verify())
    assert outcome.ok is True
def test_gateway_query_resolves_json_path(tmp_path: Path) -> None:
    """A custom gateway query on ``$.count`` passes against a canned `memory.list` reply of 3."""
    task = _mixed_state_task()
    adapter, gateway = _make_adapter_and_gateway()
    gateway.rpc_responses["memory.list"] = {"payload": {"count": 3}}

    rpc_selector = {"method": "memory.list", "params": {}, "assert_path": "$.count"}
    gateway_query = StateQuery(
        kind="custom",
        predicate="equals",
        selector=rpc_selector,
        expected={"equals": 3, "exists": True},
        required_capability=AdapterCapability.GATEWAY_RPC,
    )

    async def _verify() -> StateQueryResult:
        async with adapter:
            ctx = _make_ctx(task, tmp_path)
            await adapter.setup(ctx)
            return await adapter.verify_state_query(gateway_query, ctx)

    outcome = asyncio.run(_verify())
    assert outcome.ok is True
def test_cron_query_returns_false_when_no_jobs(tmp_path: Path) -> None:
    """A cron `exists` query fails when the stubbed `cron.list` reply has no jobs."""
    task = _mixed_state_task()
    adapter, gateway = _make_adapter_and_gateway()
    empty_job_list = {"payload": {"jobs": []}}
    gateway.rpc_responses["cron.list"] = empty_job_list

    cron_query = StateQuery(
        kind="cron",
        predicate="exists",
        selector={"description_contains": "daily"},
        expected={},
        required_capability=AdapterCapability.CRON,
    )

    async def _verify() -> StateQueryResult:
        async with adapter:
            ctx = _make_ctx(task, tmp_path)
            await adapter.setup(ctx)
            return await adapter.verify_state_query(cron_query, ctx)

    outcome = asyncio.run(_verify())
    assert outcome.ok is False
def test_pre_run_queries_evaluated_during_setup(tmp_path: Path) -> None:
    """`setup()` records a pre-run failure when a gateway pre-check does not hold.

    The task expects `agents.list` to report a count of 0; the stub
    reports 99, so `pre_run_failures` in the adapter state must be
    non-empty after setup.
    """
    task = _mixed_state_task()
    adapter, gateway = _make_adapter_and_gateway()
    # A count that contradicts the task's pre-check (which expects 0).
    gateway.rpc_responses["agents.list"] = {"payload": {"count": 99}}

    async def _collect_failures() -> list[str]:
        async with adapter:
            ctx = _make_ctx(task, tmp_path)
            await adapter.setup(ctx)
            return ctx.adapter_state.get("pre_run_failures", [])

    recorded_failures = asyncio.run(_collect_failures())
    assert recorded_failures, "pre-run gateway assertion should have failed"
# ---------------------------------------------------------------------------
# Requires-context guard
# ---------------------------------------------------------------------------
def test_client_accessor_errors_when_not_in_context() -> None:
    """Reading `adapter.client` outside `async with adapter:` raises `RuntimeError`."""
    adapter, _unused_gateway = _make_adapter_and_gateway()
    expect_runtime_error = pytest.raises(RuntimeError)
    with expect_runtime_error:
        _ = adapter.client

View File

@ -0,0 +1,63 @@
import re
from pathlib import Path
import yaml
REPO_ROOT = Path(__file__).resolve().parent.parent
TASK_ID_RE = re.compile(r"\bt[1-6]-[a-z0-9-]+")
def _public_task_ids() -> set[str]:
    """Return the ids of every task listed in `tasks-public/MANIFEST.yaml`."""
    manifest_path = REPO_ROOT / "tasks-public" / "MANIFEST.yaml"
    manifest = yaml.safe_load(manifest_path.read_text(encoding="utf-8"))
    ids: set[str] = set()
    for entry in manifest["tasks"]:
        ids.add(entry["id"])
    return ids
def _mentioned_task_ids(path: Path) -> set[str]:
    """Return every distinct `TASK_ID_RE` match found in the file at `path`.

    The file is decoded as UTF-8 and undecodable bytes are dropped.
    """
    text = path.read_text(encoding="utf-8", errors="ignore")
    matches = TASK_ID_RE.findall(text)
    return set(matches)
def test_public_docs_only_reference_public_task_ids():
    """Public docs must not mention task ids that are missing from the public manifest.

    On failure the assertion shows a mapping of repo-relative path to
    the sorted list of offending ids.
    """
    public_ids = _public_task_ids()
    public_dir = REPO_ROOT / "tasks-public"
    docs = [
        REPO_ROOT / "README.md",
        REPO_ROOT / "SPACE_README.md",
        public_dir / "README.md",
        public_dir / "MANIFEST.yaml",
    ]
    # Keep only the documents that mention at least one non-public id.
    leaked: dict[str, list[str]] = {
        str(path.relative_to(REPO_ROOT)): private_mentions
        for path in docs
        if (private_mentions := sorted(_mentioned_task_ids(path) - public_ids))
    }
    assert leaked == {}
def test_reusable_scripts_do_not_embed_private_task_ids():
    """Top-level `.py` / `.sh` files in `scripts/` must only mention public task ids."""
    public_ids = _public_task_ids()
    script_suffixes = {".py", ".sh"}
    candidates = [
        path
        for path in sorted((REPO_ROOT / "scripts").glob("*"))
        if path.is_file() and path.suffix in script_suffixes
    ]

    leaked: dict[str, list[str]] = {}
    for script in candidates:
        private_mentions = sorted(_mentioned_task_ids(script) - public_ids)
        if not private_mentions:
            continue
        leaked[str(script.relative_to(REPO_ROOT))] = private_mentions
    assert leaked == {}
def test_public_docs_match_manifest_task_count():
    """The READMEs must advertise the same task count as the public manifest.

    The manifest's `task_count` field has to equal the number of listed
    tasks and the pinned Core v1 size (19). Every README assertion is
    derived from `task_count`, so the pinned size lives in one place.
    """
    manifest_path = REPO_ROOT / "tasks-public" / "MANIFEST.yaml"
    manifest = yaml.safe_load(manifest_path.read_text(encoding="utf-8"))
    task_count = int(manifest["task_count"])
    # Single place where the Core v1 size is pinned.
    assert task_count == len(manifest["tasks"]) == 19

    readme = (REPO_ROOT / "README.md").read_text(encoding="utf-8")
    space_readme = (REPO_ROOT / "SPACE_README.md").read_text(encoding="utf-8")

    assert f"Core v1: {task_count} tasks" in readme
    assert f"tasks : {task_count}" in space_readme

    # NOTE(review): the +8 / +1 offsets presumably target stale counts from
    # earlier documentation — confirm before changing them.
    assert f"Core v1: {task_count + 8} tasks" not in readme
    assert f"tasks : {task_count + 1}" not in space_readme

View File

@ -1,37 +1,47 @@
from pathlib import Path
import clawbench.tasks as tasks_module
from clawbench.client import GatewayConfig
from clawbench.harness import BenchmarkHarness
from clawbench.tasks import load_all_tasks
PUBLIC_TASKS_DIR = Path(__file__).resolve().parent.parent / "tasks-public"
tasks_module.TASKS_DIR = PUBLIC_TASKS_DIR
def test_load_all_tasks_returns_full_corpus():
tasks = load_all_tasks()
# Public Core release has 19 tasks; full private dev set has 40.
# Either must cover tiers 1-5 and carry capability/subset/judge metadata.
assert len(tasks) >= 19
tasks = load_all_tasks(tasks_dir=PUBLIC_TASKS_DIR)
assert len(tasks) == 19
assert {task.tier.value for task in tasks} == {"tier1", "tier2", "tier3", "tier4", "tier5"}
assert any(task.capabilities for task in tasks)
assert any(task.subsets for task in tasks)
assert any(task.scenario is not None for task in tasks)
assert any("ambiguous" in [variant.value for variant in task.prompt_variants] for task in tasks)
assert sum(1 for task in tasks if task.judge is not None) >= 6
assert sum(1 for task in tasks if task.judge is not None) >= 5
assert all(task.pool.value == "public_dev" for task in tasks)
assert all(task.setup.asset_packs for task in tasks)
def test_public_tasks_match_core_v1_manifest_shape():
    """The public task set has the expected size, anchor ids and tier/family mix."""
    tasks = load_all_tasks(tasks_dir=PUBLIC_TASKS_DIR)
    task_ids = {task.id for task in tasks}
    assert len(tasks) == 19

    for anchor_id in ("t1-bugfix-discount", "t5-hallucination-resistant-evidence"):
        assert anchor_id in task_ids

    tier4_tasks = [task for task in tasks if task.tier.value == "tier4"]
    browser_tasks = [task for task in tasks if task.family.value == "browser"]
    assert len(tier4_tasks) == 5
    assert len(browser_tasks) == 2

    # At least one task must carry the memory_continuation capability.
    assert any(
        "memory_continuation" in [cap.value for cap in task.capabilities]
        for task in tasks
    )
def test_load_all_tasks_supports_pool_subset_and_capability_filters():
hard_tasks = load_all_tasks(subsets=["hard"])
consensus_tasks = load_all_tasks(subsets=["consensus"])
bugfix_tasks = load_all_tasks(capabilities=["bugfix"])
coding_scene_tasks = load_all_tasks(scenario="coding_dev_assist")
ambiguous_tasks = load_all_tasks(prompt_variant="ambiguous")
bugfix_tasks = load_all_tasks(tasks_dir=PUBLIC_TASKS_DIR, capabilities=["bugfix"])
coding_scene_tasks = load_all_tasks(tasks_dir=PUBLIC_TASKS_DIR, scenario="coding_dev_assist")
ambiguous_tasks = load_all_tasks(tasks_dir=PUBLIC_TASKS_DIR, prompt_variant="ambiguous")
assert hard_tasks
assert consensus_tasks
assert bugfix_tasks
assert coding_scene_tasks
assert ambiguous_tasks
assert all("hard" in [subset.value for subset in task.subsets] for task in hard_tasks)
assert all("consensus" in [subset.value for subset in task.subsets] for task in consensus_tasks)
assert all("bugfix" in [capability.value for capability in task.capabilities] for task in bugfix_tasks)
assert all(task.scenario and task.scenario.value == "coding_dev_assist" for task in coding_scene_tasks)
assert all("ambiguous" in [variant.value for variant in task.prompt_variants] for task in ambiguous_tasks)
@ -42,8 +52,16 @@ def test_workspace_setup_preserves_nested_asset_paths(tmp_path: Path):
# passes whether the dev has private tasks/ or only the public release.
# t4-browser-research-and-code has both flat files (report_client.py,
# serve_docs.py) and nested dirs (docs/, tests/).
task = next(task for task in load_all_tasks() if task.id == "t4-browser-research-and-code")
harness = BenchmarkHarness(gateway_config=GatewayConfig(), model="test-model", randomize_order=False)
task = next(
task for task in load_all_tasks(tasks_dir=PUBLIC_TASKS_DIR)
if task.id == "t4-browser-research-and-code"
)
harness = BenchmarkHarness(
gateway_config=GatewayConfig(),
model="test-model",
randomize_order=False,
tasks_dir=PUBLIC_TASKS_DIR,
)
workspace = tmp_path / "workspace"
workspace.mkdir()
@ -57,7 +75,7 @@ def test_workspace_setup_preserves_nested_asset_paths(tmp_path: Path):
def test_selected_tasks_include_judge_rubrics():
# All assertions use task IDs from the Core v1 public set so CI
# (without the private tasks/) reproduces locally.
tasks = {task.id: task for task in load_all_tasks()}
tasks = {task.id: task for task in load_all_tasks(tasks_dir=PUBLIC_TASKS_DIR)}
assert tasks["t1-bugfix-discount"].judge is not None
assert tasks["t3-feature-export"].judge is not None

View File

@ -1,38 +1,12 @@
import pytest
from clawbench.schemas import BenchmarkResult
from clawbench.upload import _json_column, _submission_shard_name, upload_result
from clawbench.upload import upload_result
def test_submission_shard_name_sanitizes_ids():
assert _submission_shard_name("abc/def:ghi") == "abc-def-ghi.parquet"
assert _submission_shard_name("...") == "submission.parquet"
@pytest.mark.asyncio
async def test_upload_result_writes_append_only_submission_shard(monkeypatch):
uploads = []
ensured = []
uploaded_rows = []
class FakeApi:
def __init__(self, token: str) -> None:
self.token = token
def upload_file(self, *, path_or_fileobj: str, path_in_repo: str, repo_id: str, repo_type: str) -> None:
import pandas as pd
uploads.append((path_or_fileobj, path_in_repo, repo_id, repo_type))
uploaded_rows.extend(pd.read_parquet(path_or_fileobj).to_dict(orient="records"))
monkeypatch.setattr("huggingface_hub.HfApi", FakeApi)
monkeypatch.setattr(
"clawbench.upload.ensure_dataset_repo",
lambda api, repo_id: ensured.append((api.token, repo_id)),
)
result = BenchmarkResult(
submission_id="run/123",
def _result(submission_id: str = "run/123") -> BenchmarkResult:
return BenchmarkResult(
submission_id=submission_id,
model="anthropic/claude-sonnet-4-6",
provider="anthropic",
timestamp="2026-04-28T00:00:00+00:00",
@ -45,19 +19,58 @@ async def test_upload_result_writes_append_only_submission_shard(monkeypatch):
overall_pass_hat_k=1.0,
)
url = await upload_result(result, dataset_repo="openclaw/clawbench-results", token="hf_test")
@pytest.mark.asyncio
async def test_upload_result_requires_token(monkeypatch):
    """`upload_result` raises `RuntimeError` when no token argument or `HF_TOKEN` is set."""
    # Remove any ambient HF_TOKEN so the environment cannot supply one.
    monkeypatch.delenv("HF_TOKEN", raising=False)
    expect_missing_token = pytest.raises(RuntimeError, match="HF_TOKEN not set")
    with expect_missing_token:
        await upload_result(_result(), dataset_repo="openclaw/clawbench-results")
@pytest.mark.asyncio
async def test_upload_result_appends_and_deduplicates_submissions(monkeypatch):
ensured = []
pushed = []
class FakeApi:
def __init__(self, token: str) -> None:
self.token = token
class FakeDataset:
def __init__(self, rows):
self.rows = rows
@classmethod
def from_list(cls, rows):
return cls(rows)
def push_to_hub(self, repo_id: str, *, split: str, token: str) -> None:
pushed.append((repo_id, split, token, self.rows))
monkeypatch.setattr("huggingface_hub.HfApi", FakeApi)
monkeypatch.setattr("datasets.Dataset", FakeDataset)
monkeypatch.setattr(
"datasets.load_dataset",
lambda *args, **kwargs: [
{"submission_id": "old-run", "model": "old-model"},
{"submission_id": "run/123", "model": "stale-model"},
],
)
monkeypatch.setattr(
"clawbench.upload.ensure_dataset_repo",
lambda api, repo_id: ensured.append((api.token, repo_id)),
)
url = await upload_result(_result(), dataset_repo="openclaw/clawbench-results", token="hf_test")
assert url == "https://huggingface.co/datasets/openclaw/clawbench-results"
assert ensured == [("hf_test", "openclaw/clawbench-results")]
assert len(uploads) == 1
local_path, path_in_repo, repo_id, repo_type = uploads[0]
assert local_path.endswith("run-123.parquet")
assert path_in_repo == "data/submissions/run-123.parquet"
assert len(pushed) == 1
repo_id, split, token, rows = pushed[0]
assert repo_id == "openclaw/clawbench-results"
assert repo_type == "dataset"
assert uploaded_rows[0]["overall_delivery_outcome_counts"] == "{}"
assert uploaded_rows[0]["task_results"] == "[]"
def test_json_column_is_stable_and_compact():
assert _json_column({"b": 2, "a": 1}) == '{"a":1,"b":2}'
assert split == "submissions"
assert token == "hf_test"
assert [row["submission_id"] for row in rows] == ["old-run", "run/123"]
assert rows[-1]["model"] == "anthropic/claude-sonnet-4-6"

View File

@ -45,7 +45,12 @@ def test_configure_browser_runtime_sets_benchmark_safe_openclaw_config(monkeypat
assert json.loads(config_path.read_text(encoding="utf-8")) == {
"agents": {"defaults": {"skipBootstrap": True}},
"browser": {"headless": True, "noSandbox": True},
"tools": {"exec": {"host": "gateway", "security": "full", "ask": "off"}},
"approvals": {"exec": {"enabled": False}},
}
approvals = json.loads((state_dir / "exec-approvals.json").read_text(encoding="utf-8"))
assert approvals["defaults"] == {"security": "full", "ask": "off", "askFallback": "full"}
assert approvals["agents"]["*"] == {"security": "full", "ask": "off", "askFallback": "full"}
def test_configure_browser_runtime_pins_subagents_to_active_model(monkeypatch):
@ -72,6 +77,8 @@ def test_configure_browser_runtime_pins_subagents_to_active_model(monkeypatch):
}
},
"browser": {"headless": True, "noSandbox": True},
"tools": {"exec": {"host": "gateway", "security": "full", "ask": "off"}},
"approvals": {"exec": {"enabled": False}},
}
@ -169,6 +176,11 @@ def test_materialize_lane_runtime_spaces_ports_and_copies_auth(tmp_path: Path, m
assert lane1.port == GATEWAY_PORT + GATEWAY_PORT_SPACING
assert lane1.state_dir is not None
assert (lane1.state_dir / "agents" / "main" / "agent" / "auth-profiles.json").exists()
lane_cfg = json.loads((lane1.state_dir / "openclaw.json").read_text(encoding="utf-8"))
assert lane_cfg["tools"]["exec"] == {"host": "gateway", "security": "full", "ask": "off"}
assert lane_cfg["approvals"]["exec"] == {"enabled": False}
lane_approvals = json.loads((lane1.state_dir / "exec-approvals.json").read_text(encoding="utf-8"))
assert lane_approvals["defaults"] == {"security": "full", "ask": "off", "askFallback": "full"}
def test_job_progress_tracker_drops_finished_parallel_lane():