Compare commits
3 Commits
main
...
codex/stab
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
abf3500f69 | ||
|
|
cebd1c8026 | ||
|
|
7eb854710f |
19
.dockerignore
Normal file
19
.dockerignore
Normal file
@ -0,0 +1,19 @@
|
||||
.git
|
||||
.venv
|
||||
__pycache__
|
||||
.pytest_cache
|
||||
.mypy_cache
|
||||
.ruff_cache
|
||||
.DS_Store
|
||||
|
||||
data
|
||||
results
|
||||
.clawbench
|
||||
|
||||
.tmp/*
|
||||
!.tmp/hermes-agent
|
||||
!.tmp/hermes-agent/**
|
||||
|
||||
**/node_modules
|
||||
**/__pycache__
|
||||
**/.pytest_cache
|
||||
@ -104,11 +104,24 @@ Each task will declare:
|
||||
- `family`
|
||||
- `surface`
|
||||
- `capabilities`
|
||||
- `category`
|
||||
- `domain`
|
||||
- `functionality`
|
||||
- `trace_distribution`
|
||||
- `tool_surface`
|
||||
- `risk_tags`
|
||||
- `pool`
|
||||
- `variant_group`
|
||||
- `official`
|
||||
- `semantic_judge`
|
||||
|
||||
The added dimensions are flat, orthogonal leaderboard axes. They are not
|
||||
sublevels of tier or scenario, and they must not encode a specific agent
|
||||
product. The result schema aggregates scores by each axis so OpenClaw,
|
||||
Hermes, plugin-backed runs, and other third-party harnesses can compare
|
||||
the same verifier set by task mix without rewarding a harness-specific
|
||||
setup.
|
||||
|
||||
Recommended capability tags:
|
||||
|
||||
- `bugfix`
|
||||
|
||||
14
Dockerfile
14
Dockerfile
@ -1,8 +1,8 @@
|
||||
# ClawBench HF Docker Space
|
||||
# Layer the benchmark harness on top of a pinned OpenClaw image.
|
||||
# Layer the benchmark harness on top of the official OpenClaw image.
|
||||
|
||||
ARG OPENCLAW_IMAGE=ghcr.io/openclaw/openclaw@sha256:2e32f4f2e4f653f12d5dc6e5c93cc71e60f49d1dfaf061b18e53c3e61a38fb48
|
||||
FROM ${OPENCLAW_IMAGE}
|
||||
ARG BASE=ghcr.io/openclaw/openclaw:latest
|
||||
FROM ${BASE}
|
||||
|
||||
USER root
|
||||
|
||||
@ -13,8 +13,10 @@ RUN apt-get update && \
|
||||
|
||||
RUN ln -s /app /openclaw
|
||||
|
||||
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
|
||||
RUN npx -y playwright@1.59.1 install --with-deps chromium && \
|
||||
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright \
|
||||
NODE_PATH=/usr/local/lib/node_modules
|
||||
RUN npm install -g playwright@1.59.1 && \
|
||||
playwright install --with-deps chromium && \
|
||||
CHROME_PATH="$(find /ms-playwright -path '*/chrome' -type f | sort | head -n 1)" && \
|
||||
test -x "$CHROME_PATH" && \
|
||||
ln -sf "$CHROME_PATH" /usr/bin/chromium
|
||||
@ -38,7 +40,7 @@ RUN mkdir -p \
|
||||
/home/node/.openclaw/agents/dev \
|
||||
/home/node/.openclaw/agents/main/agent && \
|
||||
chown -R node:node /data /home/node/.openclaw && \
|
||||
chmod -R 775 /data /home/node/.openclaw
|
||||
chmod -R 777 /data /home/node/.openclaw
|
||||
|
||||
USER node
|
||||
|
||||
|
||||
53
Dockerfile.clawbench-426-agent-hotfix
Normal file
53
Dockerfile.clawbench-426-agent-hotfix
Normal file
@ -0,0 +1,53 @@
|
||||
# ClawBench HF Docker Space with OpenClaw 2026.4.26 agent-create race hotfix.
|
||||
|
||||
ARG BASE=openclaw-426-agent-hotfix:latest
|
||||
FROM ${BASE}
|
||||
|
||||
USER root
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
RUN apt-get update && \
|
||||
apt-get install -y python3-pip python-is-python3 && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN ln -s /app /openclaw
|
||||
|
||||
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright \
|
||||
NODE_PATH=/usr/local/lib/node_modules
|
||||
RUN npm install -g playwright@1.59.1 && \
|
||||
playwright install --with-deps chromium && \
|
||||
CHROME_PATH="$(find /ms-playwright -path '*/chrome' -type f | sort | head -n 1)" && \
|
||||
test -x "$CHROME_PATH" && \
|
||||
ln -sf "$CHROME_PATH" /usr/bin/chromium
|
||||
|
||||
ENV HOME=/home/node PATH=/home/node/.local/bin:$PATH
|
||||
WORKDIR /home/node/app
|
||||
|
||||
COPY --chown=node:node pyproject.toml README.md CLAWBENCH_V0_4_SPEC.md PARTNER_TRACE_SPEC.md ./
|
||||
COPY --chown=node:node clawbench/ clawbench/
|
||||
COPY --chown=node:node scripts/ scripts/
|
||||
COPY --chown=node:node profiles/ profiles/
|
||||
COPY --chown=node:node tasks/ tasks/
|
||||
COPY --chown=node:node tasks-public/ tasks-public/
|
||||
COPY --chown=node:node tasks-domain/ tasks-domain/
|
||||
COPY --chown=node:node baselines/ baselines/
|
||||
COPY --chown=node:node app.py .
|
||||
|
||||
RUN python3 -m pip install --break-system-packages --no-cache-dir .
|
||||
|
||||
RUN mkdir -p \
|
||||
/data/results \
|
||||
/data/queue \
|
||||
/home/node/.openclaw/agents/dev \
|
||||
/home/node/.openclaw/agents/main/agent && \
|
||||
chown -R node:node /data /home/node/.openclaw && \
|
||||
chmod -R 777 /data /home/node/.openclaw
|
||||
|
||||
USER node
|
||||
|
||||
ENV GATEWAY_PORT=18789
|
||||
ENV OPENCLAW_HOME=/home/node
|
||||
ENV OPENCLAW_STATE_DIR=/home/node/.openclaw
|
||||
|
||||
EXPOSE 7860
|
||||
CMD ["python", "app.py"]
|
||||
113
Dockerfile.gbrain
Normal file
113
Dockerfile.gbrain
Normal file
@ -0,0 +1,113 @@
|
||||
# ClawBench + latest upstream GBrain for OpenClaw harness comparisons.
|
||||
#
|
||||
# Secrets are not baked into this image. Runtime API keys are read from the
|
||||
# mounted OpenClaw config/env by scripts/setup_gbrain_runtime.sh.
|
||||
|
||||
ARG BASE=ghcr.io/openclaw/openclaw:latest
|
||||
FROM ${BASE}
|
||||
|
||||
USER root
|
||||
|
||||
ARG GBRAIN_REPO=https://github.com/garrytan/gbrain.git
|
||||
ARG GBRAIN_REF=be8fffad71ea36bc51c2d58564762b0fe271e8f4
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
ca-certificates curl git jq python3-pip python-is-python3 unzip && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN ln -s /app /openclaw
|
||||
|
||||
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright \
|
||||
NODE_PATH=/usr/local/lib/node_modules
|
||||
RUN npm install -g playwright@1.59.1 && \
|
||||
playwright install --with-deps chromium && \
|
||||
CHROME_PATH="$(find /ms-playwright -path '*/chrome' -type f | sort | head -n 1)" && \
|
||||
test -x "$CHROME_PATH" && \
|
||||
ln -sf "$CHROME_PATH" /usr/bin/chromium
|
||||
|
||||
ENV BUN_INSTALL=/usr/local/bun
|
||||
RUN mkdir -p /usr/local/bun && \
|
||||
curl -fsSL https://bun.sh/install | bash
|
||||
RUN git clone "${GBRAIN_REPO}" /opt/gbrain && \
|
||||
cd /opt/gbrain && \
|
||||
git checkout "${GBRAIN_REF}" && \
|
||||
/usr/local/bun/bin/bun install --frozen-lockfile
|
||||
|
||||
RUN mkdir -p /opt/gbrain/.codex-plugin /opt/gbrain/bin && \
|
||||
printf '%s\n' \
|
||||
'#!/usr/bin/env bash' \
|
||||
'set -euo pipefail' \
|
||||
'cd /opt/gbrain' \
|
||||
'exec /usr/local/bun/bin/bun run src/cli.ts "$@"' \
|
||||
> /opt/gbrain/bin/gbrain && \
|
||||
printf '%s\n' \
|
||||
'{' \
|
||||
' "id": "gbrain",' \
|
||||
' "name": "gbrain",' \
|
||||
' "description": "Personal knowledge brain with PGLite-backed CLI, skills, and MCP server",' \
|
||||
' "version": "0.22.6",' \
|
||||
' "skills": "skills",' \
|
||||
' "mcpServers": {' \
|
||||
' "gbrain": {' \
|
||||
' "command": "/opt/gbrain/bin/gbrain",' \
|
||||
' "args": ["serve"],' \
|
||||
' "cwd": "/opt/gbrain",' \
|
||||
' "connectionTimeoutMs": 120000,' \
|
||||
' "env": {' \
|
||||
' "PATH": "/opt/gbrain/bin:/usr/local/bun/bin:/usr/local/bin:/usr/bin:/bin"' \
|
||||
' }' \
|
||||
' }' \
|
||||
' },' \
|
||||
' "configSchema": {' \
|
||||
' "type": "object",' \
|
||||
' "additionalProperties": true,' \
|
||||
' "properties": {' \
|
||||
' "database_url": {"type": "string"},' \
|
||||
' "openai_api_key": {"type": "string"}' \
|
||||
' }' \
|
||||
' }' \
|
||||
'}' \
|
||||
> /opt/gbrain/.codex-plugin/plugin.json && \
|
||||
chmod +x /opt/gbrain/bin/gbrain && \
|
||||
ln -sf /opt/gbrain/bin/gbrain /usr/local/bin/gbrain && \
|
||||
ln -sf /usr/local/bun/bin/bun /usr/local/bin/bun && \
|
||||
chown -R node:node /opt/gbrain && \
|
||||
git config --system --add safe.directory /opt/gbrain
|
||||
|
||||
ENV PATH=/opt/gbrain/bin:/usr/local/bun/bin:/home/node/.local/bin:$PATH \
|
||||
HOME=/home/node \
|
||||
CLAWBENCH_ENABLE_GBRAIN=1 \
|
||||
CLAWBENCH_LANE_PREPARE_CMD=/home/node/app/scripts/setup_gbrain_runtime.sh \
|
||||
GBRAIN_ALLOW_SHELL_JOBS=1
|
||||
|
||||
WORKDIR /home/node/app
|
||||
|
||||
COPY --chown=node:node pyproject.toml README.md ./
|
||||
COPY --chown=node:node clawbench/ clawbench/
|
||||
COPY --chown=node:node tasks-public/ tasks-public/
|
||||
COPY --chown=node:node tasks-domain/ tasks-domain/
|
||||
COPY --chown=node:node baselines/ baselines/
|
||||
COPY --chown=node:node scripts/container_adapter_eval.sh scripts/container_lane_eval.sh scripts/setup_gbrain_runtime.sh scripts/
|
||||
COPY --chown=node:node app.py .
|
||||
|
||||
RUN chmod +x scripts/container_adapter_eval.sh scripts/container_lane_eval.sh scripts/setup_gbrain_runtime.sh && \
|
||||
python3 -m pip install --break-system-packages --no-cache-dir .
|
||||
|
||||
RUN mkdir -p \
|
||||
/data/results \
|
||||
/data/queue \
|
||||
/home/node/.openclaw/agents/dev \
|
||||
/home/node/.openclaw/agents/main/agent && \
|
||||
chown -R node:node /data /home/node/.openclaw && \
|
||||
chmod -R 777 /data /home/node/.openclaw
|
||||
|
||||
USER node
|
||||
|
||||
ENV GATEWAY_PORT=18789
|
||||
ENV OPENCLAW_HOME=/home/node
|
||||
ENV OPENCLAW_STATE_DIR=/home/node/.openclaw
|
||||
|
||||
EXPOSE 7860
|
||||
CMD ["python", "app.py"]
|
||||
@ -16,8 +16,10 @@ RUN apt-get update && \
|
||||
|
||||
RUN ln -s /app /openclaw
|
||||
|
||||
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
|
||||
RUN npx -y playwright@1.59.1 install --with-deps chromium && \
|
||||
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright \
|
||||
NODE_PATH=/usr/local/lib/node_modules
|
||||
RUN npm install -g playwright@1.59.1 && \
|
||||
playwright install --with-deps chromium && \
|
||||
CHROME_PATH="$(find /ms-playwright -path '*/chrome' -type f | sort | head -n 1)" && \
|
||||
test -x "$CHROME_PATH" && \
|
||||
ln -sf "$CHROME_PATH" /usr/bin/chromium
|
||||
|
||||
8
Dockerfile.openclaw-426-agent-hotfix
Normal file
8
Dockerfile.openclaw-426-agent-hotfix
Normal file
@ -0,0 +1,8 @@
|
||||
FROM ghcr.io/openclaw/openclaw:2026.4.26
|
||||
|
||||
USER root
|
||||
COPY patches/patch_openclaw_426_agent_create_queue.mjs /tmp/patch_openclaw_426_agent_create_queue.mjs
|
||||
RUN node /tmp/patch_openclaw_426_agent_create_queue.mjs && \
|
||||
rm /tmp/patch_openclaw_426_agent_create_queue.mjs
|
||||
|
||||
USER node
|
||||
@ -35,6 +35,7 @@ Each trace record should have this top-level structure:
|
||||
"plugins": [],
|
||||
"skills": [],
|
||||
"prompts": {},
|
||||
"task_metadata": {},
|
||||
"transcript": {
|
||||
"messages": []
|
||||
},
|
||||
@ -58,6 +59,7 @@ These fields should always be present:
|
||||
- `config`: effective runtime configuration for the run
|
||||
- `plugins`: plugins or tool bundles available to the agent, even if empty
|
||||
- `prompts.user`: the user task or user-visible request
|
||||
- `task_metadata`: benchmark task axes, when the trace corresponds to a ClawBench task
|
||||
- `transcript.messages`: ordered message list for the run
|
||||
|
||||
## Strongly Recommended Fields
|
||||
@ -75,7 +77,28 @@ These materially improve trace quality and downstream usefulness:
|
||||
|
||||
## Metadata We Want
|
||||
|
||||
### 1. Harness
|
||||
### 1. Task Metadata
|
||||
|
||||
When a trace maps to a benchmark task, include the same flat task axes
|
||||
used by ClawBench result aggregation. These axes are intentionally
|
||||
orthogonal and harness-neutral; do not nest them under an agent product
|
||||
or plugin stack.
|
||||
|
||||
Recommended fields:
|
||||
|
||||
```json
|
||||
{
|
||||
"task_id": "t4-browser-research-and-code",
|
||||
"category": "software_engineering",
|
||||
"domain": "devtools",
|
||||
"functionality": ["browser_research", "api_contract_extraction", "code_repair"],
|
||||
"trace_distribution": ["browser_heavy", "read_heavy", "edit_heavy", "execute_heavy"],
|
||||
"tool_surface": ["browser", "filesystem", "shell", "local_service"],
|
||||
"risk_tags": ["code_regression", "hallucination"]
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Harness
|
||||
|
||||
Use `harness` to describe the execution framework itself.
|
||||
|
||||
@ -95,7 +118,7 @@ Recommended fields:
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Model
|
||||
### 3. Model
|
||||
|
||||
Use `model` to identify the model under test.
|
||||
|
||||
@ -111,7 +134,7 @@ Recommended fields:
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Config
|
||||
### 4. Config
|
||||
|
||||
Use `config` for the effective runtime settings that could change behavior.
|
||||
|
||||
@ -134,7 +157,7 @@ Recommended fields:
|
||||
|
||||
If a field is unavailable, omit it rather than inventing a value.
|
||||
|
||||
### 4. Plugins
|
||||
### 5. Plugins
|
||||
|
||||
Use `plugins` for tools, plugin bundles, MCP servers, extensions, or other agent capabilities exposed by the harness.
|
||||
|
||||
@ -162,7 +185,7 @@ Recommended entry shape:
|
||||
}
|
||||
```
|
||||
|
||||
### 5. Skills
|
||||
### 6. Skills
|
||||
|
||||
Use `skills` for reusable instruction bundles, templates, internal playbooks, or any named capability layer available to the agent.
|
||||
|
||||
@ -186,7 +209,7 @@ Recommended entry shape:
|
||||
}
|
||||
```
|
||||
|
||||
### 6. Prompts
|
||||
### 7. Prompts
|
||||
|
||||
Use `prompts` for the prompt stack that shaped agent behavior.
|
||||
|
||||
@ -217,7 +240,7 @@ Example:
|
||||
}
|
||||
```
|
||||
|
||||
### 7. Transcript
|
||||
### 8. Transcript
|
||||
|
||||
`transcript.messages` is the core behavioral record.
|
||||
|
||||
|
||||
632
README.md
632
README.md
@ -13,569 +13,197 @@ license: mit
|
||||
|
||||
# ClawBench
|
||||
|
||||
**Rigorous agent evaluation. Signal-curated tasks. Dynamical-systems diagnostics.**
|
||||
**Trace-scored agent evaluation for OpenClaw.**
|
||||
|
||||
[Python](https://www.python.org/downloads/)
|
||||
[License](LICENSE)
|
||||
[Public tasks](tasks-public/)
|
||||
[Dynamical-systems diagnostics](#3-dynamical-systems-diagnostics-how-agents-fail-not-just-whether)
|
||||
[Results dataset](https://huggingface.co/datasets/openclaw/clawbench-results)
|
||||
|
||||
</div>
|
||||
|
||||
---
|
||||
|
||||
## What's new in Core v1 (2026-04-20)
|
||||
## What This Repo Contains
|
||||
|
||||
A reproducibility-first public release of the benchmark, informed by a full 8-model, 1,080-run sweep audit and five new methodology layers that most agent benchmarks simply don't have:
|
||||
ClawBench evaluates AI agents by running real local tasks, capturing the
|
||||
execution trace, and scoring both the final state and the process used to get
|
||||
there.
|
||||
|
||||
| Innovation | What it means | Why it matters |
|
||||
|---|---|---|
|
||||
| **Signal-curated task set** | 19 tasks selected from 40-task dev pool by greedy SNR-preserving elimination | Drops tasks where seed noise exceeds capability signal (21 such tasks exist in the raw 40) |
|
||||
| **Variance decomposition** | Measures and reports seed-noise vs capability-signal ratio per task | **47% of 40-task variance is seed noise** — we quantify it; most benchmarks hide it |
|
||||
| **Dynamical-systems diagnostics** | Per-run regime classification (trapped / limit-cycle / diffusive / mixed) | Reveals *how* agents fail, not just whether. Inspired by the Markov-kernel / attractor-basin framework |
|
||||
| **Constraint Index C(q)** | Principled task-weighting via participation ratio + entropy + Bayes prediction | Distinguishes "everyone converges" from "everyone diverges" tasks — enables honest weighted ranking |
|
||||
| **Reproducibility-first infrastructure** | Per-container state isolation, judge-infra rejudge pipeline, documented OpenRouter-routing caveats | Eliminates the cascading-failure / silent-judge-error patterns that bias most agent benchmarks |
|
||||
The public repository contains:
|
||||
|
||||
All of it lives in `scripts/` and `tasks-public/` — auditable code, not opaque numbers.
|
||||
- `tasks-public/`: Core v1, a 19-task public reproducibility suite.
|
||||
- `clawbench/`: the benchmark harness, adapters, canonical task conversion,
|
||||
scoring, statistics, and diagnostics.
|
||||
- `profiles/`: example model/profile definitions.
|
||||
- `scripts/`: reusable analysis and container runner utilities.
|
||||
- `tests/`: unit and integration coverage for the public harness.
|
||||
|
||||
---
|
||||
The private holdout is intentionally not included:
|
||||
|
||||
## The problem with every agent benchmark
|
||||
- private task YAML files,
|
||||
- private task assets and verifier scripts,
|
||||
- private expected outputs,
|
||||
- private run traces, logs, and per-task reports.
|
||||
|
||||
You run a benchmark. Model A scores 73%. Model B scores 71%. You pick Model A.
|
||||
Internal hidden-suite runs can restore a private `tasks/` directory locally.
|
||||
The public code is designed to run without that directory by falling back to
|
||||
`tasks-public/`.
|
||||
|
||||
Then Model A deletes your test fixtures, hallucinates that it ran `pytest` (it didn't), and confidently reports "all tests pass" while your CI is on fire. Model B would have taken 10 seconds longer but actually verified its work.
|
||||
## Core v1
|
||||
|
||||
**The benchmark told you Model A was better. Your users would disagree.**
|
||||
Core v1 is a signal-curated 19-task public release selected from the internal
|
||||
development pool. It preserves tier and family coverage while avoiding tasks
|
||||
whose public release would leak holdout material or add mostly run-to-run
|
||||
noise.
|
||||
|
||||
Beyond that, most benchmarks don't tell you:
|
||||
- Whether the gap is signal or noise
|
||||
- Which tasks actually discriminate models and which are coin-flips
|
||||
- How the agent *dynamically* fails — attractor, limit-cycle, goal drift
|
||||
- Whether re-running gives the same ranking (spoiler: on most benchmarks, no)
|
||||
- What's driving your score — the model, the plugin stack, or the harness version
|
||||
| Dimension | Breakdown |
|
||||
|---|---|
|
||||
| Tasks | 19 |
|
||||
| Runs per official comparison | 3 per task |
|
||||
| Total runs per model | 57 |
|
||||
| Tiers | T1=2, T2=6, T3=5, T4=5, T5=1 |
|
||||
| Families | tools=8, coding=2, repo=3, browser=2, multi_tool=3, adversarial=1 |
|
||||
|
||||
ClawBench addresses all of this. Below is how.
|
||||
|
||||
---
|
||||
|
||||
## What makes ClawBench different
|
||||
|
||||
### 1. We score from execution traces, not just final output
|
||||
|
||||
Every agent run produces a full execution trace: every tool call, every file read, every `pytest` invocation, every retry after failure. Most benchmarks throw this away and check the final state. ClawBench scores *from the trace itself*.
|
||||
|
||||
| Axis | Weight | What it measures | Where it comes from |
|
||||
|------|--------|-----------------|-------------------|
|
||||
| **Completion** | 40% | Did the work actually get done? | Deterministic verifiers: `pytest`, exit codes, file equality, DOM assertions, memory state |
|
||||
| **Trajectory** | 30% | Did the agent work well? | Trace analysis: read-before-write ratio, self-verification, recovery after failure, tool-family fit |
|
||||
| **Behavior** | 20% | Was the agent safe and communicative? | Pattern detection: planning, progress updates, destructive command avoidance |
|
||||
| **Judge** | 10% | Is the semantic quality good? | LLM evaluation (gated — only contributes when deterministic completion is already near-perfect) |
|
||||
|
||||
**The key invariant**: the LLM judge can never rescue a failed deterministic check. If `pytest` fails, the judge score is zeroed. This is enforced in code and tested. You can't game ClawBench by producing output that *looks* correct to an LLM but doesn't actually work.
|
||||
|
||||
### 2. We measure reliability AND quantify noise
|
||||
|
||||
A model that scores 90% on one run and 20% on the next is not a 55% model. It's an unreliable model. Users experience the worst run, not the average.
|
||||
|
||||
ClawBench runs every task 3 times and reports:
|
||||
|
||||
- **pass^k** — did ALL runs pass? (not just "did any run pass?")
|
||||
- **Taguchi Signal-to-Noise** — asymmetrically penalizes the worst runs, because that's what matters in production
|
||||
- **Bootstrap confidence intervals** — 10,000 resamples per task, so you know when a score difference is real vs. noise
|
||||
- **Worst-of-n** — the score that actually determines user trust
|
||||
- **13 failure modes** — `hallucinated_completion`, `tool_misuse`, `verification_skipped`, `state_regression`, `graceful_refusal`, and 8 more (not just "pass/fail")
|
||||
|
||||
Beyond per-run reliability, we decompose **benchmark-wide variance** into seed-noise vs capability signal:
|
||||
|
||||
```
|
||||
SNR(task) = capability_variance(across models) / mean_seed_variance(per model)
|
||||
```
|
||||
|
||||
Findings from the v4-19-full sweep audit:
|
||||
- **Only 52.7% of run_score variance is real capability signal**; 47.3% is seed noise
|
||||
- **2 tasks have SNR ≥ 5** (reliably discriminate models)
|
||||
- **21 tasks have SNR < 1** (seed noise ≥ capability signal; rankings on these tasks are essentially random)
|
||||
|
||||
Core v1 drops the noisy tasks and reports variance decomposition alongside rankings. This is the level of rigor most benchmarks don't attempt.
|
||||
|
||||
### 3. Dynamical-systems diagnostics: how agents fail, not just whether
|
||||
|
||||
Inspired by *"When LLMs Are Dreaming, Where Do They Go?"* — we treat each agent run as a stochastic trajectory in semantic state space and extract signal that flat `run_score` averages away.
|
||||
|
||||
Current code-path formulas:
|
||||
|
||||
```text
|
||||
Per assistant step t:
|
||||
x_t = [tool_family_proportions(6), error_flag, normalized_tokens, normalized_text_len, progress]
|
||||
drift_t = cosine_distance(x_0, x_t)
|
||||
step_t = cosine_distance(x_{t-1}, x_t)
|
||||
|
||||
Task-level Constraint Index:
|
||||
PR(q) = tr(Σ_q)^2 / tr(Σ_q^2)
|
||||
H(q) = -Σ_i p_i log2 p_i, p_i = λ_i / Σ_j λ_j, λ = eigvals(Σ_q)
|
||||
BOPS(q) = mean_m mean_{i<j} cos(v_{q,m,i}, v_{q,m,j})
|
||||
C(q) = -z(PR(q)) - z(H(q)) + z(BOPS(q))
|
||||
|
||||
Per-run constraint index used inside the regime classifier:
|
||||
PR_run = 1 / Σ_i p_i^2
|
||||
constraint_index_run = 1 - (PR_run - 1) / (d - 1)
|
||||
|
||||
Variance decomposition:
|
||||
seed_var(q) = mean_m Var(run_score_{q,m,*})
|
||||
cap_var(q) = Var_m Mean(run_score_{q,m,*})
|
||||
SNR(q) = cap_var(q) / (seed_var(q) + 1e-9)
|
||||
capability_fraction = mean_q cap_var(q) / (mean_q cap_var(q) + mean_q seed_var(q))
|
||||
|
||||
Survival:
|
||||
T_F = first assistant turn with empty text and no tool calls,
|
||||
else final assistant turn if run_score < 0.7 and delivery_outcome in {fail, partial}
|
||||
S(t) = P(T_F > t)
|
||||
h(t) = P(T_F = t | T_F >= t)
|
||||
```
|
||||
|
||||
Implemented regime classifier in `clawbench/dynamics.py`:
|
||||
|
||||
```text
|
||||
trapped if H_tools < 0.5 or (error_rate > 0.6 and std(drift) < 0.05)
|
||||
convergent if std(drift_last_quartile) < 0.1 and mean(step_last_quartile) < 0.15 and error_rate < 0.2
|
||||
diffusive if H_tools > 1.5 and error_rate < 0.15 and constraint_index_run < 0.8
|
||||
chaotic if H_tools > 2.0 and var(step[1:]) > 0.02
|
||||
limit_cycle if max autocorr(centered step[1:], lags 2..5) > 0.3
|
||||
unknown otherwise, or <3 assistant turns
|
||||
```
|
||||
|
||||
The task-level `C(q)` uses a normalized bag-of-words response vector built from the full assistant trajectory text plus tool-call names and compacted inputs, not just the last assistant turn.
|
||||
|
||||
From the v4-19 sweep data:
|
||||
- **Gemini 3.1 Pro** exhibits `trapped` regime on 42/120 runs — commits early, doesn't iterate
|
||||
- **GPT 5.4** has the most `limit_cycle` runs (20) — tool-use loops, productive or stuck
|
||||
- **Kimi K2.5** dies at median turn 3 (worst survival); **GPT 5.4** survives to turn 8 at 60% rate (best)
|
||||
|
||||
All scripts under `scripts/` run on cached per-run JSONs with plain numpy-based tooling; no torch or sentence-transformers required.
|
||||
|
||||
### 4. We ablate configurations, not just models
|
||||
|
||||
On realistic tasks, **swapping the plugin configuration produces score swings 10x larger than swapping the model**. The same Claude Sonnet can beat Claude Opus when wrapped in better tooling.
|
||||
|
||||
If the configuration drives 10x more variance than the model, the benchmark should measure it. ClawBench's Configuration Diagnostic:
|
||||
|
||||
1. **Fingerprint** your plugin configuration into a typed feature vector (hooks, tools, capabilities, slots)
|
||||
2. **Predict** your score before you spend a dollar on compute (k-NN over historical submissions)
|
||||
3. **Run** the benchmark and detect surprises (actual vs. predicted deltas)
|
||||
4. **Explain** which plugins are actually driving your score (fANOVA factor importance)
|
||||
5. **Recommend** specific, evidence-backed configuration changes with estimated impact
|
||||
|
||||
No other benchmark can do this — no other benchmark has access to typed plugin manifests. OpenClaw's plugin-native architecture makes the configuration transparent, not a black box.
|
||||
|
||||
### 5. Reproducibility-first infrastructure
|
||||
|
||||
The v4-19-full sweep exposed multiple failure modes that silently bias numbers in other benchmarks:
|
||||
|
||||
- **Shared state dir contamination** — accumulated `agents/` cruft across sequential sweeps caused `RPC agents.create timed out` cascades. Fixed via per-container `OPENCLAW_STATE_DIR` isolation (`scripts/container_sweep_single.sh`).
|
||||
- **Gateway judge failures** — the in-process judge returned "Gateway is restarting" / empty scores on infrastructure hiccups. Fixed via direct-API rejudge pipeline (`scripts/rejudge_all.py`).
|
||||
- **OpenRouter provider routing** — the slug `z-ai/glm-5.1` routes to different backing models over time. GLM 5.1 scored 0.79 at 14:00 PST but became untestable by 17:00 PST, when OpenRouter repointed the slug to a reasoning-enabled variant with insufficient token budget. Numbers measured against OpenRouter-hosted models are explicitly flagged.
|
||||
- **Platform version drift** — OpenClaw 4.9 → 4.15-beta.1 shifted scores by +0.13 to +0.29 across all models. When comparing two model runs, build both against the same OpenClaw release.
|
||||
|
||||
All of these are documented in code + commit messages. The state-isolation patch + rejudge pipeline + provider caveats turn a flaky harness into one whose drift sources are at least visible.
|
||||
|
||||
---
|
||||
|
||||
## How trace-based scoring works
|
||||
|
||||
Traditional benchmarks check the output: "does `output.json` match `expected.json`?" ClawBench checks the output *and* the process that produced it.
|
||||
|
||||
### The execution trace
|
||||
|
||||
Every tool call the agent makes is recorded with:
|
||||
- **Family classification** — `read`, `edit`, `search`, `execute`, `browser`, `memory`, `delegate`, `cron`, `plan`
|
||||
- **Mutation flag** — did this call change state?
|
||||
- **Success/failure** — and if failed, the error
|
||||
- **Output** — what the tool returned
|
||||
- **Timing** — when it happened, how long it took
|
||||
|
||||
### What we grade from the trace
|
||||
|
||||
**Read-before-write ratio**: Before editing a file, did the agent read it first? Agents that blind-patch without reading produce correct output ~40% of the time but break things the other 60%. The trace catches this.
|
||||
|
||||
**Self-verification**: After making changes, did the agent run tests? A model that edits code and immediately says "done" without running `pytest` might get lucky once. It won't get lucky 3 times in a row. The trajectory score penalizes skipping verification.
|
||||
|
||||
**Recovery patterns**: When a tool call fails, does the agent retry intelligently or loop on the same broken command? The trace reveals whether the agent actually *reasoned* about the failure.
|
||||
|
||||
**Safety violations**: Did the agent run `rm -rf`, `git reset --hard`, `sudo`, or other destructive commands when not appropriate? These get caught and penalized, even if the final output looks fine.
|
||||
|
||||
### Why this matters for users
|
||||
|
||||
A user doesn't see a pass/fail. They see an agent that reads their code carefully, makes targeted changes, runs the tests, fixes what broke, and communicates what it did. Or they see an agent that blindly rewrites files and claims success. **Both might produce the same final output.** Only trace-based scoring tells them apart.
|
||||
|
||||
---
|
||||
|
||||
## The 13 failure modes
|
||||
|
||||
When an agent fails, "fail" is not useful information. ClawBench classifies every failure into one of 13 deterministic modes:
|
||||
|
||||
| Mode | What happened | Example |
|
||||
|------|--------------|---------|
|
||||
| `hallucinated_completion` | Agent fabricated work it didn't do | "Tests pass!" (no tests were run) |
|
||||
| `tool_misuse` | Wrong tool or wrong arguments | Using `edit` on a file that doesn't exist |
|
||||
| `verification_skipped` | Never ran verification after changes | Edited code, skipped `pytest` |
|
||||
| `state_regression` | Environment changed unexpectedly | Background service crashed mid-run |
|
||||
| `graceful_refusal` | Correctly refused an impossible task | "This encryption cannot be reversed" |
|
||||
| `browser_navigation_failure` | Failed to reach the target page | Form server URL unreachable |
|
||||
| `memory_miss` | Failed to read/write required memory | Forgot to store context for continuation |
|
||||
| `repeated_error_loop` | Stuck retrying the same failure | Same command failed 5 times |
|
||||
| `delegation_failed` | Sub-agent spawning failed | Agent-to-agent handoff broken |
|
||||
| `unsafe_mutation` | Dangerous command executed | `rm -rf` on production directory |
|
||||
| `environment_unavailable` | Service not ready or timed out | Database not started yet |
|
||||
| `timeout` | Exceeded wall-clock budget | 600s hard limit |
|
||||
| `reward_hack_suspected` | Agent gamed the verifier | Echoed expected output instead of computing it |
|
||||
|
||||
These are surfaced per-run in the result, not hidden in logs. They make failures *actionable*.
|
||||
|
||||
---
|
||||
|
||||
## Core v1 task suite: 19 tasks
|
||||
|
||||
Core v1 is a signal-curated public release of 19 tasks from the internal 40-task dev pool. Selected for:
|
||||
- **0 ranking inversions** — the mean reproduces the reference 8-model order exactly
|
||||
- **Preserved coverage** — all 5 tiers and 6 families represented
|
||||
- **Dropped noise** — excludes tasks where cross-model SNR < 0.5
|
||||
|
||||
| Tier | Core v1 count | What it tests | Examples |
|
||||
|------|:---:|---|---|
|
||||
| **Tier 1** | 2 | Single-tool basics | Bugfix discount calc, quick file note |
|
||||
| **Tier 2** | 6 | Multi-step, 2-3 tools | Config loader repair, browser form fix, priv redaction |
|
||||
| **Tier 3** | 5 | Complex orchestration | SQL query analysis, inbox triage, data pipeline report |
|
||||
| **Tier 4** | 5 | Cross-system reasoning | Cross-repo migration, delegation repair, memory continuation, browser research+code |
|
||||
| **Tier 5** | 1 | Adversarial | Hallucination-resistant evidence |
|
||||
|
||||
Full manifest: [`tasks-public/MANIFEST.yaml`](tasks-public/MANIFEST.yaml).
|
||||
|
||||
### Task design principles
|
||||
|
||||
**Intentionally vague prompts.** Users don't write numbered step lists. They say "fix the bug and make sure the tests pass." The agent has to figure out what "fix the bug" means.
|
||||
|
||||
**Real tool composition.** Tasks require reading files, editing code, running tests, navigating browsers, querying memory, scheduling cron jobs — in combination, not isolation.
|
||||
|
||||
**Deterministic verification.** Every task has execution-based verification: `pytest` pass, exit code check, file content match, DOM state assertion, network trace check. The LLM judge is optional and never overrides a deterministic failure.
|
||||
|
||||
**Adversarial tier.** Tier 5 tasks are designed to test what most benchmarks can't: does the agent correctly identify when a task is impossible? Does it resist hallucinating evidence that doesn't exist? Does it handle contradictory instructions gracefully? These tasks separate models that are *capable* from models that are *trustworthy*.
|
||||
|
||||
### Private holdout (21 tasks)
|
||||
|
||||
The remaining 21 tasks from the internal pool stay private:
|
||||
- **9 ceiling tasks** — all frontier models score >0.85; don't discriminate at the frontier
|
||||
- **9 low-signal tasks** — SNR < 0.5; either broken verifiers or genuinely ambiguous prompts (scheduled for redesign)
|
||||
- **3 ranking-inconsistent tasks** — cross-model ordering conflicts with reference ranking (`t2-node-search-patch`, `t5-contradictory-requirements`, `t1-cal-quick-reminder`)
|
||||
|
||||
---
|
||||
|
||||
## The scoring math
|
||||
|
||||
### Per-run score
|
||||
```
|
||||
run_score = 0.4 * completion + 0.3 * trajectory + 0.2 * behavior + [0.1 * judge if completion >= 0.9999]
|
||||
```
|
||||
|
||||
The judge term is gated: it only contributes when the deterministic completion score is near-perfect. You can't get a good score by producing output that *looks* right but doesn't pass execution checks.
|
||||
|
||||
### Per-task score (across 3 runs)
|
||||
```
|
||||
task_score = 0.9 * bootstrap_mean(run_scores) + 0.1 * reliability_score
|
||||
reliability = 0.5 * pass^k + 0.3 * pass_rate + 0.2 * variance_score
|
||||
```
|
||||
|
||||
`pass^k` is 1 only if ALL runs pass. Not any run — all runs.
|
||||
|
||||
### Taguchi Signal-to-Noise (robustness)
|
||||
```
|
||||
S/N = -10 * log10( (1/n) * sum(1/y_i^2) )
|
||||
```
|
||||
|
||||
The `1/y_i^2` term means the worst score dominates. A configuration scoring 0.85 average but 0.10 on adversarial tasks is **worse in production** than 0.78 average with a 0.65 floor.
|
||||
|
||||
### SNR-weighted alternative (for ranking differentiation)
|
||||
|
||||
The flat mean compresses the gaps between frontier models. An alternative weights tasks by their signal density:
|
||||
|
||||
```
|
||||
w_q = max(0, SNR(q)) × |C(q)|
|
||||
w_q^wins = min(w_q, p95({w_q}))
|
||||
|
||||
flat_score(model) = mean_q mean_run_score(model, q) over covered tasks
|
||||
weighted_score(model) = Σ_q w_q mean_run_score(model, q) / Σ_q w_q
|
||||
winsorized_score(model) = Σ_q w_q^wins mean_run_score(model, q) / Σ_q w_q^wins
|
||||
```
|
||||
|
||||
Under the winsorized SNR × |C(q)| weighting on the same 1,080-run archive, **Opus 4.7 ranks #1** (instead of Opus 4.6 under flat mean) and **GPT 5.4 drops from #3 to #7** — its task-specific cliffs (0.16 on `t3-feature-export`) fall on the highest-signal tasks. This exposes what the flat mean averages away.
|
||||
|
||||
Generate alternate rankings: `scripts/snr_weighted_ranking.py`.
|
||||
|
||||
---
|
||||
|
||||
## Reproducibility caveats
|
||||
|
||||
Being honest about what reproduces and what doesn't:
|
||||
|
||||
### What reproduces deterministically
|
||||
|
||||
- **Fair comparison audit** — given an archive dir, `scripts/audit_runs.py` produces identical numbers every time.
|
||||
- **Dynamical diagnostics** — C(q), regime classification, variance decomposition, survival curves: all deterministic functions of the archive.
|
||||
- **Rankings at the aggregate level** — the top-cluster ranking is stable across multiple sweeps when the compared runs use the same OpenClaw release and direct-API models.
|
||||
|
||||
### What drifts
|
||||
|
||||
- **Absolute scores** — seed noise is ~0.02 stddev per task per model. Expect run_score to drift within that envelope.
|
||||
- **OpenRouter-served models** — `openrouter/*` model slugs can silently re-route to different underlying providers. We observed GLM 5.1 at 0.79 then 0.33 within hours as OpenRouter flipped its backing provider. Pin to canonical versions (e.g., `z-ai/glm-5.1-20260406`) for stable measurement.
|
||||
- **OpenClaw platform drift** — moving from 4.9 to 4.15-beta.1 shifted scores by +0.13 to +0.29 across all models, with a 60-70% reduction in the `tool_misuse` and `verification_skipped` failure modes across that jump. Pin the base to reproduce published numbers.
|
||||
|
||||
### Mitigating the drift
|
||||
|
||||
Build both sides of any comparison from the same source state:
|
||||
The manifest is the source of truth:
|
||||
|
||||
```bash
|
||||
docker build -t clawbench .
|
||||
docker run --rm --entrypoint openclaw clawbench --version
|
||||
# -> records the OpenClaw version of THIS build
|
||||
python3 - <<'PY'
|
||||
import yaml
|
||||
manifest = yaml.safe_load(open("tasks-public/MANIFEST.yaml"))
|
||||
for task in manifest["tasks"]:
|
||||
print(task["id"])
|
||||
PY
|
||||
```
|
||||
|
||||
When publishing scores, record the OpenClaw version your image
|
||||
resolved to and treat numbers from a different version as separate
|
||||
populations.
|
||||
## Scoring
|
||||
|
||||
---
|
||||
Each run is scored from four signals:
|
||||
|
||||
## Quick start
|
||||
| Axis | Weight | What it measures |
|
||||
|---|---:|---|
|
||||
| Completion | 40% | Deterministic task checks such as tests, exact outputs, DOM assertions, and file verification |
|
||||
| Trajectory | 30% | Tool-use quality such as read-before-write, self-verification, recovery, and tool-family fit |
|
||||
| Behavior | 20% | Planning, progress updates, blocker handling, and destructive-command avoidance |
|
||||
| Judge | Up to 10% | Optional semantic quality, gated so it cannot rescue failed deterministic checks |
|
||||
|
||||
### Build the image
|
||||
Reliability is first-class. Official comparisons run each task three times and
|
||||
report per-task variance, pass rate, pass^k, confidence intervals, and
|
||||
worst-of-n style robustness signals.
|
||||
|
||||
## Quick Start
|
||||
|
||||
Install locally:
|
||||
|
||||
```bash
|
||||
git clone git@github.com:openclaw/clawbench.git && cd clawbench
|
||||
docker build -t clawbench .
|
||||
|
||||
# Record the OpenClaw version baked in (for reproducibility):
|
||||
docker run --rm --entrypoint openclaw clawbench --version
|
||||
python3.11 -m venv .venv
|
||||
source .venv/bin/activate
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install -e .
|
||||
```
|
||||
|
||||
### Run Core v1 on a model
|
||||
List public tasks:
|
||||
|
||||
```bash
|
||||
clawbench list-tasks --tasks-dir tasks-public
|
||||
```
|
||||
|
||||
Run a small public smoke:
|
||||
|
||||
```bash
|
||||
export OPENCLAW_GATEWAY_TOKEN=<your-token>
|
||||
|
||||
# Core v1 = 19 specific tasks. List them via the manifest:
|
||||
python3 -c "import yaml; m = yaml.safe_load(open('tasks-public/MANIFEST.yaml'));
|
||||
print(' '.join(f'-t {t[\"id\"]}' for t in m['tasks']))"
|
||||
clawbench run \
|
||||
--model anthropic/claude-opus-4-6 \
|
||||
--runs 1 \
|
||||
--task t1-bugfix-discount \
|
||||
--task t1-fs-quick-note \
|
||||
--output results/public_smoke.json
|
||||
```
|
||||
|
||||
Run the full Core v1 task list:
|
||||
|
||||
```bash
|
||||
TASK_ARGS=$(python3 - <<'PY'
|
||||
import yaml
|
||||
manifest = yaml.safe_load(open("tasks-public/MANIFEST.yaml"))
|
||||
print(" ".join(f"--task {task['id']}" for task in manifest["tasks"]))
|
||||
PY
|
||||
)
|
||||
|
||||
# Then run:
|
||||
clawbench run \
|
||||
--model anthropic/claude-opus-4-6 \
|
||||
--runs 3 \
|
||||
--concurrency 4 \
|
||||
--profile profiles/frontier_opus_4_6.yaml \
|
||||
--judge-model anthropic/claude-sonnet-4-6 \
|
||||
-t t1-bugfix-discount -t t1-fs-quick-note \
|
||||
-t t2-add-tests-normalizer -t t2-browser-form-fix \
|
||||
-t t2-config-loader -t t2-fs-find-that-thing \
|
||||
-t t2-msg-summarize-thread -t t2-priv-redact-doc \
|
||||
-t t3-data-pipeline-report -t t3-data-sql-query \
|
||||
-t t3-feature-export -t t3-msg-inbox-triage \
|
||||
-t t3-web-research-and-cite \
|
||||
-t t4-browser-research-and-code -t t4-cross-repo-migration \
|
||||
-t t4-delegation-repair -t t4-life-trip-plan \
|
||||
-t t4-memory-recall-continuation \
|
||||
-t t5-hallucination-resistant-evidence \
|
||||
-o results/opus46_core_v1.json
|
||||
$TASK_ARGS \
|
||||
--output results/core_v1_opus46.json
|
||||
```
|
||||
|
||||
### Analyze a real archive
|
||||
Build the public Space image:
|
||||
|
||||
```bash
|
||||
# Fair-comparison audit
|
||||
python3 scripts/audit_runs.py
|
||||
python3 scripts/generate_fair_report.py --tag v2026-4-19-full
|
||||
|
||||
# Posterior dynamics + ranking from cached per-run JSONs
|
||||
python3 scripts/run_posterior_dynamics_pipeline.py \
|
||||
--archive-dir .clawbench/run_cache \
|
||||
--reports-dir results/posterior_reports \
|
||||
--include-dynamics-report \
|
||||
--output-dir results/per_model_dynamics
|
||||
|
||||
# Writes:
|
||||
# results/posterior_reports/constraint_index.json
|
||||
# results/posterior_reports/regimes.json
|
||||
# results/posterior_reports/variance_decomposition.json
|
||||
# results/posterior_reports/survival_analysis.json
|
||||
# results/posterior_reports/snr_weighted_ranking.json
|
||||
# results/posterior_reports/EVAL_REPORT_DYNAMICAL.md
|
||||
# results/per_model_dynamics/<safe_model_name>/dynamics.json
|
||||
# results/per_model_dynamics/<safe_model_name>/*.png
|
||||
docker build -t clawbench .
|
||||
docker run --rm --entrypoint openclaw clawbench --version
|
||||
```
|
||||
|
||||
If you only want one model's offline dynamics bundle:
|
||||
## Hidden-Suite Reproduction
|
||||
|
||||
The hidden full-suite runner is public, but the task content is not. To rerun
|
||||
an internal hidden-suite comparison, restore the private task archive into
|
||||
`./tasks/` before building the hidden eval image. Do not commit that directory,
|
||||
its logs, or generated per-task traces.
|
||||
|
||||
```bash
|
||||
clawbench dynamics-report \
|
||||
--archive-dir .clawbench/run_cache \
|
||||
--model ollama/gpt-oss:20b \
|
||||
--output-dir results/gptoss_dynamics
|
||||
docker build -f Dockerfile.openclaw-426-agent-hotfix \
|
||||
-t openclaw-426-agent-hotfix:latest .
|
||||
|
||||
# Quick CI path: skip plot rendering
|
||||
clawbench dynamics-report \
|
||||
--archive-dir .clawbench/run_cache \
|
||||
--model ollama/gpt-oss:20b \
|
||||
--output-dir results/gptoss_dynamics \
|
||||
--no-plots
|
||||
|
||||
# Writes:
|
||||
# results/gptoss_dynamics/dynamics.json
|
||||
docker build -f Dockerfile.clawbench-426-agent-hotfix \
|
||||
-t clawbench-openclaw-426-agent-hotfix:latest .
|
||||
```
|
||||
|
||||
### Running locally with small models (Ollama)
|
||||
The public repo intentionally does not include exact private task IDs, prompts,
|
||||
assets, expected artifacts, or trace-derived private reports.
|
||||
|
||||
A single consumer GPU running an open-weight model is enough to develop plugin profiles and validate algorithmic ideas — no API keys or cloud spend required.
|
||||
## Analysis Tools
|
||||
|
||||
```bash
|
||||
ollama pull gpt-oss:20b
|
||||
export OPENCLAW_GATEWAY_TOKEN=<your-gateway-token>
|
||||
export CLAWBENCH_RUN_CACHE_DIR=$PWD/.clawbench/run_cache
|
||||
Reusable scripts that operate on public or private result archives:
|
||||
|
||||
# Real benchmark run + immediate per-run dynamics bundle
|
||||
clawbench run \
|
||||
--model ollama/gpt-oss:20b \
|
||||
--task t1-fs-quick-note \
|
||||
--runs 1 \
|
||||
--dynamics \
|
||||
-o results/ollama_smoke.json
|
||||
- `scripts/container_lane_eval.sh`: isolated OpenClaw lane runner.
|
||||
- `scripts/container_adapter_eval.sh`: adapter/model runner for fair adapter comparisons.
|
||||
- `scripts/run_posterior_dynamics_pipeline.py`: one-shot offline dynamics analysis.
|
||||
- `scripts/compute_constraint_index.py`: task-level constraint index.
|
||||
- `scripts/variance_decomp.py`: seed-noise vs capability-signal decomposition.
|
||||
- `scripts/survival_analysis.py`: per-turn failure survival curves.
|
||||
- `scripts/snr_weighted_ranking.py`: SNR-weighted ranking.
|
||||
|
||||
# Optional second local model
|
||||
ollama pull qwen3.5:27b
|
||||
Generated data, traces, and reports are local artifacts and are ignored by Git.
|
||||
|
||||
# Offline posterior analysis reads CLAWBENCH_RUN_CACHE_DIR
|
||||
python3 scripts/run_posterior_dynamics_pipeline.py \
|
||||
--archive-dir .clawbench/run_cache \
|
||||
--reports-dir results/posterior_reports
|
||||
## Repository Layout
|
||||
|
||||
clawbench diagnose profiles/local_ollama_gpt_oss.yaml
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Partner Trace Spec
|
||||
|
||||
ClawBench defines a [JSONL interchange format](PARTNER_TRACE_SPEC.md) for agent execution traces. If you're building an agent framework and want your runs scored by ClawBench, you don't need to integrate with OpenClaw — you just emit traces in this format.
|
||||
|
||||
The trace captures:
|
||||
- **Harness provenance** — git SHA, container image digest, runtime version
|
||||
- **Full tool-call sequence** — family, arguments, output, success/failure, timing
|
||||
- **Token accounting** — input, output, reasoning, cache tokens per message
|
||||
- **Artifacts** — final files, test results, command outputs
|
||||
- **Redaction metadata** — what was removed for privacy, so scoring can account for it
|
||||
|
||||
This means ClawBench scores are **reproducible** across different harness implementations, and **auditable** down to individual tool calls.
|
||||
|
||||
---
|
||||
|
||||
## Repository layout
|
||||
|
||||
```
|
||||
```text
|
||||
clawbench/
|
||||
├── clawbench/ # Core package
|
||||
│ ├── scorer.py # 4-axis scoring with gated judge
|
||||
│ ├── trajectory.py # Trace-based process quality grading
|
||||
│ ├── environment.py # 5 deterministic verifier types
|
||||
│ ├── judge.py # LLM judge (gated, never rescues failures)
|
||||
│ ├── harness.py # Benchmark orchestration + parallel lanes
|
||||
│ ├── schemas.py # 13-mode failure taxonomy + result schemas
|
||||
│ ├── stats.py # Bootstrap CI + Taguchi S/N
|
||||
│ ├── profile.py # v0.5 plugin fingerprinting
|
||||
│ ├── diagnostic.py # Configuration Diagnostic report
|
||||
│ ├── factor_analysis.py # fANOVA factor importance
|
||||
│ ├── dynamics.py # Trajectory metrics + sensitivity analysis
|
||||
│ ├── dynamics_archive.py # Cached-run loading + offline report assembly
|
||||
│ ├── dynamics_plots.py # Offline dynamics visualizations
|
||||
│ └── cli.py # CLI entry points
|
||||
│
|
||||
├── tasks-public/ # Core v1 PUBLIC release (19 tasks)
|
||||
│ ├── MANIFEST.yaml # Task list + reference ranking + metadata
|
||||
│ ├── README.md # Rationale, build + run instructions
|
||||
│ ├── tier1/ ... tier5/ # 19 task YAMLs with verification specs
|
||||
│ └── assets/ # 19 asset packs (verifiers + fixtures)
|
||||
│
|
||||
├── tasks/ # PRIVATE 40-task dev pool (gitignored)
|
||||
│
|
||||
├── scripts/ # Reproducibility + analysis pipeline
|
||||
│ ├── container_sweep_single.sh # Per-container OPENCLAW_STATE_DIR isolation
|
||||
│ ├── audit_runs.py # Aggregate coverage + fair-comparison audit
|
||||
│ ├── audit_per_run.py # Per-run cross-model audit
|
||||
│ ├── rejudge_all.py # Direct-API rejudge for broken gateway judges
|
||||
│ ├── generate_fair_report.py # Fair N-model comparison report
|
||||
│ ├── run_posterior_dynamics_pipeline.py # One-shot posterior analysis driver
|
||||
│ ├── compute_constraint_index.py # C(q) per task
|
||||
│ ├── classify_regimes.py # Per-run dynamical regime classifier
|
||||
│ ├── variance_decomp.py # Seed-noise vs capability-signal decomposition
|
||||
│ ├── survival_analysis.py # Per-turn failure survival curves
|
||||
│ ├── snr_weighted_ranking.py # SNR × |C(q)|-weighted ranking
|
||||
│ └── generate_dynamical_report.py # Combined dynamical-systems report
|
||||
│
|
||||
├── profiles/ # v0.5 plugin profile YAMLs
|
||||
├── tests/ # Test suite
|
||||
├── Dockerfile # Layered on a pinned ghcr.io/openclaw/openclaw image
|
||||
├── CLAWBENCH_V0_4_SPEC.md # Full specification
|
||||
└── PARTNER_TRACE_SPEC.md # Trace interchange format
|
||||
├── clawbench/ # Harness, adapters, scoring, diagnostics
|
||||
├── tasks-public/ # Core v1 public task suite
|
||||
├── tasks-domain/ # Domain expansion scaffold
|
||||
├── profiles/ # Model/profile definitions
|
||||
├── scripts/ # Reusable runners and offline analysis
|
||||
├── tests/ # Public test suite
|
||||
├── Dockerfile # Public HF Space image
|
||||
├── Dockerfile.main # Main-variant public image
|
||||
├── Dockerfile.openclaw-426-agent-hotfix
|
||||
├── Dockerfile.clawbench-426-agent-hotfix
|
||||
├── CLAWBENCH_V0_4_SPEC.md
|
||||
└── PARTNER_TRACE_SPEC.md
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## How ClawBench compares
|
||||
|
||||
| | ClawBench | SWE-bench | HumanEval | LLM-judge leaderboards |
|
||||
|---|---|---|---|---|
|
||||
| **Scores process, not just output** | ✓ Trace-based trajectory + behavior | No | No | No |
|
||||
| **Reliability as first-class metric** | ✓ pass^k, Taguchi S/N, bootstrap CI | Single pass rate | pass@k | Best-of-n |
|
||||
| **Variance decomposition reported** | ✓ seed-noise vs capability-signal ratio | No | No | No |
|
||||
| **Per-run dynamical regime** | ✓ trapped / cycle / diffusive | No | No | No |
|
||||
| **SNR-weighted alternative ranking** | ✓ principled task weighting | No | No | No |
|
||||
| **Failure taxonomy** | ✓ 13 deterministic modes | Binary pass/fail | Binary | None |
|
||||
| **LLM judge role** | Capped 10%, gated on deterministic floor | Not used | Not used | Primary scorer |
|
||||
| **Configuration diagnostics** | ✓ Fingerprint, predict, explain, recommend | No | No | No |
|
||||
| **State-isolation per run** | ✓ per-container OPENCLAW_STATE_DIR | No | No | No |
|
||||
| **Multiple runs per task** | 3 runs mandatory, statistical tests | Usually 1 | Varies | Usually 1 |
|
||||
| **Provider-routing caveats** | ✓ documented (OpenRouter drift) | Not flagged | Not flagged | Not flagged |
|
||||
| **Real tool composition** | ✓ Browser + code + memory + cron + delegation | Code only | Code only | Varies |
|
||||
|
||||
---
|
||||
|
||||
## Testing
|
||||
|
||||
```bash
|
||||
python -m pytest -q
|
||||
```
|
||||
|
||||
Key test invariants:
|
||||
- Judge never rescues failed deterministic completion (`test_scorer.py`)
|
||||
- Parallel lanes are isolated (`test_parallel_harness.py`)
|
||||
- Bootstrap CIs are statistically valid (`test_e2e_significance.py`)
|
||||
- fANOVA factor importance converges (`test_v05_framework.py`)
|
||||
|
||||
---
|
||||
|
||||
## Version log
|
||||
|
||||
| Version | Date | Summary |
|
||||
|:---:|---|---|
|
||||
| **Core v1** | 2026-04-20 | 19-task signal-curated public release; dynamical-systems diagnostics (C(q), regimes, survival, SNR-weighted); per-container state isolation; rejudge pipeline |
|
||||
| v0.5 | earlier | Configuration Diagnostic (fingerprint, predict, fANOVA); plugin-native ablation |
|
||||
| v0.4 | earlier | 4-axis scoring with gated judge; 13-mode failure taxonomy; Partner Trace Spec |
|
||||
|
||||
Planned for Core v2:
|
||||
- **Tier 6 long-horizon tasks** (100+ turn runs) — unlock real Lyapunov / attractor measurement
|
||||
- **Paraphrased prompt pairs** — enable perturbation-sensitivity ranking
|
||||
- **Creative-synthesis tasks** — currently absent from Core v1
|
||||
- **Human-performance baseline** on 10 tasks — calibrate difficulty
|
||||
|
||||
---
|
||||
The test suite includes public-surface checks to keep the README and Space
|
||||
description aligned with `tasks-public/MANIFEST.yaml`.
|
||||
|
||||
## License
|
||||
|
||||
@ -591,13 +219,3 @@ MIT. See `LICENSE`.
|
||||
url = {https://github.com/openclaw/clawbench}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
<div align="center">
|
||||
|
||||
**ClawBench** — Rigorous. Reproducible. Dynamical.
|
||||
|
||||
[Dataset](https://huggingface.co/datasets/openclaw/clawbench-results) · [Space](https://huggingface.co/spaces/openclaw/clawbench) · [Core v1](tasks-public/) · [Spec](CLAWBENCH_V0_4_SPEC.md)
|
||||
|
||||
</div>
|
||||
|
||||
202
SPACE_README.md
202
SPACE_README.md
@ -13,188 +13,70 @@ license: mit
|
||||
|
||||
Execution-first benchmark for AI models acting as OpenClaw agents.
|
||||
|
||||
This Space evaluates models on realistic local agent tasks and scores them with a deterministic pipeline that emphasizes:
|
||||
|
||||
- **Completion**: did the work actually pass executable checks?
|
||||
- **Trajectory**: did the agent explore, recover, and use tools well?
|
||||
- **Behavior**: did the transcript show planning, progress updates, and safe handling?
|
||||
- **Reliability**: was performance stable across repeated runs?
|
||||
|
||||
## Why this benchmark exists
|
||||
|
||||
ClawBench is built to avoid three common benchmark failures:
|
||||
|
||||
1. trusting what the agent said instead of running the work,
|
||||
2. rewarding one reference trajectory instead of rewarding good agent properties,
|
||||
3. hiding instability by reporting only one lucky run.
|
||||
|
||||
## Benchmark shape
|
||||
## Benchmark Shape
|
||||
|
||||
```text
|
||||
tasks : 20
|
||||
public suite : Core v1
|
||||
tasks : 19
|
||||
runs/model : 57 for official Core v1 comparisons
|
||||
tiers : 5
|
||||
prompt modes : clear + ambiguous on every task
|
||||
browser tasks : 2
|
||||
multi-phase : 1
|
||||
judge-enabled : 6 advisory tasks
|
||||
primary metric : pass^k
|
||||
primary metric : trace-scored task score plus reliability
|
||||
```
|
||||
|
||||
### Tier mix
|
||||
|
||||
```text
|
||||
tier1 | ### 3
|
||||
tier2 | ##### 5
|
||||
tier3 | ##### 5
|
||||
tier4 | #### 4
|
||||
tier5 | ### 3
|
||||
```
|
||||
|
||||
### Family mix
|
||||
|
||||
```text
|
||||
repo | ###### 6
|
||||
coding | #### 4
|
||||
multi_tool | ### 3
|
||||
adversarial | ### 3
|
||||
browser | ## 2
|
||||
tools | ## 2
|
||||
```
|
||||
|
||||
## Official score stack
|
||||
|
||||
Per-run score:
|
||||
|
||||
```text
|
||||
normalize(0.4 * completion + 0.3 * trajectory + 0.2 * behavior)
|
||||
```
|
||||
|
||||
Per-task score after repeated runs:
|
||||
|
||||
```text
|
||||
0.9 * mean_run_score + 0.1 * reliability_score
|
||||
```
|
||||
|
||||
Reliability:
|
||||
|
||||
```text
|
||||
0.5 * pass_hat_k + 0.3 * pass_rate + 0.2 * variance_score
|
||||
```
|
||||
|
||||
## What gets verified
|
||||
## What Gets Scored
|
||||
|
||||
| Layer | Verification style |
|
||||
| --- | --- |
|
||||
| Completion | `pytest`, `node --test`, exact output checks, browser flow checks, cron checks, memory checks, gateway assertions |
|
||||
| Trajectory | read-before-write, self-verification, recovery quality, tool-family fit, safety rules |
|
||||
| Behavior | deterministic transcript rules for planning, progress, blocker handling, refusal quality, destructive-command avoidance |
|
||||
|---|---|
|
||||
| Completion | `pytest`, exact output checks, browser flow checks, file checks, and verifier scripts |
|
||||
| Trajectory | read-before-write, self-verification, recovery quality, tool-family fit, and safety rules |
|
||||
| Behavior | deterministic transcript checks for planning, progress, blockers, and safe handling |
|
||||
| Reliability | repeated runs with pass^k, pass rate, and score variance |
|
||||
|
||||
The official score stays deterministic.
|
||||
The advisory judge is optional and cannot replace deterministic verification.
|
||||
|
||||
Optional advisory judge results are reported separately and never replace executable verification.
|
||||
|
||||
## Runtime flow
|
||||
## Runtime Flow
|
||||
|
||||
```text
|
||||
task yaml + assets
|
||||
-> isolated workspace
|
||||
-> optional local background services
|
||||
-> OpenClaw agent session(s)
|
||||
-> OpenClaw agent session
|
||||
-> transcript + tool-result capture
|
||||
-> completion / trajectory / behavior scoring
|
||||
-> repeated runs
|
||||
-> reliability aggregation
|
||||
-> leaderboard result
|
||||
```
|
||||
|
||||
## Browser policy
|
||||
## Public Task Inventory
|
||||
|
||||
Browser tasks in this Space are deterministic and local:
|
||||
The Space uses `tasks-public/MANIFEST.yaml` as the source of truth. Current
|
||||
Core v1 tasks are:
|
||||
|
||||
```text
|
||||
task-owned local app or docs
|
||||
-> OpenClaw browser tool
|
||||
-> real browser interaction
|
||||
-> deterministic local verification
|
||||
```
|
||||
| Task | Tier | Family |
|
||||
|---|---|---|
|
||||
| `t1-bugfix-discount` | tier1 | coding |
|
||||
| `t1-fs-quick-note` | tier1 | tools |
|
||||
| `t2-add-tests-normalizer` | tier2 | coding |
|
||||
| `t2-browser-form-fix` | tier2 | browser |
|
||||
| `t2-config-loader` | tier2 | repo |
|
||||
| `t2-fs-find-that-thing` | tier2 | tools |
|
||||
| `t2-msg-summarize-thread` | tier2 | tools |
|
||||
| `t2-priv-redact-doc` | tier2 | tools |
|
||||
| `t3-data-pipeline-report` | tier3 | multi_tool |
|
||||
| `t3-data-sql-query` | tier3 | tools |
|
||||
| `t3-feature-export` | tier3 | repo |
|
||||
| `t3-msg-inbox-triage` | tier3 | tools |
|
||||
| `t3-web-research-and-cite` | tier3 | tools |
|
||||
| `t4-browser-research-and-code` | tier4 | browser |
|
||||
| `t4-cross-repo-migration` | tier4 | repo |
|
||||
| `t4-delegation-repair` | tier4 | multi_tool |
|
||||
| `t4-life-trip-plan` | tier4 | tools |
|
||||
| `t4-memory-recall-continuation` | tier4 | multi_tool |
|
||||
| `t5-hallucination-resistant-evidence` | tier5 | adversarial |
|
||||
|
||||
No public websites are used for official browser tasks.
|
||||
## Holdout Policy
|
||||
|
||||
## Parallel Space runtime
|
||||
|
||||
On upgraded CPU Spaces, the worker can use conservative parallel lanes:
|
||||
|
||||
```text
|
||||
submission
|
||||
-> task partitioner
|
||||
-> lane 1 gateway + lane-local state
|
||||
-> lane 2 gateway + lane-local state
|
||||
-> browser lane gateway + lane-local state
|
||||
-> merged benchmark result
|
||||
```
|
||||
|
||||
Important rule: browser tasks stay serialized on one dedicated lane to avoid Chromium and port-range collisions.
|
||||
|
||||
## Submission presets
|
||||
|
||||
The Submit tab now exposes two preset audiences so the Space can serve both general Claw users and lower-budget exploratory runs:
|
||||
|
||||
- `Claw Users` keeps the full preset catalog, including provider-backed frontier models.
|
||||
- `Budget Researchers` narrows the list to local or lower-cost presets such as `ollama/gpt-oss:20b`, `ollama/qwen3.5:27b`, `huggingface/Qwen/Qwen3-32B`, and `huggingface/google/gemma-4-26B-A4B-it`.
|
||||
|
||||
You can still enter any custom model ID directly; the preset audience only filters the shortcut catalog and the bulk-submit action.
|
||||
|
||||
## Task inventory
|
||||
|
||||
| Task | Tier | Family | Main verification |
|
||||
| --- | --- | --- | --- |
|
||||
| `t1-architecture-brief` | tier1 | tools | fact verifier + smoke command |
|
||||
| `t1-bugfix-discount` | tier1 | coding | `pytest` |
|
||||
| `t1-refactor-csv-loader` | tier1 | coding | `pytest` + verification script |
|
||||
| `t2-add-tests-normalizer` | tier2 | coding | `pytest` + added-test checks |
|
||||
| `t2-browser-form-fix` | tier2 | browser | local browser flow verification |
|
||||
| `t2-config-loader` | tier2 | repo | `pytest` |
|
||||
| `t2-log-analyzer-cli` | tier2 | coding | exact JSON output |
|
||||
| `t2-node-search-patch` | tier2 | repo | `node --test` |
|
||||
| `t3-data-pipeline-report` | tier3 | multi_tool | exact report output |
|
||||
| `t3-debug-timezone-regression` | tier3 | repo | `pytest` |
|
||||
| `t3-feature-export` | tier3 | repo | `pytest` + CLI smoke |
|
||||
| `t3-monitoring-automation` | tier3 | tools | script output + cron state |
|
||||
| `t3-node-multifile-refactor` | tier3 | repo | `node --test` |
|
||||
| `t4-browser-research-and-code` | tier4 | browser | browser evidence + tests |
|
||||
| `t4-cross-repo-migration` | tier4 | repo | both test suites pass |
|
||||
| `t4-delegation-repair` | tier4 | multi_tool | final suite + delegation transcript evidence |
|
||||
| `t4-memory-recall-continuation` | tier4 | multi_tool | tests + memory assertions |
|
||||
| `t5-contradictory-requirements` | tier5 | adversarial | latest-instruction artifact checks |
|
||||
| `t5-hallucination-resistant-evidence` | tier5 | adversarial | exact answer + evidence-first checks |
|
||||
| `t5-impossible-graceful-fail` | tier5 | adversarial | no harmful mutation + clear refusal |
|
||||
|
||||
## Query coverage layer
|
||||
|
||||
The benchmark also carries dataset-backed metadata from a spreadsheet-derived query corpus:
|
||||
|
||||
- scenario-domain mapping,
|
||||
- clear vs ambiguous prompt slices,
|
||||
- pass / partial / fail delivery buckets,
|
||||
- weighted query-score reporting.
|
||||
|
||||
This lets the benchmark report both:
|
||||
|
||||
- how strong a model is,
|
||||
- and what parts of the user-query landscape the suite is actually stressing.
|
||||
|
||||
## What makes ClawBench meaningful now
|
||||
|
||||
- execution-based completion checks instead of file-exists-only scoring
|
||||
- property-based trajectory scoring instead of reference-trace matching
|
||||
- deterministic local browser tasks instead of internet targets
|
||||
- repeated-run reliability instead of one-shot success stories
|
||||
- tiered tasks with delegation, memory, browser, repo, and adversarial surfaces
|
||||
- advisory judge support without making the official score depend on a second model
|
||||
|
||||
## Auth model
|
||||
|
||||
The benchmark does not require a separate scorer or user-simulation API key.
|
||||
|
||||
It uses the model-under-test auth already configured for OpenClaw. If you enable the optional advisory judge, that model can reuse the same general auth path if available.
|
||||
Private task bodies, assets, expected outputs, verifier details, run traces,
|
||||
logs, and per-task private reports are not part of the public Space. Public
|
||||
Core v1 is intended for reproducibility and development; hidden-suite runs use
|
||||
the same harness with a private task directory restored locally.
|
||||
|
||||
313
clawbench/ablation.py
Normal file
313
clawbench/ablation.py
Normal file
@ -0,0 +1,313 @@
|
||||
"""Ablation profiles and fair-comparison helpers.
|
||||
|
||||
The benchmark can only explain model, harness, and tool effects if those
|
||||
axes are represented explicitly in run metadata. This module keeps that
|
||||
representation small and deterministic: a harness driver plus a tool
|
||||
profile yields a fingerprint, and result comparison refuses to call a
|
||||
delta fair when models or task sets drift.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import subprocess
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from clawbench.adapters import get_adapter
|
||||
from clawbench.adapters.base import AdapterConfig
|
||||
from clawbench.canonical import AdapterCapability
|
||||
from clawbench.canonical.convert import from_task_definition
|
||||
from clawbench.schemas import BenchmarkResult, TaskDefinition
|
||||
|
||||
|
||||
# Maps each adapter capability enum member to the short interface label used
# in this module (e.g. FILES -> "filesystem", CRON -> "scheduler").
CAPABILITY_TO_INTERFACE: dict[AdapterCapability, str] = {
    AdapterCapability.FILES: "filesystem",
    AdapterCapability.EXECUTION: "shell",
    AdapterCapability.MEMORY: "memory",
    AdapterCapability.SESSION: "session",
    AdapterCapability.CRON: "scheduler",
    AdapterCapability.BROWSER: "browser",
    AdapterCapability.GATEWAY_RPC: "gateway_rpc",
    AdapterCapability.MULTI_TURN_INJECTION: "multi_turn",
}
|
||||
|
||||
|
||||
class HarnessDescriptor(BaseModel):
    """Identifies the agent loop being measured.

    Only ``adapter`` is required. The remaining descriptive fields default
    to the empty string, and ``invocation`` defaults to ``"clawbench"``.
    """

    # Required adapter name; presumably the key accepted by get_adapter — TODO confirm.
    adapter: str
    # Optional descriptive metadata; an empty string means "not recorded".
    driver: str = ""
    version: str = ""
    git_sha: str = ""
    source: str = ""
    # How the harness was launched; defaults to the clawbench entry point name.
    invocation: str = "clawbench"
|
||||
|
||||
|
||||
class ToolProfile(BaseModel):
    """The tools/interfaces exposed to a harness run."""

    name: str
    mode: str = "native"
    interfaces: list[str] = Field(default_factory=list)
    adapter_capabilities: list[str] = Field(default_factory=list)
    enabled_toolsets: list[str] = Field(default_factory=list)
    disabled_toolsets: list[str] = Field(default_factory=list)
    tools: list[str] = Field(default_factory=list)
    # Filled in by `with_fingerprint`; never part of the hashed payload.
    fingerprint: str = ""

    def with_fingerprint(self) -> "ToolProfile":
        """Return a copy carrying a 16-hex-char digest of the profile.

        The digest is the SHA-256 of a compact, key-sorted JSON document
        holding `name`, `mode` and every list field in sorted order, so
        the order in which list entries were supplied does not matter.
        """
        canonical: dict[str, Any] = {"name": self.name, "mode": self.mode}
        for list_field in (
            "interfaces",
            "adapter_capabilities",
            "enabled_toolsets",
            "disabled_toolsets",
            "tools",
        ):
            canonical[list_field] = sorted(getattr(self, list_field))
        encoded = json.dumps(canonical, sort_keys=True, separators=(",", ":")).encode("utf-8")
        short_digest = hashlib.sha256(encoded).hexdigest()[:16]
        return self.model_copy(update={"fingerprint": short_digest})
|
||||
|
||||
|
||||
class AblationProfile(BaseModel):
    """Run-level axis metadata embedded in BenchmarkResult.environment."""

    model: str
    harness: HarnessDescriptor
    tool_profile: ToolProfile
    prompt_profile: str = "clear"
    # Filled in by `with_fingerprint`; never part of the hashed payload.
    fingerprint: str = ""

    def with_fingerprint(self) -> "AblationProfile":
        """Return a copy with both the tool and the run fingerprint set.

        The run fingerprint is the first 16 hex characters of the SHA-256
        of a compact, key-sorted JSON document covering the model, the
        harness descriptor, the fingerprinted tool profile and the prompt
        profile.

        Returns:
            A new `AblationProfile`; `self` is left unmodified.
        """
        tool_profile = self.tool_profile.with_fingerprint()
        # The tool fingerprint ignores the order of list entries, so the
        # run fingerprint must too: sort every list before hashing,
        # otherwise two runs with the same toolsets listed in a different
        # order would get different run fingerprints.
        tool_payload = {
            key: sorted(value) if isinstance(value, list) else value
            for key, value in tool_profile.model_dump().items()
        }
        payload = {
            "model": self.model,
            "harness": self.harness.model_dump(),
            "tool_profile": tool_payload,
            "prompt_profile": self.prompt_profile,
        }
        digest = hashlib.sha256(
            json.dumps(payload, sort_keys=True, separators=(",", ":")).encode("utf-8")
        ).hexdigest()
        return self.model_copy(
            update={
                "tool_profile": tool_profile,
                "fingerprint": digest[:16],
            }
        )
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class FairTaskSet:
    """Immutable result of intersecting task compatibility across adapters."""

    # Ids of the tasks kept for the comparison.
    task_ids: list[str]
    # Task id -> human-readable reasons it was skipped; empty by default.
    skipped: dict[str, list[str]] = field(default_factory=dict)
|
||||
|
||||
|
||||
def capabilities_to_interfaces(capabilities: Iterable[AdapterCapability | str]) -> list[str]:
    """Translate capabilities into a sorted, de-duplicated list of labels.

    Entries that are not already `AdapterCapability` members are coerced
    via `AdapterCapability(str(entry))`. Capabilities without an entry in
    `CAPABILITY_TO_INTERFACE` contribute their raw enum value.
    """
    labels: set[str] = set()
    for entry in capabilities:
        if isinstance(entry, AdapterCapability):
            capability = entry
        else:
            capability = AdapterCapability(str(entry))
        labels.add(CAPABILITY_TO_INTERFACE.get(capability, capability.value))
    return sorted(labels)
|
||||
|
||||
|
||||
def adapter_capabilities(
    adapter: str,
    config: AdapterConfig | None = None,
) -> set[AdapterCapability]:
    """Return the capabilities the named adapter supports under `config`.

    Looks the adapter class up with `get_adapter` and delegates to its
    `supported_capabilities` classmethod.
    """
    return get_adapter(adapter).supported_capabilities(config)
|
||||
|
||||
|
||||
def default_tool_profile(
    *,
    adapter: str,
    config: AdapterConfig | None = None,
    name: str | None = None,
    mode: str = "native",
    enabled_toolsets: list[str] | None = None,
    disabled_toolsets: list[str] | None = None,
) -> ToolProfile:
    """Build a fingerprinted `ToolProfile` from an adapter's capabilities.

    When `name` is empty or omitted the profile is named
    ``"<adapter>-<mode>"``. Missing toolset lists become empty lists.
    """
    supported = adapter_capabilities(adapter, config)
    profile_name = name if name else f"{adapter}-{mode}"
    return ToolProfile(
        name=profile_name,
        mode=mode,
        interfaces=capabilities_to_interfaces(supported),
        adapter_capabilities=sorted(capability.value for capability in supported),
        enabled_toolsets=enabled_toolsets if enabled_toolsets else [],
        disabled_toolsets=disabled_toolsets if disabled_toolsets else [],
    ).with_fingerprint()
|
||||
|
||||
|
||||
def compatible_task_ids(
    tasks: Iterable[TaskDefinition],
    *,
    adapter: str,
    config: AdapterConfig | None = None,
) -> tuple[list[str], dict[str, list[str]]]:
    """Split tasks into those the adapter can run and those it cannot.

    Returns:
        `(runnable_ids, skipped)`, where `skipped` maps a task id to the
        sorted values of the capabilities the adapter lacks for it.
    """
    supported = adapter_capabilities(adapter, config)
    runnable: list[str] = []
    skipped: dict[str, list[str]] = {}
    for task in tasks:
        required = set(from_task_definition(task).required_adapter_capabilities)
        unmet = required - supported
        if not unmet:
            runnable.append(task.id)
            continue
        skipped[task.id] = sorted(capability.value for capability in unmet)
    return runnable, skipped
|
||||
|
||||
|
||||
def common_compatible_task_set(
    tasks: Iterable[TaskDefinition],
    adapter_configs: dict[str, tuple[str, AdapterConfig | None]],
) -> FairTaskSet:
    """Intersect task compatibility across several labelled adapters.

    Args:
        tasks: Candidate tasks; materialised once so they can be scanned
            for every adapter.
        adapter_configs: Label -> `(adapter name, config)` pairs.

    Returns:
        A `FairTaskSet` whose `task_ids` keep the input order and whose
        `skipped` entries read ``"<label>: <missing capabilities>"``.
    """
    task_list = list(tasks)
    shared: set[str] | None = None
    skipped: dict[str, list[str]] = {}
    for label, (adapter, config) in adapter_configs.items():
        runnable, unmet = compatible_task_ids(task_list, adapter=adapter, config=config)
        shared = set(runnable) if shared is None else shared.intersection(runnable)
        for task_id, caps in unmet.items():
            skipped.setdefault(task_id, []).append(f"{label}: {', '.join(caps)}")
    # No adapters at all (shared is None) yields an empty task list.
    keep = shared if shared else set()
    ordered = [task.id for task in task_list if task.id in keep]
    return FairTaskSet(task_ids=ordered, skipped=skipped)
|
||||
|
||||
|
||||
def build_ablation_profile(
    *,
    model: str,
    adapter: str,
    config: AdapterConfig | None = None,
    prompt_profile: str = "clear",
    harness_version: str = "",
    harness_git_sha: str = "",
    harness_source: str = "",
    driver: str = "",
    tool_profile_name: str | None = None,
    enabled_toolsets: list[str] | None = None,
    disabled_toolsets: list[str] | None = None,
) -> AblationProfile:
    """Assemble a fingerprinted `AblationProfile` for one run.

    Combines a `HarnessDescriptor` built from the harness arguments with
    the adapter's default tool profile, then fingerprints the result.
    """
    descriptor = HarnessDescriptor(
        adapter=adapter,
        driver=driver,
        version=harness_version,
        git_sha=harness_git_sha,
        source=harness_source,
    )
    tools = default_tool_profile(
        adapter=adapter,
        config=config,
        name=tool_profile_name,
        enabled_toolsets=enabled_toolsets,
        disabled_toolsets=disabled_toolsets,
    )
    profile = AblationProfile(
        model=model,
        harness=descriptor,
        tool_profile=tools,
        prompt_profile=prompt_profile,
    )
    return profile.with_fingerprint()
|
||||
|
||||
|
||||
def compare_results(results: dict[str, BenchmarkResult]) -> dict[str, Any]:
    """Return score deltas plus fairness checks for result JSONs.

    Args:
        results: Label -> benchmark result. The first label is the
            baseline for the score deltas.

    Returns:
        A dict with:

        - ``fair`` / ``task_verifier_fair``: same task set, task snapshot,
          prompt variant and benchmark release across all results.
        - ``controlled_ablation``: the above and a single shared model.
        - ``same_*`` flags for each individual check.
        - ``models``, ``task_sets``, ``rows``: per-label details.
        - ``deltas``: ``"<label>_minus_<baseline>"`` -> score difference
          rounded to four decimals.
    """
    labels = list(results)
    models = {label: result.model for label, result in results.items()}
    task_sets = {
        label: [task.task_id for task in result.task_results]
        for label, result in results.items()
    }
    # Compare sorted id lists: the order tasks ran in says nothing about
    # which tasks were scored, but duplicated ids must still line up.
    normalized_sets = [sorted(task_ids) for task_ids in task_sets.values()]
    same_task_set = all(task_ids == normalized_sets[0] for task_ids in normalized_sets)
    same_model = len(set(models.values())) == 1

    # For the three checks below, empty values are ignored: a result that
    # does not record the field is not treated as a mismatch.
    snapshot_fingerprints = {
        result.task_snapshot_fingerprint
        for result in results.values()
        if result.task_snapshot_fingerprint
    }
    same_task_snapshot = len(snapshot_fingerprints) <= 1
    prompt_variants = {
        str(result.environment.get("prompt_variant", ""))
        for result in results.values()
        if result.environment.get("prompt_variant", "")
    }
    same_prompt_variant = len(prompt_variants) <= 1
    benchmark_releases = {
        result.benchmark_release_id
        for result in results.values()
        if result.benchmark_release_id
    }
    same_benchmark_release = len(benchmark_releases) <= 1
    task_verifier_fair = (
        same_task_set
        and same_task_snapshot
        and same_prompt_variant
        and same_benchmark_release
    )

    rows: dict[str, Any] = {}
    for label, result in results.items():
        rows[label] = {
            "model": result.model,
            "adapter": result.environment.get("adapter", ""),
            "score": result.overall_score,
            "completion": result.overall_completion,
            "trajectory": result.overall_trajectory,
            "behavior": result.overall_behavior,
            "reliability": result.overall_reliability,
            "task_count": len(result.task_results),
            "task_snapshot_fingerprint": result.task_snapshot_fingerprint,
            "benchmark_release_id": result.benchmark_release_id,
            "prompt_variant": result.environment.get("prompt_variant", ""),
            "dimension_coverage": result.environment.get("dimension_coverage", {}),
            "ablation": result.environment.get("ablation_profile", {}),
        }

    deltas: dict[str, float] = {}
    if labels:
        baseline_label = labels[0]
        baseline = results[baseline_label].overall_score
        for label in labels[1:]:
            deltas[f"{label}_minus_{baseline_label}"] = round(
                results[label].overall_score - baseline,
                4,
            )

    return {
        "fair": bool(task_verifier_fair),
        "task_verifier_fair": bool(task_verifier_fair),
        "controlled_ablation": bool(task_verifier_fair and same_model),
        "same_model": same_model,
        "same_task_set": same_task_set,
        "same_task_snapshot": same_task_snapshot,
        "same_prompt_variant": same_prompt_variant,
        "same_benchmark_release": same_benchmark_release,
        "models": models,
        "task_sets": task_sets,
        "rows": rows,
        "deltas": deltas,
    }
|
||||
|
||||
|
||||
def git_head(path: Path) -> tuple[str, str]:
    """Best-effort `(sha, describe)` for harness provenance.

    Runs ``git rev-parse HEAD`` and then
    ``git describe --tags --always --dirty`` against `path`. Any failure
    of either command yields ``("", "")`` instead of raising.
    """

    def _git(*args: str) -> str:
        # stderr is discarded so a non-repository path stays silent.
        return subprocess.check_output(
            ["git", "-C", str(path), *args],
            text=True,
            stderr=subprocess.DEVNULL,
        ).strip()

    try:
        commit = _git("rev-parse", "HEAD")
        described = _git("describe", "--tags", "--always", "--dirty")
    except Exception:
        return "", ""
    return commit, described
|
||||
102
clawbench/adapters/__init__.py
Normal file
102
clawbench/adapters/__init__.py
Normal file
@ -0,0 +1,102 @@
|
||||
"""Agent adapter layer — Phase-4 of CLAWBENCH_V0_4_SPEC.md.
|
||||
|
||||
Adapters plug an agent framework (OpenClaw, Hermes, Codex, Claude Code,
|
||||
Deerflow, …) into ClawBench's canonical task pipeline. Each adapter is
|
||||
responsible for:
|
||||
|
||||
- Setting up the workspace + seed state from a `CanonicalTask`.
|
||||
- Driving the agent through each `CanonicalPhase`'s simulated user.
|
||||
- Returning a canonical `Transcript` so the scorer, trajectory analyser,
|
||||
and judge can score the run unchanged.
|
||||
- Resolving `StateQuery` assertions that fall under its declared
|
||||
capabilities; returning `capability_missing=True` for queries that
|
||||
require a capability the adapter doesn't provide.
|
||||
|
||||
The `ADAPTERS` registry is populated by each adapter module at import
|
||||
time. `get_adapter(name)` is the canonical lookup.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from clawbench.adapters.base import (
|
||||
AdapterConfig,
|
||||
AdapterContext,
|
||||
AgentAdapter,
|
||||
PhaseResult,
|
||||
StateQueryResult,
|
||||
)
|
||||
|
||||
#: Maps each registered adapter name to its adapter class. Adapter modules
#: fill it in when they are imported (e.g. importing
#: `clawbench.adapters.openclaw` registers the OpenClaw adapter). Prefer
#: `get_adapter` over reading this dict directly.
ADAPTERS: dict[str, type[AgentAdapter]] = {}


def register_adapter(cls: type[AgentAdapter]) -> type[AgentAdapter]:
    """Register an adapter class under its `name`; usable as a decorator.

    Typical use:

    ```
    @register_adapter
    class HermesAdapter(AgentAdapter):
        name = "hermes"
        ...
    ```

    Registering the same class twice is a no-op.

    Raises:
        ValueError: if `cls` has no non-empty `name`, or a different
            class already owns that name.
    """
    adapter_name = getattr(cls, "name", "")
    if not adapter_name:
        raise ValueError(f"{cls.__name__} must set a non-empty `name` class attribute")
    # setdefault stores `cls` only when the name is free, so a collision
    # leaves the existing registration untouched.
    owner = ADAPTERS.setdefault(adapter_name, cls)
    if owner is not cls:
        raise ValueError(
            f"Adapter name collision: '{adapter_name}' is already registered "
            f"to {owner.__qualname__}"
        )
    return cls
|
||||
|
||||
|
||||
def get_adapter(name: str) -> type[AgentAdapter]:
    """Return the adapter class registered under `name`.

    The adapter's module must have been imported first so that its
    registration has run. `clawbench.adapters.openclaw` always loads;
    optional adapters (hermes, codex) guard their imports and raise a
    clear error if their runtime dep isn't installed.

    Raises:
        KeyError: if no adapter is registered under `name`; the message
            lists the names that are registered.
    """
    try:
        adapter_cls = ADAPTERS[name]
    except KeyError as missing:
        known = ", ".join(sorted(ADAPTERS)) or "(none)"
        raise KeyError(
            f"Unknown adapter '{name}'. Registered adapters: {known}"
        ) from missing
    return adapter_cls
|
||||
|
||||
|
||||
__all__ = [
|
||||
"ADAPTERS",
|
||||
"AdapterConfig",
|
||||
"AdapterContext",
|
||||
"AgentAdapter",
|
||||
"PhaseResult",
|
||||
"StateQueryResult",
|
||||
"get_adapter",
|
||||
"register_adapter",
|
||||
]
|
||||
|
||||
|
||||
# Register built-in adapters at import time. Each adapter module is
|
||||
# expected to @register_adapter its class. OpenClaw is always
|
||||
# available; optional adapters (hermes, codex) guard their imports and
|
||||
# are registered only when their runtime dep is present.
|
||||
from clawbench.adapters import openclaw as _openclaw # noqa: E402,F401
|
||||
|
||||
try:
|
||||
from clawbench.adapters import hermes as _hermes # noqa: E402,F401
|
||||
except Exception:
|
||||
# hermes-agent is an optional extra; absence is fine.
|
||||
_hermes = None # type: ignore[assignment]
|
||||
234
clawbench/adapters/base.py
Normal file
234
clawbench/adapters/base.py
Normal file
@ -0,0 +1,234 @@
|
||||
"""Agent adapter ABC and associated data shapes.
|
||||
|
||||
An `AgentAdapter` is the execution counterpart to a `CanonicalTask`. It
|
||||
is the only place where framework-specific details (OpenClaw gateway
|
||||
RPCs, Hermes `MiniSWERunner`, Claude Code SDK, etc.) live. Everything
|
||||
downstream of the adapter — trajectory analysis, scorer, judge, stats —
|
||||
consumes a canonical `Transcript` and `TaskRunResult` produced by the
|
||||
adapter, so those modules stay unchanged across adapters.
|
||||
|
||||
Lifecycle per task run:
|
||||
|
||||
1. Harness instantiates `adapter = AdapterClass(config)`.
|
||||
2. `async with adapter as adapter:` — starts subprocesses / websockets
|
||||
/ whatever this adapter needs to hold open across a run.
|
||||
3. `await adapter.setup(ctx)` — realizes seed state, workspace files,
|
||||
background services, pre-run state queries.
|
||||
4. For each `CanonicalPhase`: `await adapter.run_phase(phase, ctx)` —
|
||||
drives the simulated user against the agent, returns a
|
||||
`PhaseResult` with the transcript increment.
|
||||
5. For each `StateQuery` in `task.verifier.state_queries`:
|
||||
`await adapter.verify_state_query(query, ctx)` — returns whether
|
||||
the assertion held, or that the adapter lacks the capability.
|
||||
6. `await adapter.teardown(ctx)` — cleans up agent-side state (the
|
||||
workspace itself is harness-owned).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, ClassVar
|
||||
|
||||
from clawbench.canonical import (
|
||||
AdapterCapability,
|
||||
CanonicalPhase,
|
||||
CanonicalTask,
|
||||
StateQuery,
|
||||
)
|
||||
from clawbench.schemas import Transcript, TranscriptMessage
|
||||
|
||||
|
||||
@dataclass
class AdapterConfig:
    """Configuration shared by every adapter.

    Concrete adapters subclass this to add their own fields. The harness
    builds an instance from CLI flags / env vars and hands it to the
    adapter constructor.
    """

    #: Primary model identifier. What the string means is up to the
    #: adapter (an OpenClaw model id, a Hermes `--model` string, etc.).
    model: str = ""
|
||||
|
||||
|
||||
@dataclass
class AdapterContext:
    """Context for a single task run, passed to every adapter method.

    `transcript` is shared and mutated in place: each `run_phase` call
    appends the messages it observed, so the scorer ends up with one
    consolidated `Transcript`.
    """

    task: CanonicalTask
    workspace: Path
    runtime_values: dict[str, Any]
    run_index: int
    model: str
    transcript: Transcript
    #: Scratch space owned by the adapter (e.g. the OpenClaw
    #: `session_key` and `agent_id`; the Hermes `MiniSWERunner`
    #: instance). The harness never reads it, so the adapter may use the
    #: dict as its own in-context cache.
    adapter_state: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
class PhaseResult:
    """What a single phase added to the transcript, plus run metadata."""

    messages: list[TranscriptMessage] = field(default_factory=list)
    #: Adapter-specific metadata for the phase (token counts reported by
    #: the adapter, session identifiers, etc.). Merged into
    #: `TaskRunResult` under the `efficiency_result` / adapter metadata
    #: fields where applicable.
    adapter_metadata: dict[str, Any] = field(default_factory=dict)
    #: True when the adapter saw the agent finish on its own (e.g.
    #: Hermes's `completed=True`). This is not a pass/fail signal: it
    #: only separates a trajectory that ran out of work from one that
    #: was cut short. The scorer uses it in `delivery_outcome`
    #: classification.
    completed_normally: bool = True
    #: Error message for the harness to surface when the phase aborted
    #: because of the adapter itself rather than the agent.
    error: str | None = None
|
||||
|
||||
|
||||
@dataclass
class StateQueryResult:
    """Outcome of resolving one `StateQuery` against the adapter's state.

    `capability_missing=True` signals that the adapter cannot evaluate
    this kind of query. The scorer treats that as neutral (neither pass
    nor fail) and records a skip note in the `CompletionResult`; under
    `--strict-compat` the harness will have filtered the task out before
    the adapter ever saw it.
    """

    ok: bool
    detail: str = ""
    capability_missing: bool = False
|
||||
|
||||
|
||||
class AgentAdapter(ABC):
    """Base class every agent adapter derives from.

    A subclass is expected to:

    - set a unique `name: ClassVar[str]`;
    - set `capabilities: ClassVar[set[AdapterCapability]]` to declare
      which state-query kinds it can resolve;
    - implement `setup`, `run_phase`, `verify_state_query`, `teardown`;
    - optionally override `__aenter__` / `__aexit__` to manage
      long-lived resources (a persistent websocket, a subprocess pool).
    """

    name: ClassVar[str] = ""
    capabilities: ClassVar[set[AdapterCapability]] = set()

    def __init__(self, config: AdapterConfig | None = None) -> None:
        # A falsy/missing config falls back to a default AdapterConfig.
        self.config: AdapterConfig = config or AdapterConfig()

    # ------------------------------------------------------------------
    # Optional long-lived resource management (no-ops by default).
    # ------------------------------------------------------------------

    async def __aenter__(self) -> "AgentAdapter":
        return self

    async def __aexit__(self, exc_type: object, exc: object, tb: object) -> None:
        return None

    # ------------------------------------------------------------------
    # Required per-run lifecycle.
    # ------------------------------------------------------------------

    @abstractmethod
    async def setup(self, ctx: AdapterContext) -> None:
        """Prepare seed state and the agent session before any phase runs.

        By the time this is called the harness has created the workspace
        dir and expanded `CanonicalAssets.workspace_files` into it. The
        adapter still has to:

        - apply `seed_state` entries through a mechanism that suits it
          (OpenClaw → memory RPCs; Hermes → file writes);
        - start the agent's process/session so `run_phase` can send
          turns immediately.
        """

    @abstractmethod
    async def run_phase(
        self,
        phase: CanonicalPhase,
        ctx: AdapterContext,
    ) -> PhaseResult:
        """Run a single `CanonicalPhase` through to completion.

        The simulated user in `phase.user` decides what is sent and
        when. The adapter delivers those turns, observes the agent's
        responses, and appends canonical `TranscriptMessage` entries to
        `ctx.transcript`.
        """

    @abstractmethod
    async def verify_state_query(
        self,
        query: StateQuery,
        ctx: AdapterContext,
    ) -> StateQueryResult:
        """Evaluate one `StateQuery` against the agent's post-run state.

        When `capabilities` does not cover `query.required_capability`,
        return `StateQueryResult(ok=False, capability_missing=True)`.
        """

    @abstractmethod
    async def teardown(self, ctx: AdapterContext) -> None:
        """Release agent-side state created during `setup`/`run_phase`.

        The workspace lifecycle belongs to the harness; sessions,
        subprocesses, and any in-memory caches belong to the adapter.
        """

    # ------------------------------------------------------------------
    # Convenience helpers available to every adapter.
    # ------------------------------------------------------------------

    @classmethod
    def supported_capabilities(
        cls,
        config: AdapterConfig | None = None,
    ) -> set[AdapterCapability]:
        """Return the capabilities available under a concrete config.

        The default ignores `config` and returns a fresh copy of the
        class-level `capabilities`. Adapters with multiple driver modes,
        such as Hermes MiniSWE vs full AIAgent, override this to keep
        task gating honest.
        """
        return {*cls.capabilities}

    @classmethod
    def missing_capabilities_for(
        cls,
        task: CanonicalTask,
        config: AdapterConfig | None = None,
    ) -> set[AdapterCapability]:
        """Return the required capabilities this adapter cannot cover.

        An empty set means the task is fully runnable under this adapter.
        """
        required = set(task.required_adapter_capabilities)
        return required.difference(cls.supported_capabilities(config))

    @classmethod
    def supports(
        cls,
        task: CanonicalTask,
        config: AdapterConfig | None = None,
    ) -> bool:
        """True iff this adapter can cover every capability the task needs."""
        unmet = cls.missing_capabilities_for(task, config)
        return len(unmet) == 0
|
||||
704
clawbench/adapters/hermes.py
Normal file
704
clawbench/adapters/hermes.py
Normal file
@ -0,0 +1,704 @@
|
||||
"""Hermes adapter — drives Nous Research `hermes-agent`.
|
||||
|
||||
Hermes (https://github.com/NousResearch/hermes-agent) is a Python agent
|
||||
framework with `MiniSWERunner` as its clean programmatic entry point.
|
||||
This adapter:
|
||||
|
||||
1. Realizes the canonical workspace + seed state (seed_state entries
|
||||
with `kind="memory"` become files, since Hermes has no memory RPC).
|
||||
2. Constructs a `MiniSWERunner` scoped to the workspace.
|
||||
3. For each canonical phase, renders the user turn and calls
|
||||
`runner.run_task(prompt)` in a worker thread, with the phase's
|
||||
timeout enforced as a wall clock.
|
||||
4. Parses the returned `conversations` via
|
||||
`clawbench.adapters.hermes_xml.parse_conversation` into a canonical
|
||||
`Transcript` the scorer can consume unchanged.
|
||||
5. For state queries the adapter can't resolve (session, cron, custom
|
||||
gateway RPC), returns `capability_missing=True` so the harness
|
||||
reports a clean skip. Memory queries fall back to workspace file
|
||||
scanning via `environment_files.verify_memory_fallback`.
|
||||
|
||||
`hermes-agent` is an **optional** dependency (`clawbench[hermes]`). The
|
||||
import is guarded so the base install stays lean; calling this adapter
|
||||
without the dep installed raises a clear error rather than a cryptic
|
||||
`ImportError`.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import importlib.util
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from clawbench.adapters import register_adapter
|
||||
from clawbench.adapters.base import (
|
||||
AdapterConfig,
|
||||
AdapterContext,
|
||||
AgentAdapter,
|
||||
PhaseResult,
|
||||
StateQueryResult,
|
||||
)
|
||||
from clawbench.adapters.hermes_xml import parse_chat_messages, parse_conversation
|
||||
from clawbench.canonical import (
|
||||
AdapterCapability,
|
||||
CanonicalPhase,
|
||||
StateQuery,
|
||||
)
|
||||
from clawbench.environment_files import verify_memory_fallback
|
||||
from clawbench.render import render_template
|
||||
from clawbench.schemas import MemoryState, PromptVariant
|
||||
from clawbench.simulated_user import UserSimulator
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Optional dependency import — guarded so the base install stays lean.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _load_mini_swe_runner() -> tuple[Any, Exception | None]:
    """Locate Hermes's `MiniSWERunner` class without a hard dependency.

    Tries ``from mini_swe_runner import MiniSWERunner`` first. If that
    fails, these files are tried in order and the first that loads wins:

    1. the file named by ``HERMES_MINI_SWE_RUNNER``;
    2. ``mini_swe_runner.py`` inside ``HERMES_AGENT_REPO`` and then
       ``HERMES_INSTALL_DIR``;
    3. ``<HERMES_HOME>/hermes-agent/mini_swe_runner.py``, where
       ``HERMES_HOME`` defaults to ``~/.hermes``.

    Returns:
        ``(runner_class, None)`` on success, otherwise ``(None, error)``
        where `error` is the most recent failure seen.
    """
    try:  # pragma: no cover - import-guard branch
        from mini_swe_runner import MiniSWERunner as runner_cls  # type: ignore[import-not-found]

        return runner_cls, None
    except Exception as import_exc:  # pragma: no cover - import-guard branch
        # Rebind: the `as` name is unbound once the handler exits.
        last_error: Exception = import_exc

    candidates: list[Path] = []
    explicit_file = os.environ.get("HERMES_MINI_SWE_RUNNER")
    if explicit_file:
        candidates.append(Path(explicit_file).expanduser())
    for env_name in ("HERMES_AGENT_REPO", "HERMES_INSTALL_DIR"):
        value = os.environ.get(env_name)
        if value:
            candidates.append(Path(value).expanduser() / "mini_swe_runner.py")
    hermes_home = Path(os.environ.get("HERMES_HOME", "~/.hermes")).expanduser()
    candidates.append(hermes_home / "hermes-agent" / "mini_swe_runner.py")

    for path in candidates:
        if not path.is_file():
            continue
        try:
            # Put the file's directory on sys.path so the runner can
            # import its sibling modules.
            repo_root = str(path.parent)
            if repo_root not in sys.path:
                sys.path.insert(0, repo_root)
            spec = importlib.util.spec_from_file_location(
                "_clawbench_hermes_mini_swe_runner",
                path,
            )
            if spec is None or spec.loader is None:
                continue
            module = importlib.util.module_from_spec(spec)
            sys.modules[spec.name] = module
            try:
                spec.loader.exec_module(module)
                return module.MiniSWERunner, None
            except BaseException:
                # Do not leave a half-initialised module registered when
                # execution (or the attribute lookup) fails.
                sys.modules.pop(spec.name, None)
                raise
        except Exception as path_exc:
            last_error = path_exc
    return None, last_error


MiniSWERunner, _HERMES_IMPORT_ERROR = _load_mini_swe_runner()
|
||||
|
||||
|
||||
def _load_ai_agent() -> tuple[Any, Exception | None]:
    """Locate Hermes's `AIAgent` class without a hard dependency.

    Tries ``from run_agent import AIAgent`` first. If that fails,
    ``run_agent.py`` is looked up in ``HERMES_AGENT_REPO``, then
    ``HERMES_INSTALL_DIR``, then ``<HERMES_HOME>/hermes-agent`` (where
    ``HERMES_HOME`` defaults to ``~/.hermes``); the first file that
    loads wins.

    Returns:
        ``(agent_class, None)`` on success, otherwise ``(None, error)``
        where `error` is the most recent failure seen.
    """
    try:  # pragma: no cover - import-guard branch
        from run_agent import AIAgent as agent_cls  # type: ignore[import-not-found]

        return agent_cls, None
    except Exception as import_exc:  # pragma: no cover - import-guard branch
        # Rebind: the `as` name is unbound once the handler exits.
        last_error: Exception = import_exc

    candidates: list[Path] = []
    for env_name in ("HERMES_AGENT_REPO", "HERMES_INSTALL_DIR"):
        value = os.environ.get(env_name)
        if value:
            candidates.append(Path(value).expanduser() / "run_agent.py")
    hermes_home = Path(os.environ.get("HERMES_HOME", "~/.hermes")).expanduser()
    candidates.append(hermes_home / "hermes-agent" / "run_agent.py")

    for path in candidates:
        if not path.is_file():
            continue
        try:
            # Put the file's directory on sys.path so the agent module
            # can import its sibling modules.
            repo_root = str(path.parent)
            if repo_root not in sys.path:
                sys.path.insert(0, repo_root)
            spec = importlib.util.spec_from_file_location(
                "_clawbench_hermes_run_agent",
                path,
            )
            if spec is None or spec.loader is None:
                continue
            module = importlib.util.module_from_spec(spec)
            sys.modules[spec.name] = module
            try:
                spec.loader.exec_module(module)
                return module.AIAgent, None
            except BaseException:
                # Do not leave a half-initialised module registered when
                # execution (or the attribute lookup) fails.
                sys.modules.pop(spec.name, None)
                raise
        except Exception as path_exc:
            last_error = path_exc
    return None, last_error


AIAgent, _HERMES_AGENT_IMPORT_ERROR = _load_ai_agent()
|
||||
|
||||
|
||||
class _CodexToolMessageCompatClient:
    """Wraps a client so Hermes's Codex Responses shim accepts tool results.

    Hermes's MiniSWERunner currently feeds OpenAI chat-style
    `role="tool"` messages back into `chat.completions.create()`.
    Hermes's Codex Responses adapter accepts chat-shaped calls but
    forwards those tool messages to Responses as plain input items,
    where Codex rejects the unsupported role. Rewriting tool results as
    user-visible text preserves the observation for the next turn and
    keeps the runner moving.
    """

    def __init__(self, inner: Any) -> None:
        self._inner = inner
        # Only the chat surface is wrapped.
        self.chat = _CodexToolMessageCompatChat(inner.chat)
        # Mirror the credentials when the wrapped client exposes them.
        self.api_key = getattr(inner, "api_key", None)
        self.base_url = getattr(inner, "base_url", None)

    def close(self) -> None:
        """Close the wrapped client if it has a callable `close`."""
        closer = getattr(self._inner, "close", None)
        if not callable(closer):
            return
        closer()
|
||||
|
||||
|
||||
class _CodexToolMessageCompatChat:
    """Stand-in for `client.chat` that exposes wrapped `completions`."""

    def __init__(self, inner_chat: Any) -> None:
        wrapped = _CodexToolMessageCompatCompletions(inner_chat.completions)
        self.completions = wrapped
|
||||
|
||||
|
||||
class _CodexToolMessageCompatCompletions:
    """Stand-in for `chat.completions` that rewrites tool messages."""

    def __init__(self, inner_completions: Any) -> None:
        self._inner = inner_completions

    def create(self, **kwargs: Any) -> Any:
        """Forward to the wrapped `create`, rewriting a `messages` list.

        Each entry of a list-valued `messages` goes through
        `_rewrite_codex_tool_message`. Any other `messages` value, or
        none at all, is passed through untouched.
        """
        original = kwargs.get("messages")
        if not isinstance(original, list):
            return self._inner.create(**kwargs)
        forwarded = dict(kwargs)
        forwarded["messages"] = [_rewrite_codex_tool_message(item) for item in original]
        return self._inner.create(**forwarded)
|
||||
|
||||
|
||||
def _rewrite_codex_tool_message(message: Any) -> Any:
    """Convert a chat `role="tool"` message into a plain user message.

    Args:
        message: One chat message. Anything that is not a dict with
            ``role == "tool"`` is returned unchanged.

    Returns:
        For tool messages, ``{"role": "user", "content": ...}`` whose
        content is ``"Tool result (<id>):\\n<content>"``. `<id>` is the
        message's `tool_call_id`, else its `name`, else ``"tool"``.
    """
    if not isinstance(message, dict) or message.get("role") != "tool":
        return message
    content = message.get("content", "")
    if content is None:
        # A missing result is an empty observation, not the text "None".
        content = ""
    elif not isinstance(content, str):
        content = str(content)
    tool_call_id = message.get("tool_call_id") or message.get("name") or "tool"
    return {
        "role": "user",
        "content": f"Tool result ({tool_call_id}):\n{content}",
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Config
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
class HermesAdapterConfig(AdapterConfig):
    """Settings for the Hermes adapter.

    Fields map onto `MiniSWERunner` kwargs. ClawBench passes the
    canonical model string through verbatim, so users pick a
    Hermes-supported model via the existing `--model` flag.
    """

    env_type: str = "local"
    max_iterations: int = 15
    timeout_seconds: int = 60
    base_url: str | None = None
    api_key: str | None = None
    provider: str | None = None
    api_mode: str | None = None
    prompt_variant: str = PromptVariant.CLEAR.value
    # "mini_swe" by default; "ai_agent" widens the capabilities reported
    # by `HermesAdapter.supported_capabilities`.
    driver_mode: str = "mini_swe"
    enabled_toolsets: list[str] | None = None
    disabled_toolsets: list[str] | None = None
    hermes_home: str | None = None
    tool_delay_seconds: float = 0.0
    # Optional explicit `MiniSWERunner` factory. Tests plug in a stub
    # here; production code leaves it None and the adapter instantiates
    # the real runner lazily.
    runner_factory: Any = None
    # NOTE(review): presumably the equivalent factory hook for the
    # `ai_agent` driver — confirm against the adapter's setup code.
    agent_factory: Any = None
|
||||
|
||||
|
||||
@register_adapter
|
||||
class HermesAdapter(AgentAdapter):
|
||||
"""Adapter for the Nous Research hermes-agent."""
|
||||
|
||||
name = "hermes"
|
||||
capabilities = {
|
||||
AdapterCapability.FILES,
|
||||
AdapterCapability.EXECUTION,
|
||||
}
|
||||
|
||||
@classmethod
def supported_capabilities(cls, config: AdapterConfig | None = None) -> set[AdapterCapability]:
    """Report which capabilities the adapter offers for ``config``.

    A ``HermesAdapterConfig`` whose ``driver_mode`` is ``"ai_agent"`` yields
    a fixed six-capability set (files, execution, memory, cron, browser,
    multi-turn injection). Every other config, including ``None``, yields a
    fresh copy of the class-level ``capabilities``.
    """
    uses_full_agent = (
        isinstance(config, HermesAdapterConfig)
        and config.driver_mode == "ai_agent"
    )
    if not uses_full_agent:
        return set(cls.capabilities)
    return {
        AdapterCapability.FILES,
        AdapterCapability.EXECUTION,
        AdapterCapability.MEMORY,
        AdapterCapability.CRON,
        AdapterCapability.BROWSER,
        AdapterCapability.MULTI_TURN_INJECTION,
    }
|
||||
|
||||
def __init__(self, config: HermesAdapterConfig | None = None) -> None:
    """Initialise the adapter, defaulting to a fresh ``HermesAdapterConfig``."""
    effective_config = config or HermesAdapterConfig()
    super().__init__(effective_config)
    # Typed alias of the base-class ``config`` attribute so Hermes-specific
    # fields can be read without casts.
    self._config: HermesAdapterConfig = self.config  # type: ignore[assignment]
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Lifecycle.
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def setup(self, ctx: AdapterContext) -> None:
|
||||
"""Realize memory seed state as files and build the runner.
|
||||
|
||||
Hermes-in-`env_type=local` operates directly on the workspace
|
||||
filesystem, so memory `SeedEntry` entries are written out as
|
||||
`memory/<key>.md` files. Callers that want a different mapping
|
||||
can pre-populate the workspace before invoking the adapter.
|
||||
"""
|
||||
|
||||
for seed in ctx.task.assets.seed_state:
|
||||
if seed.kind == "memory" and seed.key:
|
||||
target = ctx.workspace / "memory" / f"{seed.key}.md"
|
||||
target.parent.mkdir(parents=True, exist_ok=True)
|
||||
content = seed.content or ""
|
||||
if not isinstance(content, str):
|
||||
content = str(content)
|
||||
target.write_text(content, encoding="utf-8")
|
||||
|
||||
if self._config.driver_mode == "ai_agent":
|
||||
agent = self._build_ai_agent(ctx)
|
||||
ctx.adapter_state["agent"] = agent
|
||||
ctx.adapter_state["conversation_history"] = []
|
||||
ctx.adapter_state["hermes_home"] = self._hermes_home(ctx)
|
||||
else:
|
||||
runner = self._build_runner(ctx)
|
||||
ctx.adapter_state["runner"] = runner
|
||||
ctx.adapter_state.setdefault("api_calls", 0)
|
||||
|
||||
def _hermes_home(self, ctx: AdapterContext) -> Path:
|
||||
configured = self._config.hermes_home
|
||||
if configured:
|
||||
return Path(configured).expanduser()
|
||||
return ctx.workspace / ".hermes"
|
||||
|
||||
def _prepare_process_env(self, ctx: AdapterContext) -> None:
    """Point process-wide Hermes settings at this run's home and workspace.

    Creates the Hermes home directory, sets the ``HERMES_HOME`` and
    ``TERMINAL_CWD`` environment variables, defaults ``TERMINAL_ENV`` to
    ``"local"`` and, if a ``cron.jobs`` module is already present in
    ``sys.modules``, rebinds its path attributes to live under that home.
    """
    home = self._hermes_home(ctx)
    home.mkdir(parents=True, exist_ok=True)

    os.environ["HERMES_HOME"] = str(home)
    os.environ["TERMINAL_CWD"] = str(ctx.workspace)
    os.environ.setdefault("TERMINAL_ENV", "local")

    cron_module = sys.modules.get("cron.jobs")
    if cron_module is None:
        return

    # The module is only patched when something else has imported it; this
    # method never imports it itself.
    cron_root = home / "cron"
    overrides = {
        "HERMES_DIR": home,
        "CRON_DIR": cron_root,
        "JOBS_FILE": cron_root / "jobs.json",
        "OUTPUT_DIR": cron_root / "output",
    }
    for attr_name, attr_value in overrides.items():
        setattr(cron_module, attr_name, attr_value)
|
||||
|
||||
def _effective_model(self, ctx: AdapterContext) -> str:
|
||||
"""Translate ClawBench provider-prefixed slugs for direct providers."""
|
||||
|
||||
model = ctx.model
|
||||
if self._config.provider:
|
||||
return model
|
||||
base_url = self._config.base_url or ""
|
||||
try:
|
||||
host = urlparse(base_url).hostname or ""
|
||||
except Exception:
|
||||
host = ""
|
||||
if host == "api.openai.com" and model.startswith("openai/"):
|
||||
return model.split("/", 1)[1]
|
||||
return model
|
||||
|
||||
def _runtime_provider_hint(self) -> str | None:
|
||||
"""Return the provider identity Hermes should expose to its runtime.
|
||||
|
||||
Hermes distinguishes the transport used for the main model from the
|
||||
auxiliary routing metadata it exposes to side tasks. Direct
|
||||
OpenAI-compatible endpoints need to keep their explicit base URL and
|
||||
API key, but should still identify as ``custom`` so Hermes auxiliary
|
||||
calls resolve to the same primary model instead of falling through to
|
||||
auto-detected providers such as OpenRouter.
|
||||
"""
|
||||
|
||||
if self._config.provider:
|
||||
return self._config.provider
|
||||
if self._config.base_url:
|
||||
return "custom"
|
||||
return None
|
||||
|
||||
def _build_runner(self, ctx: AdapterContext) -> Any:
|
||||
explicit_api_key = None if self._config.provider else self._config.api_key
|
||||
explicit_base_url = None if self._config.provider else self._config.base_url
|
||||
effective_model = self._effective_model(ctx)
|
||||
ctx.adapter_state["effective_model"] = effective_model
|
||||
if self._config.runner_factory is not None:
|
||||
return self._config.runner_factory(
|
||||
model=effective_model,
|
||||
env_type=self._config.env_type,
|
||||
cwd=str(ctx.workspace),
|
||||
max_iterations=self._config.max_iterations,
|
||||
command_timeout=self._config.timeout_seconds,
|
||||
base_url=explicit_base_url,
|
||||
api_key=explicit_api_key,
|
||||
)
|
||||
if MiniSWERunner is None: # pragma: no cover - import-guard branch
|
||||
raise RuntimeError(
|
||||
"HermesAdapter requires Hermes Agent's `mini_swe_runner.py`. "
|
||||
"Install Hermes with the official installer, or set "
|
||||
"`HERMES_AGENT_REPO=/path/to/hermes-agent` / "
|
||||
"`HERMES_MINI_SWE_RUNNER=/path/to/mini_swe_runner.py`. "
|
||||
f"Underlying import error: {_HERMES_IMPORT_ERROR!r}"
|
||||
)
|
||||
runner = MiniSWERunner(
|
||||
model=effective_model,
|
||||
env_type=self._config.env_type,
|
||||
cwd=str(ctx.workspace),
|
||||
max_iterations=self._config.max_iterations,
|
||||
command_timeout=self._config.timeout_seconds,
|
||||
base_url=explicit_base_url,
|
||||
api_key=explicit_api_key,
|
||||
)
|
||||
if self._config.provider:
|
||||
try:
|
||||
from agent.auxiliary_client import resolve_provider_client
|
||||
except Exception as exc: # pragma: no cover - optional Hermes internals
|
||||
raise RuntimeError(
|
||||
f"Hermes provider routing requested for '{self._config.provider}', "
|
||||
"but Hermes provider utilities could not be imported."
|
||||
) from exc
|
||||
client, resolved_model = resolve_provider_client(
|
||||
self._config.provider,
|
||||
model=ctx.model,
|
||||
)
|
||||
if client is None or not resolved_model:
|
||||
raise RuntimeError(
|
||||
f"Hermes provider '{self._config.provider}' did not resolve credentials."
|
||||
)
|
||||
if self._config.provider == "openai-codex":
|
||||
client = _CodexToolMessageCompatClient(client)
|
||||
runner.client = client
|
||||
runner.model = str(resolved_model)
|
||||
return runner
|
||||
|
||||
def _build_ai_agent(self, ctx: AdapterContext) -> Any:
|
||||
self._prepare_process_env(ctx)
|
||||
explicit_api_key = None if self._config.provider else self._config.api_key
|
||||
explicit_base_url = None if self._config.provider else self._config.base_url
|
||||
enabled_toolsets = self._config.enabled_toolsets or ["hermes-api-server"]
|
||||
effective_model = self._effective_model(ctx)
|
||||
provider_hint = self._runtime_provider_hint()
|
||||
ctx.adapter_state["effective_model"] = effective_model
|
||||
if self._config.agent_factory is not None:
|
||||
return self._config.agent_factory(
|
||||
model=effective_model,
|
||||
base_url=explicit_base_url,
|
||||
api_key=explicit_api_key,
|
||||
provider=provider_hint,
|
||||
api_mode=self._config.api_mode,
|
||||
max_iterations=self._config.max_iterations,
|
||||
enabled_toolsets=enabled_toolsets,
|
||||
disabled_toolsets=self._config.disabled_toolsets,
|
||||
)
|
||||
if AIAgent is None: # pragma: no cover - import-guard branch
|
||||
raise RuntimeError(
|
||||
"HermesAdapter full mode requires Hermes Agent's `run_agent.py`. "
|
||||
"Set `HERMES_AGENT_REPO=/path/to/hermes-agent` or install Hermes. "
|
||||
f"Underlying import error: {_HERMES_AGENT_IMPORT_ERROR!r}"
|
||||
)
|
||||
return AIAgent(
|
||||
base_url=explicit_base_url,
|
||||
api_key=explicit_api_key,
|
||||
provider=provider_hint,
|
||||
api_mode=self._config.api_mode,
|
||||
model=effective_model,
|
||||
max_iterations=self._config.max_iterations,
|
||||
tool_delay=self._config.tool_delay_seconds,
|
||||
enabled_toolsets=enabled_toolsets,
|
||||
disabled_toolsets=self._config.disabled_toolsets,
|
||||
quiet_mode=True,
|
||||
verbose_logging=False,
|
||||
skip_context_files=True,
|
||||
session_id=f"clawbench-{ctx.task.id}-run{ctx.run_index}",
|
||||
platform="cli",
|
||||
)
|
||||
|
||||
async def run_phase(
    self,
    phase: CanonicalPhase,
    ctx: AdapterContext,
) -> PhaseResult:
    """Run one canonical phase and return its parsed transcript slice.

    With ``driver_mode == "ai_agent"`` the call is delegated to
    ``_run_ai_agent_phase``. Otherwise only the FIRST user turn of the
    phase is rendered (honouring ``prompt_variant``) and sent to the
    runner; later turns are intentionally not delivered on this path.

    The call is bounded by the phase timeout, falling back to the task
    budget and then to ``timeout_seconds * max_iterations``. Timeouts and
    runner exceptions are reported through ``PhaseResult.error`` rather
    than raised. A runner result that is not a dict is treated as an empty
    result, so the phase is reported as not completed instead of crashing.
    """
    if self._config.driver_mode == "ai_agent":
        return await self._run_ai_agent_phase(phase, ctx)

    runner = ctx.adapter_state.get("runner")
    if runner is None:
        return PhaseResult(
            error="HermesAdapter.run_phase called before setup(); no runner",
            completed_normally=False,
        )

    if not phase.user.turns:
        return PhaseResult(completed_normally=True)

    # Only the first turn is rendered and sent on this driver path.
    first_turn = phase.user.turns[0]
    message = first_turn.variant_messages.get(
        self._config.prompt_variant, first_turn.message
    )
    prompt = render_template(message, ctx.runtime_values)

    phase_timeout = float(
        phase.timeout_seconds
        or ctx.task.budgets.timeout_seconds
        or self._config.timeout_seconds * self._config.max_iterations
    )

    try:
        raw_result = await asyncio.wait_for(
            asyncio.to_thread(runner.run_task, prompt),
            timeout=phase_timeout,
        )
    except asyncio.TimeoutError:
        return PhaseResult(
            error=f"Hermes phase '{phase.name}' exceeded {phase_timeout:.0f}s",
            completed_normally=False,
        )
    except Exception as exc:  # pragma: no cover - runner-internal error
        return PhaseResult(
            error=f"HermesAdapter runner error: {exc}",
            completed_normally=False,
        )

    # Normalise once so every later access is safe: previously a truthy
    # non-dict result reached parse_conversation() and crashed there.
    result: dict[str, Any] = raw_result if isinstance(raw_result, dict) else {}

    phase_transcript = parse_conversation(result)
    ctx.transcript.messages.extend(phase_transcript.messages)

    # ``or 0`` also tolerates an explicit ``None`` count.
    api_calls = int(result.get("api_calls") or 0)
    ctx.adapter_state["api_calls"] = (
        int(ctx.adapter_state.get("api_calls", 0)) + api_calls
    )

    return PhaseResult(
        messages=phase_transcript.messages,
        adapter_metadata={
            "api_calls": api_calls,
            "hermes_metadata": result.get("metadata", {}),
        },
        completed_normally=bool(result.get("completed", False)),
    )
|
||||
|
||||
async def _run_ai_agent_phase(
    self,
    phase: CanonicalPhase,
    ctx: AdapterContext,
) -> PhaseResult:
    """Drive one phase through the Hermes ``AIAgent``, turn by turn.

    A ``UserSimulator`` supplies user messages until it is done or returns
    ``None``. Each message is passed to ``agent.run_conversation`` together
    with the stored conversation history; the messages beyond that history
    are parsed and appended to the shared transcript.

    Timeouts and agent exceptions end the phase early and are reported via
    ``PhaseResult.error``. API calls made by earlier turns are still added
    to the cumulative ``api_calls`` counter in that case. A non-dict agent
    result counts as an incomplete turn with no messages.

    NOTE(review): the timeout is applied to each turn separately, so a
    multi-turn phase can run longer than ``phase_timeout`` in total —
    confirm whether a whole-phase deadline is intended.
    """
    agent = ctx.adapter_state.get("agent")
    if agent is None:
        return PhaseResult(
            error="HermesAdapter.run_phase called before setup(); no AIAgent",
            completed_normally=False,
        )

    simulator = UserSimulator(
        phase.user,
        ctx.runtime_values,
        prompt_variant=self._config.prompt_variant,
    )
    phase_timeout = float(
        phase.timeout_seconds
        or ctx.task.budgets.timeout_seconds
        or self._config.timeout_seconds * self._config.max_iterations
    )
    appended_messages: list = []
    phase_api_calls = 0
    completed = True

    def _commit_api_calls() -> None:
        # Shared by the success and failure exits so the cumulative counter
        # never loses calls made before an abort.
        ctx.adapter_state["api_calls"] = (
            int(ctx.adapter_state.get("api_calls", 0)) + phase_api_calls
        )

    while not simulator.is_done:
        user_message = await simulator.next_message(ctx.transcript)
        if user_message is None:
            break
        history = list(ctx.adapter_state.get("conversation_history") or [])
        try:
            raw_result = await asyncio.wait_for(
                asyncio.to_thread(
                    agent.run_conversation,
                    user_message,
                    conversation_history=history or None,
                    task_id=f"{ctx.task.id}-run{ctx.run_index}",
                ),
                timeout=phase_timeout,
            )
        except asyncio.TimeoutError:
            _commit_api_calls()
            return PhaseResult(
                messages=appended_messages,
                error=f"Hermes AIAgent phase '{phase.name}' exceeded {phase_timeout:.0f}s",
                completed_normally=False,
            )
        except Exception as exc:  # pragma: no cover - agent-internal error
            _commit_api_calls()
            return PhaseResult(
                messages=appended_messages,
                error=f"HermesAdapter AIAgent error: {exc}",
                completed_normally=False,
            )

        # Normalise once; the ``completed`` lookup used to be unguarded and
        # raised AttributeError on a non-dict result.
        result: dict[str, Any] = raw_result if isinstance(raw_result, dict) else {}

        messages = result.get("messages", [])
        if not isinstance(messages, list):
            messages = []
        # The agent returns the full history; keep only what is new. If the
        # returned list is shorter than the history, treat all of it as new.
        delta = messages[len(history):] if len(messages) >= len(history) else messages
        phase_transcript = parse_chat_messages(delta)
        ctx.transcript.messages.extend(phase_transcript.messages)
        appended_messages.extend(phase_transcript.messages)
        ctx.adapter_state["conversation_history"] = messages
        phase_api_calls += int(result.get("api_calls") or 0)
        completed = completed and bool(result.get("completed", False))

    _commit_api_calls()
    return PhaseResult(
        messages=appended_messages,
        adapter_metadata={
            "api_calls": phase_api_calls,
            "driver_mode": "ai_agent",
        },
        completed_normally=completed,
    )
|
||||
|
||||
async def verify_state_query(
    self,
    query: StateQuery,
    ctx: AdapterContext,
) -> StateQueryResult:
    """Resolve a state query against what this adapter can observe.

    * ``memory`` queries are checked with ``verify_memory_fallback`` over
      the workspace, the transcript and any text found under the Hermes
      home memory locations.
    * In ``ai_agent`` mode, ``session`` queries pass unless the predicate
      is ``"absent"`` or an expected model is not a case-insensitive
      substring of ``ctx.model``; ``cron`` queries are checked against the
      Hermes cron jobs file.
    * Anything else is reported with ``capability_missing=True``.
    """
    kind = query.kind

    if kind == "memory":
        expectation = MemoryState(
            key_pattern=str(query.selector.get("key_pattern", "")),
            exists=query.predicate != "absent",
            value_contains=list(query.expected.get("value_contains", [])),
        )
        hermes_memory = self._read_hermes_memory_text(ctx)
        ok, detail = verify_memory_fallback(
            expectation,
            ctx.workspace,
            transcript=ctx.transcript,
            extra_memory_text=hermes_memory,
        )
        return StateQueryResult(ok=ok, detail=detail)

    full_agent = self._config.driver_mode == "ai_agent"

    if full_agent and kind == "session":
        expected_model = str(query.expected.get("model") or "")
        if query.predicate == "absent":
            # A session always exists once the agent has been built.
            return StateQueryResult(ok=False, detail="Hermes AIAgent session exists")
        model_matches = not expected_model or expected_model.lower() in ctx.model.lower()
        if not model_matches:
            return StateQueryResult(
                ok=False,
                detail=f"Model mismatch: expected {expected_model}, got {ctx.model}",
            )
        return StateQueryResult(ok=True, detail="OK")

    if full_agent and kind == "cron":
        return self._verify_cron_file(query, ctx)

    # Not resolvable by this adapter: flag it so the caller can tell a
    # missing capability apart from a failed check.
    return StateQueryResult(
        ok=False,
        detail=(
            f"HermesAdapter does not resolve '{query.kind}' state queries "
            f"(missing capability {query.required_capability.value})"
        ),
        capability_missing=True,
    )
|
||||
|
||||
def _read_hermes_memory_text(self, ctx: AdapterContext) -> str:
|
||||
hermes_home = Path(ctx.adapter_state.get("hermes_home") or self._hermes_home(ctx))
|
||||
candidates = [
|
||||
hermes_home / "memory",
|
||||
hermes_home / "memories",
|
||||
hermes_home / "user_memory",
|
||||
]
|
||||
chunks: list[str] = []
|
||||
for candidate in candidates:
|
||||
if candidate.is_file():
|
||||
chunks.append(candidate.read_text(encoding="utf-8", errors="replace"))
|
||||
elif candidate.is_dir():
|
||||
for path in candidate.rglob("*"):
|
||||
if path.is_file() and path.suffix.lower() in {".md", ".txt", ".json"}:
|
||||
try:
|
||||
chunks.append(path.read_text(encoding="utf-8", errors="replace"))
|
||||
except Exception:
|
||||
continue
|
||||
return "\n".join(chunks)
|
||||
|
||||
def _verify_cron_file(
    self,
    query: StateQuery,
    ctx: AdapterContext,
) -> StateQueryResult:
    """Check a cron state query against ``<hermes_home>/cron/jobs.json``.

    The file may hold either a JSON list of jobs or an object with a
    ``jobs`` list; any other payload shape (for example ``null`` or a bare
    string) is treated as "no jobs" instead of raising.

    * Predicate ``"absent"`` passes when the file is missing or lists no
      jobs.
    * Otherwise at least one job must exist and, when the selector has
      ``description_contains``, that text must occur (case-insensitively)
      in the JSON serialisation of at least one job.
    """
    hermes_home = Path(ctx.adapter_state.get("hermes_home") or self._hermes_home(ctx))
    jobs_file = hermes_home / "cron" / "jobs.json"
    wants_absent = query.predicate == "absent"

    if not jobs_file.is_file():
        if wants_absent:
            return StateQueryResult(ok=True, detail="Correctly absent")
        return StateQueryResult(ok=False, detail=f"No Hermes cron jobs file at {jobs_file}")

    try:
        payload = json.loads(jobs_file.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers both malformed JSON and undecodable bytes.
        return StateQueryResult(ok=False, detail=f"Could not read Hermes cron jobs: {exc}")

    if isinstance(payload, list):
        jobs = payload
    elif isinstance(payload, dict):
        jobs = payload.get("jobs", [])
    else:
        # e.g. ``null`` or a scalar: previously crashed on ``.get``.
        jobs = []
    if not isinstance(jobs, list):
        jobs = []

    if wants_absent:
        return StateQueryResult(
            ok=not jobs,
            detail="Correctly absent" if not jobs else "Cron jobs exist",
        )

    if not jobs:
        return StateQueryResult(ok=False, detail="No cron jobs found")

    description_contains = query.selector.get("description_contains")
    if description_contains:
        needle = str(description_contains).lower()
        if not any(needle in json.dumps(job, sort_keys=True).lower() for job in jobs):
            return StateQueryResult(
                ok=False,
                detail=f"No cron job matched '{description_contains}'",
            )
    return StateQueryResult(ok=True, detail="OK")
|
||||
|
||||
async def teardown(self, ctx: AdapterContext) -> None:
|
||||
"""Release the runner reference so GC can reclaim its process pool."""
|
||||
|
||||
ctx.adapter_state.pop("runner", None)
|
||||
ctx.adapter_state.pop("agent", None)
|
||||
|
||||
|
||||
__all__ = ["HermesAdapter", "HermesAdapterConfig"]
|
||||
494
clawbench/adapters/hermes_xml.py
Normal file
494
clawbench/adapters/hermes_xml.py
Normal file
@ -0,0 +1,494 @@
|
||||
"""Hermes agent conversation → ClawBench `Transcript` converter.
|
||||
|
||||
Hermes's `MiniSWERunner.run_task()` returns a dict shaped like:
|
||||
|
||||
```json
|
||||
{
|
||||
"conversations": [
|
||||
{"from": "system", "value": "..."},
|
||||
{"from": "user", "value": "..."},
|
||||
{"from": "assistant", "value": "I'll look at the file.\\n<tool_call>{\\"name\\":\\"bash\\",\\"arguments\\":{\\"cmd\\":\\"ls\\"}}</tool_call>"},
|
||||
{"from": "tool", "value": "<tool_response>{\\"stdout\\":\\"file.py\\"}</tool_response>"},
|
||||
{"from": "assistant", "value": "<tool_call>...</tool_call>"},
|
||||
...
|
||||
],
|
||||
"completed": true,
|
||||
"api_calls": 7,
|
||||
"metadata": {...}
|
||||
}
|
||||
```
|
||||
|
||||
This module parses that into a canonical `Transcript` with
|
||||
`TranscriptMessage` + `ToolCall` entries so the scorer / trajectory /
|
||||
judge layers can score the run without any Hermes-specific knowledge.
|
||||
|
||||
The XML parsing is deliberately tolerant: Hermes transcripts observed
|
||||
in the wild sometimes have malformed JSON inside `<tool_call>` tags
|
||||
(trailing commas, unescaped newlines). We fall back to a permissive
|
||||
regex extraction in that case so a single bad tool call doesn't tank
|
||||
the whole transcript.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import Any, Iterable
|
||||
|
||||
from clawbench.schemas import ToolCall, Transcript, TranscriptMessage
|
||||
|
||||
|
||||
#: One `<tool_call>…</tool_call>` block. Non-greedy across newlines.
|
||||
_TOOL_CALL_RE = re.compile(
|
||||
r"<tool_call>\s*(?P<body>.*?)\s*</tool_call>", re.DOTALL
|
||||
)
|
||||
|
||||
#: One `<tool_response>…</tool_response>` block.
|
||||
_TOOL_RESPONSE_RE = re.compile(
|
||||
r"<tool_response>\s*(?P<body>.*?)\s*</tool_response>", re.DOTALL
|
||||
)
|
||||
|
||||
|
||||
def _coerce_role(raw: str) -> str:
|
||||
"""Normalize Hermes role labels to ClawBench `TranscriptMessage.role`.
|
||||
|
||||
ClawBench uses `"user"`, `"assistant"`, `"system"`, `"tool"`. Hermes
|
||||
can emit `"human"`/`"gpt"`/`"function"` variants; we map them all
|
||||
down to the canonical vocabulary.
|
||||
"""
|
||||
|
||||
value = (raw or "").strip().lower()
|
||||
if value in {"assistant", "gpt", "model"}:
|
||||
return "assistant"
|
||||
if value in {"user", "human"}:
|
||||
return "user"
|
||||
if value in {"tool", "function", "tool_response"}:
|
||||
return "tool"
|
||||
if value == "system":
|
||||
return "system"
|
||||
return value or "assistant"
|
||||
|
||||
|
||||
def _extract_json_objects(text: str) -> list[dict[str, Any]]:
|
||||
"""Parse 0-or-more top-level JSON objects from free-form text.
|
||||
|
||||
Hermes usually puts a single JSON object inside each `<tool_call>`,
|
||||
but we handle multi-object payloads defensively. Returns an empty
|
||||
list if no valid JSON is present.
|
||||
"""
|
||||
|
||||
text = text.strip()
|
||||
if not text:
|
||||
return []
|
||||
try:
|
||||
parsed = json.loads(text)
|
||||
if isinstance(parsed, dict):
|
||||
return [parsed]
|
||||
if isinstance(parsed, list):
|
||||
return [item for item in parsed if isinstance(item, dict)]
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
# Fallback: scan for balanced `{...}` blocks. Useful when the
|
||||
# assistant wrote slightly malformed JSON. We accept a best-effort
|
||||
# parse and silently discard the rest.
|
||||
results: list[dict[str, Any]] = []
|
||||
depth = 0
|
||||
start: int | None = None
|
||||
for i, ch in enumerate(text):
|
||||
if ch == "{":
|
||||
if depth == 0:
|
||||
start = i
|
||||
depth += 1
|
||||
elif ch == "}":
|
||||
depth -= 1
|
||||
if depth == 0 and start is not None:
|
||||
candidate = text[start : i + 1]
|
||||
try:
|
||||
obj = json.loads(candidate)
|
||||
if isinstance(obj, dict):
|
||||
results.append(obj)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
start = None
|
||||
return results
|
||||
|
||||
|
||||
def _tool_call_from_payload(
    payload: dict[str, Any],
    *,
    index: int,
    timestamp_ms: int,
) -> ToolCall:
    """Turn one parsed ``<tool_call>`` payload into a ``ToolCall``.

    The tool name is the first truthy value among the ``name``,
    ``function`` and ``tool`` keys (empty string if none). Arguments are
    the first truthy value among ``arguments``, ``parameters``, ``args``
    and ``input`` (empty dict if none). String arguments are decoded as
    JSON, falling back to ``{"raw": <string>}``; any non-dict result is
    wrapped as ``{"value": ...}``. The call id comes from ``id`` or
    ``call_id``, else ``hermes-<index>``.
    """
    name = next(
        (payload[key] for key in ("name", "function", "tool") if payload.get(key)),
        "",
    )
    arguments = next(
        (
            payload[key]
            for key in ("arguments", "parameters", "args", "input")
            if payload.get(key)
        ),
        {},
    )

    if isinstance(arguments, str):
        # Arguments sometimes arrive as a JSON-encoded string.
        try:
            arguments = json.loads(arguments)
        except json.JSONDecodeError:
            arguments = {"raw": arguments}
    if not isinstance(arguments, dict):
        arguments = {"value": arguments}

    raw_id = payload.get("id") or payload.get("call_id") or f"hermes-{index}"
    return ToolCall(
        id=str(raw_id),
        name=str(name),
        input=arguments,
        timestamp_ms=timestamp_ms,
    )
|
||||
|
||||
|
||||
def _tool_response_summary(payload: dict[str, Any]) -> tuple[str, str, bool | None]:
|
||||
"""Extract (output, error, success) from a `<tool_response>` payload."""
|
||||
|
||||
output = ""
|
||||
error = ""
|
||||
success: bool | None = None
|
||||
|
||||
stdout = payload.get("stdout")
|
||||
stderr = payload.get("stderr")
|
||||
result = payload.get("result")
|
||||
err = payload.get("error")
|
||||
msg = payload.get("message")
|
||||
status = payload.get("status")
|
||||
|
||||
if isinstance(stdout, str):
|
||||
output = stdout
|
||||
elif isinstance(result, (str, dict, list)):
|
||||
output = result if isinstance(result, str) else json.dumps(result)
|
||||
elif isinstance(msg, str):
|
||||
output = msg
|
||||
if isinstance(stderr, str) and stderr.strip():
|
||||
error = stderr
|
||||
elif isinstance(err, (str, dict, list)):
|
||||
error = err if isinstance(err, str) else json.dumps(err)
|
||||
|
||||
if isinstance(status, str):
|
||||
lowered = status.lower()
|
||||
if lowered in {"ok", "success", "succeeded"}:
|
||||
success = True
|
||||
elif lowered in {"error", "failed", "failure"}:
|
||||
success = False
|
||||
if error and success is None:
|
||||
success = False
|
||||
if not error and output and success is None:
|
||||
success = True
|
||||
return output, error, success
|
||||
|
||||
|
||||
def _split_tagged(text: str, tag_re: re.Pattern[str]) -> list[tuple[str, str]]:
|
||||
"""Split `text` into `(kind, body)` tuples where `kind` is `"text"` or
|
||||
`"tag"`. Preserves ordering so we can thread tool calls/responses
|
||||
back into the canonical transcript in the order they appeared.
|
||||
"""
|
||||
|
||||
pieces: list[tuple[str, str]] = []
|
||||
cursor = 0
|
||||
for match in tag_re.finditer(text):
|
||||
if match.start() > cursor:
|
||||
pieces.append(("text", text[cursor : match.start()]))
|
||||
pieces.append(("tag", match.group("body")))
|
||||
cursor = match.end()
|
||||
if cursor < len(text):
|
||||
pieces.append(("text", text[cursor:]))
|
||||
return pieces
|
||||
|
||||
|
||||
def parse_conversation(result: dict[str, Any]) -> Transcript:
    """Parse a ``MiniSWERunner.run_task`` result dict into a ``Transcript``.

    Entries of ``result["conversations"]`` are processed in order; entries
    that are not dicts are skipped.

    * Assistant turns: ``<tool_call>`` blocks become ``ToolCall`` objects
      on the assistant message and are queued for pairing; the remaining
      text becomes the message text.
    * Tool turns: each ``<tool_response>`` block is paired with the oldest
      unpaired call. A tool turn with no ``<tool_response>`` tags but
      non-blank content is treated as one response, so untagged output is
      no longer dropped. Responses with no pending call are kept as
      standalone ``tool`` messages.
    * Other turns (user, system, anything else): ``<tool_call>`` blocks are
      removed from the text as before, but they no longer create or queue
      calls. Only the assistant can issue calls, and a tag quoted in a
      prompt would otherwise absorb a later tool response.

    Assistant and user messages are always emitted; system and other-role
    messages only when they have text. ``timestamp_ms`` carries the turn
    index, not a wall-clock time.
    """
    transcript = Transcript()
    conversations = result.get("conversations") or []
    pending_calls: list[ToolCall] = []
    call_counter = 0

    for turn_index, entry in enumerate(conversations):
        if not isinstance(entry, dict):
            continue
        role = _coerce_role(str(entry.get("from", "")))
        value = str(entry.get("value", "") or "")

        if role == "tool":
            bodies = [match.group("body") for match in _TOOL_RESPONSE_RE.finditer(value)]
            if not bodies and value.strip():
                # Untagged tool output: treat the whole turn as one response
                # so pairing with pending calls stays in step.
                bodies = [value.strip()]
            for response_body in bodies:
                payloads = _extract_json_objects(response_body) or [{"result": response_body}]
                for payload in payloads:
                    output, error, success = _tool_response_summary(payload)
                    if pending_calls:
                        target = pending_calls.pop(0)
                        target.output = output
                        target.error = error
                        if success is not None:
                            target.success = success
                    else:
                        # Orphan tool response: surface it as a tool message
                        # so nothing is silently dropped.
                        transcript.messages.append(
                            TranscriptMessage(
                                role="tool",
                                tool_result_content=output or error,
                                timestamp_ms=turn_index,
                            )
                        )
            continue

        text_chunks: list[str] = []
        tool_calls: list[ToolCall] = []
        for kind, body in _split_tagged(value, _TOOL_CALL_RE):
            if kind == "text":
                text_chunks.append(body)
                continue
            if role != "assistant":
                # Tag stripped from the text, but not registered as a call.
                continue
            for payload in _extract_json_objects(body):
                call_counter += 1
                tool_call = _tool_call_from_payload(
                    payload,
                    index=call_counter,
                    timestamp_ms=turn_index,
                )
                tool_calls.append(tool_call)
                pending_calls.append(tool_call)

        joined_text = "\n".join(chunk for chunk in text_chunks if chunk.strip()).strip()

        if role == "assistant":
            transcript.messages.append(
                TranscriptMessage(
                    role="assistant",
                    text=joined_text,
                    tool_calls=tool_calls,
                    timestamp_ms=turn_index,
                )
            )
        elif role == "user":
            transcript.messages.append(
                TranscriptMessage(
                    role="user",
                    text=joined_text,
                    timestamp_ms=turn_index,
                )
            )
        elif joined_text:
            # System and unrecognised roles are only kept when they have text.
            transcript.messages.append(
                TranscriptMessage(
                    role=role,
                    text=joined_text,
                    timestamp_ms=turn_index,
                )
            )

    return transcript
|
||||
|
||||
|
||||
def _content_to_text(content: Any) -> str:
|
||||
"""Normalize OpenAI/Anthropic-style message content to plain text."""
|
||||
|
||||
if content is None:
|
||||
return ""
|
||||
if isinstance(content, str):
|
||||
return content
|
||||
if isinstance(content, list):
|
||||
parts: list[str] = []
|
||||
for part in content:
|
||||
if isinstance(part, str):
|
||||
parts.append(part)
|
||||
elif isinstance(part, dict):
|
||||
if isinstance(part.get("text"), str):
|
||||
parts.append(part["text"])
|
||||
elif isinstance(part.get("content"), str):
|
||||
parts.append(part["content"])
|
||||
return "\n".join(parts)
|
||||
if isinstance(content, dict):
|
||||
if isinstance(content.get("text"), str):
|
||||
return content["text"]
|
||||
if isinstance(content.get("content"), str):
|
||||
return content["content"]
|
||||
return str(content)
|
||||
|
||||
|
||||
def _tool_call_from_chat_payload(
    payload: dict[str, Any],
    *,
    index: int,
    timestamp_ms: int,
) -> ToolCall:
    """Turn one chat-completions style tool-call entry into a ``ToolCall``.

    The nested ``function`` dict is consulted first (a non-dict
    ``function`` is ignored). The name is the first truthy value among
    ``function.name`` and the payload's ``name``, ``tool`` and ``type``;
    arguments are the first truthy value among ``function.arguments`` and
    the payload's ``arguments``, ``args`` and ``input``. String arguments
    are decoded as JSON, falling back to ``{"raw": <string>}``; non-dict
    results are wrapped as ``{"value": ...}``. The id comes from ``id`` or
    ``call_id``, else ``hermes-chat-<index>``.
    """
    function = payload.get("function")
    if not isinstance(function, dict):
        function = {}

    name_candidates = (
        function.get("name"),
        payload.get("name"),
        payload.get("tool"),
        payload.get("type"),
    )
    name = next((value for value in name_candidates if value), "")

    argument_candidates = (
        function.get("arguments"),
        payload.get("arguments"),
        payload.get("args"),
        payload.get("input"),
    )
    arguments = next((value for value in argument_candidates if value), {})

    if isinstance(arguments, str):
        try:
            arguments = json.loads(arguments)
        except json.JSONDecodeError:
            arguments = {"raw": arguments}
    if not isinstance(arguments, dict):
        arguments = {"value": arguments}

    raw_id = payload.get("id") or payload.get("call_id") or f"hermes-chat-{index}"
    return ToolCall(
        id=str(raw_id),
        name=str(name),
        input=arguments,
        timestamp_ms=timestamp_ms,
    )
|
||||
|
||||
|
||||
def parse_chat_messages(messages: Iterable[dict[str, Any]]) -> Transcript:
    """Parse Hermes AIAgent/OpenAI-style message history to a Transcript.

    `AIAgent.run_conversation()` returns a `messages` list with user,
    assistant, and tool-role entries. This parser preserves ordering and
    attaches tool-role output back to the assistant `ToolCall` it belongs to.

    Matching rules for a tool-role entry:

    1. If it carries a ``tool_call_id`` (or ``id``) that matches a call seen
       earlier, the output is attached to that call.
    2. Otherwise the oldest call that has not yet received a result is used.
    3. If no call is available, non-empty output is kept as a standalone
       ``tool`` transcript message.

    Non-dict entries are skipped. Assistant entries are always recorded (even
    with empty text); entries of any other role are recorded only when they
    have text. ``timestamp_ms`` is set to the entry's position in `messages`.
    """

    transcript = Transcript()
    pending_by_id: dict[str, ToolCall] = {}
    # Calls still waiting for a result, oldest first. Used for positional
    # matching when a tool message has no usable id.
    pending_order: list[ToolCall] = []
    call_counter = 0

    for turn_index, entry in enumerate(messages):
        if not isinstance(entry, dict):
            continue
        role = _coerce_role(str(entry.get("role") or entry.get("from") or ""))
        text = _content_to_text(entry.get("content", entry.get("value", "")))

        if role == "tool":
            tool_call_id = str(entry.get("tool_call_id") or entry.get("id") or "")
            target = pending_by_id.get(tool_call_id) if tool_call_id else None
            if target is not None:
                # The call is now answered: take it out of the positional
                # queue so a later id-less result is not attached to it again
                # (which would overwrite this output and starve the real
                # target). Identity comparison avoids removing a different
                # call that merely compares equal.
                pending_order[:] = [call for call in pending_order if call is not target]
            elif pending_order:
                target = pending_order.pop(0)

            if target is not None:
                target.output = text
                target.success = not _looks_like_error(text)
                if not target.success:
                    target.error = text
            elif text:
                transcript.messages.append(
                    TranscriptMessage(
                        role="tool",
                        tool_result_for=tool_call_id or None,
                        tool_result_content=text,
                        timestamp_ms=turn_index,
                    )
                )
            continue

        tool_calls: list[ToolCall] = []
        raw_calls = entry.get("tool_calls") or []
        if isinstance(raw_calls, list):
            for payload in raw_calls:
                if not isinstance(payload, dict):
                    continue
                call_counter += 1
                call = _tool_call_from_chat_payload(
                    payload,
                    index=call_counter,
                    timestamp_ms=turn_index,
                )
                tool_calls.append(call)
                pending_by_id[call.id] = call
                pending_order.append(call)

        if role == "assistant":
            transcript.messages.append(
                TranscriptMessage(
                    role="assistant",
                    text=text,
                    tool_calls=tool_calls,
                    timestamp_ms=turn_index,
                )
            )
        elif text:
            # user, system and any other role: recorded only when non-empty.
            transcript.messages.append(
                TranscriptMessage(
                    role=role,
                    text=text,
                    timestamp_ms=turn_index,
                )
            )

    return transcript
|
||||
|
||||
|
||||
def _looks_like_error(text: str) -> bool:
    """Return True when `text` contains a common failure keyword.

    This is a case-insensitive substring check for "error", "traceback",
    "failed" or "exception". It is a heuristic, so benign output that merely
    mentions one of those words also matches.
    """
    haystack = text.lower()
    for marker in ("error", "traceback", "failed", "exception"):
        if marker in haystack:
            return True
    return False
|
||||
|
||||
|
||||
def iter_tool_calls_from_conversations(conversations: Iterable[dict[str, Any]]) -> list[ToolCall]:
    """Helper used by tests: return only the tool-call sequence.

    Wraps `conversations` in a ``{"conversations": [...]}`` record, passes it
    to `parse_conversation`, and returns the resulting
    ``tool_call_sequence``. Useful for asserting on call order and arguments
    without inspecting the rest of the transcript.
    """

    record = {"conversations": list(conversations)}
    parsed = parse_conversation(record)
    return parsed.tool_call_sequence
|
||||
|
||||
|
||||
__all__ = [
|
||||
"iter_tool_calls_from_conversations",
|
||||
"parse_chat_messages",
|
||||
"parse_conversation",
|
||||
]
|
||||
472
clawbench/adapters/openclaw.py
Normal file
472
clawbench/adapters/openclaw.py
Normal file
@ -0,0 +1,472 @@
|
||||
"""OpenClaw adapter — drives tasks through an OpenClaw gateway.
|
||||
|
||||
This is the adapter-shaped wrapper around the agent execution flow that
|
||||
has lived inside `BenchmarkHarness._run_single` until now. It holds a
|
||||
`GatewayClient` open for the run's duration, creates one agent per run
|
||||
and one session per phase (matching the existing behavior), delivers
|
||||
simulated-user turns, and resolves `StateQuery` assertions against the
|
||||
gateway's `memory.search` / `sessions.resolve` / `cron.list` / arbitrary
|
||||
`_rpc(method)` surface.
|
||||
|
||||
The benchmark harness now routes OpenClaw through this adapter, matching
|
||||
the same canonical task/run lifecycle used by other harness adapters.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import uuid
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from clawbench.adapters import register_adapter
|
||||
from clawbench.adapters.base import (
|
||||
AdapterConfig,
|
||||
AdapterContext,
|
||||
AgentAdapter,
|
||||
PhaseResult,
|
||||
StateQueryResult,
|
||||
)
|
||||
from clawbench.canonical import (
|
||||
AdapterCapability,
|
||||
CanonicalPhase,
|
||||
StateQuery,
|
||||
)
|
||||
from clawbench.client import GatewayClient, GatewayConfig
|
||||
from clawbench.environment_files import (
|
||||
memory_visible_in_transcript,
|
||||
resolve_json_path,
|
||||
verify_memory_fallback,
|
||||
)
|
||||
from clawbench.schemas import (
|
||||
CronState,
|
||||
MemoryState,
|
||||
PromptVariant,
|
||||
SessionState,
|
||||
Transcript,
|
||||
)
|
||||
from clawbench.session_labels import unique_session_label
|
||||
from clawbench.simulated_user import UserSimulator
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class OpenClawAdapterConfig(AdapterConfig):
    """Settings consumed by `OpenClawAdapter`.

    `gateway` carries the connection parameters used to reach the OpenClaw
    gateway; when it is left as ``None`` the adapter builds a default
    `GatewayConfig()`. `prompt_variant` selects which wording of each
    simulated-user turn is rendered.
    """

    # Connection parameters for the gateway (None -> adapter default).
    gateway: GatewayConfig | None = None

    # Prompt wording handed to the user simulator.
    prompt_variant: str = PromptVariant.CLEAR.value

    # Per-turn timeout ceiling in seconds: `OpenClawAdapter.run_phase` passes
    # `send_and_wait` the smaller of this value and the phase/task timeout.
    turn_timeout_seconds: float = 180.0
|
||||
|
||||
|
||||
@register_adapter
class OpenClawAdapter(AgentAdapter):
    """Adapter for the OpenClaw gateway (default harness path).

    The adapter must be used as an async context manager: entering opens a
    `GatewayClient`, exiting closes it. Inside that scope:

    - `setup` creates one agent per run and evaluates pre-run state queries,
    - `run_phase` creates one session per phase and delivers the
      simulated-user turns,
    - `verify_state_query` resolves canonical `StateQuery` assertions through
      gateway RPCs,
    - `teardown` deletes the sessions and the agent (best effort).

    Per-run data (agent id, session keys, pre-run failures) is kept in
    ``ctx.adapter_state`` rather than on the adapter instance.
    """

    name = "openclaw"
    capabilities = {
        AdapterCapability.FILES,
        AdapterCapability.EXECUTION,
        AdapterCapability.MEMORY,
        AdapterCapability.SESSION,
        AdapterCapability.CRON,
        AdapterCapability.BROWSER,
        AdapterCapability.GATEWAY_RPC,
        AdapterCapability.MULTI_TURN_INJECTION,
    }

    def __init__(self, config: OpenClawAdapterConfig | None = None) -> None:
        """Store the config; the gateway connection is opened in `__aenter__`."""
        super().__init__(config or OpenClawAdapterConfig())
        self._config: OpenClawAdapterConfig = self.config  # type: ignore[assignment]
        self._gateway_config: GatewayConfig = self._config.gateway or GatewayConfig()
        self._client: GatewayClient | None = None
        # Dependency injection hook for tests: monkeypatch this to swap
        # in a stub gateway without touching the class definition.
        self._client_factory = lambda: GatewayClient(self._gateway_config)

    # ------------------------------------------------------------------
    # Long-lived gateway connection.
    # ------------------------------------------------------------------

    async def __aenter__(self) -> "OpenClawAdapter":
        """Open the gateway client and keep it for the adapter's lifetime."""
        client = self._client_factory()
        await client.__aenter__()
        # Only publish the client once it entered successfully.
        self._client = client
        return self

    async def __aexit__(self, exc_type: object, exc: object, tb: object) -> None:
        """Close the gateway client, clearing the reference even on error."""
        if self._client is not None:
            try:
                await self._client.__aexit__(exc_type, exc, tb)
            finally:
                self._client = None

    @property
    def client(self) -> GatewayClient:
        """Return the open gateway client.

        Raises:
            RuntimeError: if the adapter has not been entered as an async
                context manager.
        """
        if self._client is None:
            raise RuntimeError(
                "OpenClawAdapter must be used as an async context manager "
                "before calling setup/run_phase/teardown."
            )
        return self._client

    # ------------------------------------------------------------------
    # Lifecycle.
    # ------------------------------------------------------------------

    async def setup(self, ctx: AdapterContext) -> None:
        """Create the per-run agent and run pre-run state queries.

        Writes memory seeds into the workspace, creates an agent named
        ``clawbench-<task>-run-<index>-<random>``, and stores its id under
        ``ctx.adapter_state["agent_id"]``.
        """

        self._realize_memory_seeds(ctx)

        agent_name = (
            f"clawbench-{ctx.task.id}-run-{ctx.run_index}-{uuid.uuid4().hex[:6]}"
        )
        agent_id = await self.client.create_agent(
            name=agent_name, workspace=str(ctx.workspace)
        )
        ctx.adapter_state["agent_id"] = agent_id
        ctx.adapter_state.setdefault("session_keys", [])

        # Pre-run gateway assertions (ex-`setup.pre_check_gateway`) are
        # evaluated immediately. Failures are not raised; they are recorded
        # in `ctx.adapter_state["pre_run_failures"]` so the harness can fail
        # fast before doing any phase work.
        failures: list[str] = []
        for query in ctx.task.verifier.pre_run_queries:
            result = await self.verify_state_query(query, ctx)
            if not result.ok:
                failures.append(result.detail or query.description)
        if failures:
            ctx.adapter_state["pre_run_failures"] = failures

    def _realize_memory_seeds(self, ctx: AdapterContext) -> None:
        """Expose canonical memory seeds through the run workspace.

        OpenClaw's native memory backend has no public seed/write RPC in the
        benchmark client, but agents can read files in their workspace and the
        verifier already falls back to these same memory files. This keeps
        seeded-memory tasks fair across OpenClaw and filesystem-first harnesses.

        Each memory seed with a non-empty key is written to
        ``<workspace>/memory/<safe_key>.md``; all seed bodies are also
        concatenated into ``<workspace>/MEMORY.md`` (written only when at
        least one seed was realized, replacing any existing file).
        """

        chunks: list[str] = []
        for seed in ctx.task.assets.seed_state:
            if seed.kind != "memory" or not seed.key:
                continue
            content = seed.content or ""
            if not isinstance(content, str):
                content = str(content)
            # Reduce the key to a filename-safe form: anything other than
            # alphanumerics, "-" and "_" becomes "_".
            safe_key = "".join(
                ch if ch.isalnum() or ch in ("-", "_") else "_"
                for ch in seed.key.strip()
            ).strip("_")
            if not safe_key:
                safe_key = "seed"
            body = f"# {seed.key}\n\n{content.strip()}\n"
            target = ctx.workspace / "memory" / f"{safe_key}.md"
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(body, encoding="utf-8")
            chunks.append(body)

        if chunks:
            (ctx.workspace / "MEMORY.md").write_text("\n".join(chunks), encoding="utf-8")

    async def run_phase(
        self,
        phase: CanonicalPhase,
        ctx: AdapterContext,
    ) -> PhaseResult:
        """Create a session, drive the simulator, append to the transcript.

        Returns a `PhaseResult` holding the messages produced during this
        phase plus the session key and number of turns sent. When `setup` has
        not run, or a browser task finds no browser tool, an error
        `PhaseResult` with ``completed_normally=False`` is returned instead.
        """

        agent_id = ctx.adapter_state.get("agent_id")
        if not agent_id:
            return PhaseResult(
                error="OpenClawAdapter.run_phase called before setup(); no agent_id",
                completed_normally=False,
            )

        session_keys: list[str] = ctx.adapter_state.setdefault("session_keys", [])
        session_key = await self.client.create_session(
            model=ctx.model,
            agent_id=agent_id,
            label=unique_session_label(
                f"clawbench-{ctx.task.id}-run{ctx.run_index}-phase{phase.name}"
            ),
        )
        # Record the session right away so `teardown` can delete it even if a
        # later step in this phase fails.
        session_keys.append(session_key)
        ctx.adapter_state["last_session_key"] = session_key

        await self.client.subscribe(session_key)

        # Browser tasks require the browser tool to actually be
        # registered in the effective tool set for this session. If it
        # isn't, fail the phase fast rather than letting the agent
        # flounder against a missing tool.
        if ctx.task.family.value == "browser":
            try:
                await self._assert_browser_support(session_key)
            except Exception as exc:
                return PhaseResult(
                    error=str(exc),
                    completed_normally=False,
                )

        simulator = UserSimulator(
            phase.user,
            ctx.runtime_values,
            prompt_variant=self._config.prompt_variant,
        )

        # Phase timeout (falling back to the task budget), capped by the
        # adapter's configured per-turn ceiling.
        turn_timeout = float(phase.timeout_seconds or ctx.task.budgets.timeout_seconds)
        turn_timeout = min(turn_timeout, self._config.turn_timeout_seconds)

        appended: list = []
        turns_sent = 0
        while not simulator.is_done:
            user_message = await simulator.next_message(ctx.transcript)
            if user_message is None:
                break
            phase_transcript = await self.client.send_and_wait(
                session_key,
                user_message,
                timeout=turn_timeout,
            )
            # Extend the shared transcript before asking for the next message,
            # since the simulator is given `ctx.transcript` each turn.
            ctx.transcript.messages.extend(phase_transcript.messages)
            appended.extend(phase_transcript.messages)
            turns_sent += 1

        return PhaseResult(
            messages=appended,
            adapter_metadata={
                "session_key": session_key,
                "turns_sent": turns_sent,
            },
        )

    async def _assert_browser_support(self, session_key: str) -> None:
        """Raise `RuntimeError` unless the session's tools include "browser"."""
        inventory = await self.client.get_effective_tools(session_key)
        tool_ids = {
            str(tool.get("id", ""))
            for group in inventory.get("groups", [])
            for tool in group.get("tools", [])
        }
        if "browser" not in tool_ids:
            raise RuntimeError(
                "Browser tasks require the browser tool, but it is not available in this gateway."
            )

    async def teardown(self, ctx: AdapterContext) -> None:
        """Delete per-phase sessions and the per-run agent.

        Best effort: a no-op when the gateway connection is already closed,
        and individual deletion failures are logged rather than raised.
        """

        client = self._client
        if client is None:
            return
        session_keys: list[str] = ctx.adapter_state.get("session_keys", [])
        agent_id: str | None = ctx.adapter_state.get("agent_id")
        for session_key in session_keys:
            try:
                await client.delete_session(session_key)
            except Exception as exc:  # pragma: no cover - best effort
                logger.warning("delete_session failed for %s: %s", session_key, exc)
        if agent_id:
            try:
                await client.delete_agent(agent_id, delete_files=False)
            except Exception as exc:  # pragma: no cover - best effort
                logger.warning("delete_agent failed for %s: %s", agent_id, exc)

    # ------------------------------------------------------------------
    # State query resolution.
    # ------------------------------------------------------------------

    async def verify_state_query(
        self,
        query: StateQuery,
        ctx: AdapterContext,
    ) -> StateQueryResult:
        """Dispatch `query` to the handler for its kind.

        Handled kinds are "memory", "session", "cron" and "custom" (arbitrary
        gateway RPC). An exception raised by a handler becomes a failed
        result; an unknown kind yields a failed result flagged
        ``capability_missing=True``.
        """
        try:
            if query.kind == "memory":
                return await self._verify_memory(query, ctx)
            if query.kind == "session":
                return await self._verify_session(query, ctx)
            if query.kind == "cron":
                return await self._verify_cron(query, ctx)
            if query.kind == "custom":
                return await self._verify_gateway(query, ctx)
        except Exception as exc:
            return StateQueryResult(ok=False, detail=str(exc))
        return StateQueryResult(
            ok=False,
            detail=f"OpenClawAdapter has no handler for query kind '{query.kind}'",
            capability_missing=True,
        )

    # --- memory ---

    async def _verify_memory(
        self, query: StateQuery, ctx: AdapterContext
    ) -> StateQueryResult:
        """Check a memory assertion via `memory.search`, with a file fallback.

        The primary path searches gateway memory for the selector's
        ``key_pattern`` and checks that every ``value_contains`` token
        appears (case-insensitively) in the combined entry values. If that
        path raises for any reason, verification falls back to
        `verify_memory_fallback` over the workspace, the transcript and any
        memory text read from the agent.
        """
        key_pattern = str(query.selector.get("key_pattern", ""))
        value_contains = list(query.expected.get("value_contains", []))
        session_key = ctx.adapter_state.get("last_session_key", "")
        agent_id = ctx.adapter_state.get("agent_id")

        # Primary path: memory.search RPC.
        try:
            response = await self.client._rpc(
                "memory.search",
                {
                    "query": key_pattern,
                    "sessionKey": session_key,
                    "limit": 20,
                },
            )
            entries = response.get("payload", {}).get("entries", [])
            if query.predicate == "absent":
                ok = not entries
                return StateQueryResult(
                    ok=ok,
                    detail="Correctly absent" if ok else "Memory entry exists",
                )
            if not entries:
                return StateQueryResult(ok=False, detail="No matching memory entries found")
            all_values = " ".join(str(entry.get("value", "")) for entry in entries)
            for token in value_contains:
                if token.lower() not in all_values.lower():
                    return StateQueryResult(
                        ok=False, detail=f"Memory value missing '{token}'"
                    )
            return StateQueryResult(ok=True, detail="OK")
        except Exception as exc:
            logger.info(
                "memory.search unavailable for verification, falling back: %s",
                exc,
            )

        # Fallback: gateway-sourced memory files + workspace scan + transcript.
        fallback_state = MemoryState(
            key_pattern=key_pattern,
            exists=query.predicate != "absent",
            value_contains=value_contains,
        )
        extra_memory_text = ""
        if agent_id:
            try:
                from clawbench.environment import _read_agent_memory_text  # local import to avoid cycle

                extra_memory_text = await _read_agent_memory_text(self.client, agent_id)
            except Exception:
                # Best effort: the fallback still checks workspace + transcript.
                extra_memory_text = ""
        ok, detail = verify_memory_fallback(
            fallback_state,
            ctx.workspace,
            transcript=ctx.transcript,
            extra_memory_text=extra_memory_text,
        )
        return StateQueryResult(ok=ok, detail=detail)

    # --- session ---

    async def _verify_session(
        self, query: StateQuery, ctx: AdapterContext
    ) -> StateQueryResult:
        """Check the most recent session via `sessions.resolve`.

        A successful resolve means the session exists; when
        ``expected["model"]`` is set it must appear (case-insensitively) in
        the resolved model. For the "absent" predicate a successful resolve
        fails the check.
        """
        session_key = ctx.adapter_state.get("last_session_key", "")
        expected_model = query.expected.get("model") or ""
        try:
            response = await self.client._rpc("sessions.resolve", {"key": session_key})
            payload = response.get("payload", {})
            if query.predicate == "absent":
                return StateQueryResult(ok=False, detail="Session exists but should not")
            if expected_model:
                actual = str(payload.get("model", ""))
                if str(expected_model).lower() not in actual.lower():
                    return StateQueryResult(
                        ok=False,
                        detail=f"Model mismatch: expected {expected_model}, got {actual}",
                    )
            return StateQueryResult(ok=True, detail="OK")
        except Exception as exc:
            # NOTE(review): any RPC failure, not only "not found", is treated
            # as proof of absence here - confirm this is intended.
            if query.predicate == "absent":
                return StateQueryResult(ok=True, detail="Correctly absent")
            return StateQueryResult(ok=False, detail=str(exc))

    # --- cron ---

    async def _verify_cron(
        self, query: StateQuery, ctx: AdapterContext
    ) -> StateQueryResult:
        """Check cron jobs reported by `cron.list`.

        When the selector has ``description_contains``, only jobs whose JSON
        serialization contains that text (case-insensitively) are considered,
        for both the "absent" and the existence predicates. Without a
        selector every listed job is considered. RPC or serialization errors
        yield a failed result.
        """
        description_contains = query.selector.get("description_contains")
        try:
            response = await self.client._rpc("cron.list", {})
            jobs = response.get("payload", {}).get("jobs", [])

            if description_contains:
                needle = str(description_contains).lower()
                matching = [job for job in jobs if needle in json.dumps(job).lower()]
            else:
                matching = list(jobs)

            if query.predicate == "absent":
                # Honor the selector: unrelated jobs must not fail an
                # "absent" assertion about one specific job.
                if not matching:
                    return StateQueryResult(ok=True, detail="Correctly absent")
                detail = (
                    f"Cron job matching '{description_contains}' exists"
                    if description_contains
                    else "Cron jobs exist"
                )
                return StateQueryResult(ok=False, detail=detail)

            if not jobs:
                return StateQueryResult(ok=False, detail="No cron jobs found")
            if not matching:
                return StateQueryResult(
                    ok=False,
                    detail=f"No cron job matched '{description_contains}'",
                )
            return StateQueryResult(ok=True, detail="OK")
        except Exception as exc:
            return StateQueryResult(ok=False, detail=str(exc))

    # --- arbitrary gateway RPC ---

    async def _verify_gateway(
        self, query: StateQuery, ctx: AdapterContext
    ) -> StateQueryResult:
        """Call an arbitrary gateway RPC and assert on a path in its payload.

        The selector supplies ``method``, ``params`` and ``assert_path``
        (default ``"$"``); ``expected`` may carry ``equals``, ``contains``
        and ``exists`` (default True). With ``exists`` falsy the resolved
        value must be ``None``; otherwise it must be present and satisfy the
        ``equals`` / ``contains`` expectations that were given. RPC errors
        yield a failed result.
        """
        method = str(query.selector.get("method", ""))
        params = dict(query.selector.get("params", {}))
        assert_path = str(query.selector.get("assert_path", "$"))
        expected_equals = query.expected.get("equals")
        expected_contains = query.expected.get("contains")
        expected_exists = bool(query.expected.get("exists", True))
        try:
            response = await self.client._rpc(method, params)
            payload = response.get("payload", {})
            value = resolve_json_path(payload, assert_path)
            if not expected_exists:
                ok = value is None
                return StateQueryResult(
                    ok=ok,
                    detail="Correctly absent" if ok else "Path exists",
                )
            if value is None:
                return StateQueryResult(
                    ok=False, detail=f"Path {assert_path} not found"
                )
            if expected_equals is not None and value != expected_equals:
                return StateQueryResult(
                    ok=False, detail=f"Expected {expected_equals}, got {value}"
                )
            if (
                expected_contains is not None
                and str(expected_contains).lower() not in str(value).lower()
            ):
                return StateQueryResult(
                    ok=False,
                    detail=f"Expected '{expected_contains}' in {value}",
                )
            return StateQueryResult(ok=True, detail="OK")
        except Exception as exc:
            return StateQueryResult(ok=False, detail=str(exc))
|
||||
|
||||
|
||||
__all__ = ["OpenClawAdapter", "OpenClawAdapterConfig"]
|
||||
45
clawbench/canonical/__init__.py
Normal file
45
clawbench/canonical/__init__.py
Normal file
@ -0,0 +1,45 @@
|
||||
"""Canonical task schema — agent-agnostic intent layer.
|
||||
|
||||
Part of ClawBench Phase-4 per CLAWBENCH_V0_4_SPEC.md §"Canonical Task Schema".
|
||||
Splits canonical task intent (what to set up, prompt with, and verify) from
|
||||
OpenClaw-specific execution details (which become adapter responsibilities).
|
||||
|
||||
The existing `TaskDefinition` in `clawbench/schemas.py` stays as-is for
|
||||
back-compat; this package adds a canonical view produced by
|
||||
`convert.from_task_definition`, which is the single bridge between the two
|
||||
shapes. Everything downstream of the harness (scorer, trajectory, judge,
|
||||
stats) is already agent-agnostic — those modules consume the transcript +
|
||||
TaskRunResult and do not need changes.
|
||||
"""
|
||||
|
||||
from clawbench.canonical.schema import (
|
||||
AdapterCapability,
|
||||
BudgetSpec,
|
||||
CanonicalAssets,
|
||||
CanonicalPhase,
|
||||
CanonicalTask,
|
||||
Deliverable,
|
||||
InteractionPolicy,
|
||||
SeedEntry,
|
||||
StateQuery,
|
||||
StateQueryKind,
|
||||
StateQueryPredicate,
|
||||
VerifierContract,
|
||||
)
|
||||
from clawbench.canonical.convert import from_task_definition
|
||||
|
||||
__all__ = [
|
||||
"AdapterCapability",
|
||||
"BudgetSpec",
|
||||
"CanonicalAssets",
|
||||
"CanonicalPhase",
|
||||
"CanonicalTask",
|
||||
"Deliverable",
|
||||
"InteractionPolicy",
|
||||
"SeedEntry",
|
||||
"StateQuery",
|
||||
"StateQueryKind",
|
||||
"StateQueryPredicate",
|
||||
"VerifierContract",
|
||||
"from_task_definition",
|
||||
]
|
||||
328
clawbench/canonical/convert.py
Normal file
328
clawbench/canonical/convert.py
Normal file
@ -0,0 +1,328 @@
|
||||
"""Convert `TaskDefinition` → `CanonicalTask`.
|
||||
|
||||
This is the single bridge between the existing OpenClaw-entangled task
|
||||
format (`clawbench.schemas.TaskDefinition`) and the agent-agnostic
|
||||
canonical form (`CanonicalTask`). Callers load tasks as usual via
|
||||
`clawbench.tasks.load_all_tasks` and then call
|
||||
`from_task_definition(task)` to get the canonical view.
|
||||
|
||||
Field mappings (any field not mentioned is copied verbatim):
|
||||
|
||||
- `setup.asset_packs` → `assets.seed_state` (kind="file", asset_pack=...)
|
||||
- `setup.workspace_files` → `assets.workspace_files`
|
||||
- `setup.background_services` → `assets.background_services`
|
||||
- `setup.memory_seed` → `assets.seed_state` (kind="memory")
|
||||
- `setup.pre_check_gateway` → `verifier.pre_run_queries` (GATEWAY_RPC)
|
||||
- `completion.files` → `verifier.file_states`
|
||||
- `completion.execution_checks` → `verifier.execution_checks`
|
||||
- `completion.memory` → `verifier.state_queries` (MEMORY)
|
||||
- `completion.session` → `verifier.state_queries` (SESSION)
|
||||
- `completion.cron` → `verifier.state_queries` (CRON)
|
||||
- `completion.gateway_assertions` → `verifier.state_queries` (GATEWAY_RPC)
|
||||
- `trajectory` → `verifier.trajectory`
|
||||
- `behavior` → `verifier.behavior`
|
||||
- `judge` → `verifier.judge`
|
||||
- `user` / `phases` → `phases` via `task.normalized_phases()`
|
||||
- `timeout_seconds` → `budgets.timeout_seconds` (also on each phase)
|
||||
|
||||
`required_adapter_capabilities` is computed from what the task actually
|
||||
needs: always `{FILES, EXECUTION}`, plus `MEMORY`/`SESSION`/`CRON`/
|
||||
`GATEWAY_RPC`/`BROWSER`/`MULTI_TURN_INJECTION` when the source task's
|
||||
fields trigger those capabilities.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from clawbench.canonical.schema import (
|
||||
AdapterCapability,
|
||||
BudgetSpec,
|
||||
CanonicalAssets,
|
||||
CanonicalPhase,
|
||||
CanonicalTask,
|
||||
InteractionPolicy,
|
||||
SeedEntry,
|
||||
StateQuery,
|
||||
VerifierContract,
|
||||
)
|
||||
from clawbench.schemas import (
|
||||
CronState,
|
||||
GatewayAssertion,
|
||||
MemoryState,
|
||||
SessionState,
|
||||
TaskDefinition,
|
||||
TaskFamily,
|
||||
UserTurn,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Seed state
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _seeds_from_setup(task: TaskDefinition) -> list[SeedEntry]:
    """Build canonical seed entries from a task's setup section.

    Every asset pack becomes a ``kind="file"`` seed; every ``memory_seed``
    dict becomes a ``kind="memory"`` seed. File seeds come first, then memory
    seeds, each group in its original order.
    """
    seeds: list[SeedEntry] = [
        SeedEntry(kind="file", asset_pack=pack) for pack in task.setup.asset_packs
    ]
    for entry in task.setup.memory_seed:
        # memory_seed entries are free-form dicts in the existing schema;
        # we preserve them verbatim in `metadata` and surface `key` +
        # `content` when present so adapters can consume the structured
        # pieces without re-parsing.
        raw_key = entry.get("key")
        seeds.append(
            SeedEntry(
                kind="memory",
                # An explicit `key: None` must stay empty; str(None) would
                # produce the bogus key "None".
                key="" if raw_key is None else str(raw_key),
                content=entry.get("value") or entry.get("content"),
                metadata=dict(entry),
            )
        )
    return seeds
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# State queries: memory / session / cron / gateway_assertions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _memory_state_to_query(state: MemoryState) -> StateQuery:
    """Translate a legacy `MemoryState` into a memory `StateQuery`.

    The key pattern goes into the selector; ``value_contains`` is copied into
    the expectation only when it is non-empty.
    """
    if state.exists:
        wanted = "exists"
    else:
        wanted = "absent"

    expectation: dict[str, object] = (
        {"value_contains": list(state.value_contains)} if state.value_contains else {}
    )

    return StateQuery(
        kind="memory",
        predicate=wanted,
        selector={"key_pattern": state.key_pattern},
        expected=expectation,
        required_capability=AdapterCapability.MEMORY,
        description=f"memory key ~ /{state.key_pattern}/",
    )
|
||||
|
||||
|
||||
def _session_state_to_query(state: SessionState) -> StateQuery:
    """Translate a legacy `SessionState` into a session `StateQuery`.

    The selector is empty; the expected model is included only when
    ``model_should_be`` is set.
    """
    if state.should_exist:
        wanted = "exists"
    else:
        wanted = "absent"

    expectation: dict[str, object] = (
        {"model": state.model_should_be} if state.model_should_be else {}
    )

    return StateQuery(
        kind="session",
        predicate=wanted,
        selector={},
        expected=expectation,
        required_capability=AdapterCapability.SESSION,
        description="session state",
    )
|
||||
|
||||
|
||||
def _cron_state_to_query(state: CronState) -> StateQuery:
    """Translate a legacy `CronState` into a cron `StateQuery`.

    ``description_contains`` is placed in the selector only when it is set;
    the expectation is always empty.
    """
    if state.exists:
        wanted = "exists"
    else:
        wanted = "absent"

    job_filter: dict[str, object] = (
        {"description_contains": state.description_contains}
        if state.description_contains
        else {}
    )

    return StateQuery(
        kind="cron",
        predicate=wanted,
        selector=job_filter,
        expected={},
        required_capability=AdapterCapability.CRON,
        description="cron schedule",
    )
|
||||
|
||||
|
||||
def _gateway_assertion_to_query(assertion: GatewayAssertion) -> StateQuery:
    """Translate a legacy `GatewayAssertion` into a ``custom`` `StateQuery`.

    The RPC method, a copy of its params, and the assert path form the
    selector. The expectation always carries ``exists`` and additionally
    ``equals`` / ``contains`` when those are not ``None``. The predicate is
    chosen by precedence: equals, then contains, then absent (when
    ``assert_exists`` is falsy), otherwise exists.
    """
    has_equals = assertion.assert_equals is not None
    has_contains = assertion.assert_contains is not None

    expectation: dict[str, object] = {}
    if has_equals:
        expectation["equals"] = assertion.assert_equals
    if has_contains:
        expectation["contains"] = assertion.assert_contains
    expectation["exists"] = assertion.assert_exists

    if has_equals:
        check = "equals"
    elif has_contains:
        check = "contains"
    elif assertion.assert_exists:
        check = "exists"
    else:
        check = "absent"

    rpc_selector: dict[str, object] = {
        "method": assertion.method,
        "params": dict(assertion.params),
        "assert_path": assertion.assert_path,
    }

    return StateQuery(
        kind="custom",
        predicate=check,
        selector=rpc_selector,
        expected=expectation,
        required_capability=AdapterCapability.GATEWAY_RPC,
        description=f"gateway rpc: {assertion.method}",
    )
|
||||
|
||||
|
||||
def _state_queries_from_completion(task: TaskDefinition) -> list[StateQuery]:
    """Collect post-run state queries from a task's completion section.

    Order is fixed: memory states, then the optional session state, then
    cron states, then gateway assertions.
    """
    completion = task.completion

    collected: list[StateQuery] = [
        _memory_state_to_query(memory_state) for memory_state in completion.memory
    ]
    if completion.session is not None:
        collected.append(_session_state_to_query(completion.session))
    collected.extend(_cron_state_to_query(cron_state) for cron_state in completion.cron)
    collected.extend(
        _gateway_assertion_to_query(rpc_assertion)
        for rpc_assertion in completion.gateway_assertions
    )
    return collected
|
||||
|
||||
|
||||
def _pre_run_queries_from_setup(task: TaskDefinition) -> list[StateQuery]:
    """Convert each ``setup.pre_check_gateway`` assertion into a `StateQuery`."""
    pre_run: list[StateQuery] = []
    for rpc_assertion in task.setup.pre_check_gateway:
        pre_run.append(_gateway_assertion_to_query(rpc_assertion))
    return pre_run
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Phases + dynamic-turn detection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
# Trigger attributes on a user turn; a truthy value in any of them means the
# turn is delivered conditionally rather than unconditionally.
_DYNAMIC_TURN_FIELDS = (
    "when_tool_family",
    "when_tool_name",
    "when_assistant_contains",
    "when_last_tool_failed",
)


def _turn_is_dynamic(turn: UserTurn) -> bool:
    """Return True when `turn` has any truthy trigger condition.

    ``when_last_tool_failed`` is read directly, so the attribute must exist
    on `turn`; the other trigger fields are looked up leniently and treated
    as unset when missing.
    """
    if turn.when_last_tool_failed:
        return True
    return any(
        bool(getattr(turn, field_name, None)) for field_name in _DYNAMIC_TURN_FIELDS
    )
|
||||
|
||||
|
||||
def _phases_from_task(task: TaskDefinition) -> tuple[list[CanonicalPhase], bool]:
    """Return the task's canonical phases and whether any phase is dynamic.

    A phase counts as dynamic when it has more than one user turn or when
    any of its turns carries a trigger condition (see `_turn_is_dynamic`).
    """
    canonical: list[CanonicalPhase] = []
    dynamic_flags: list[bool] = []

    for source_phase in task.normalized_phases():
        canonical.append(
            CanonicalPhase(
                name=source_phase.name,
                user=source_phase.user,
                timeout_seconds=source_phase.timeout_seconds,
            )
        )
        turns = source_phase.user.turns
        dynamic_flags.append(
            len(turns) > 1 or any(_turn_is_dynamic(turn) for turn in turns)
        )

    return canonical, any(dynamic_flags)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Capability inference
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _capabilities_for_task(task: TaskDefinition, *, uses_dynamic: bool) -> set[AdapterCapability]:
    """Infer which adapter capabilities a task needs.

    `FILES` and `EXECUTION` are always required. The others are added when
    the task uses the matching feature: memory checks or keyed memory seeds,
    a session check, cron checks, gateway assertions or pre-checks, the
    browser family, and dynamic user turns (`uses_dynamic`).
    """
    completion = task.completion
    setup = task.setup

    needs_memory = bool(completion.memory) or any(
        seed.get("key") for seed in setup.memory_seed
    )
    optional_requirements = (
        (needs_memory, AdapterCapability.MEMORY),
        (completion.session is not None, AdapterCapability.SESSION),
        (bool(completion.cron), AdapterCapability.CRON),
        (
            bool(completion.gateway_assertions or setup.pre_check_gateway),
            AdapterCapability.GATEWAY_RPC,
        ),
        (task.family == TaskFamily.BROWSER, AdapterCapability.BROWSER),
        (bool(uses_dynamic), AdapterCapability.MULTI_TURN_INJECTION),
    )

    required: set[AdapterCapability] = {
        AdapterCapability.FILES,
        AdapterCapability.EXECUTION,
    }
    required.update(
        capability for is_needed, capability in optional_requirements if is_needed
    )
    return required
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def from_task_definition(task: TaskDefinition) -> CanonicalTask:
    """Build the canonical, agent-agnostic view of a legacy `TaskDefinition`.

    Fields with a canonical equivalent are copied across unchanged (list
    fields are shallow-copied). OpenClaw-only constructs
    (gateway_assertions, pre_check_gateway, memory_seed) are converted by
    the module's helper functions into `StateQuery` / `SeedEntry` entries
    tagged with the capability an adapter needs to resolve them.
    """
    canonical_phases, uses_dynamic = _phases_from_task(task)

    setup = task.setup
    canonical_assets = CanonicalAssets(
        workspace_files=list(setup.workspace_files),
        background_services=list(setup.background_services),
        seed_state=_seeds_from_setup(task),
    )

    completion = task.completion
    verifier_contract = VerifierContract(
        file_states=list(completion.files),
        execution_checks=list(completion.execution_checks),
        state_queries=_state_queries_from_completion(task),
        pre_run_queries=_pre_run_queries_from_setup(task),
        trajectory=task.trajectory,
        behavior=task.behavior,
        judge=task.judge,
    )

    # The interaction cap is the largest per-phase turn budget; 20 is the
    # fallback when the task has no phases.
    per_phase_turn_caps = (phase.user.max_turns for phase in canonical_phases)
    interaction_policy = InteractionPolicy(
        max_turns=max(per_phase_turn_caps, default=20),
        allow_multi_phase=len(canonical_phases) > 1,
        uses_dynamic_user_triggers=uses_dynamic,
    )

    budget_spec = BudgetSpec(timeout_seconds=task.timeout_seconds)

    adapter_capabilities = _capabilities_for_task(task, uses_dynamic=uses_dynamic)

    return CanonicalTask(
        # Identity and taxonomy.
        id=task.id,
        name=task.name,
        tier=task.tier,
        family=task.family,
        surface=task.surface,
        scenario=task.scenario,
        subscenario=task.subscenario,
        capabilities=list(task.capabilities),
        atomic_capabilities=list(task.atomic_capabilities),
        # Pool / rotation / provenance.
        pool=task.pool,
        subsets=list(task.subsets),
        variant_group=task.variant_group,
        variant_id=task.variant_id,
        template_id=task.template_id,
        release_id=task.release_id,
        source_kind=task.source_kind,
        provenance_ids=list(task.provenance_ids),
        privacy_tier=task.privacy_tier,
        contamination_risk=task.contamination_risk,
        freshness_epoch=task.freshness_epoch,
        category=task.category,
        domain=task.domain,
        functionality=list(task.functionality),
        trace_distribution=list(task.trace_distribution),
        tool_surface=list(task.tool_surface),
        risk_tags=list(task.risk_tags),
        first_used_at=task.first_used_at,
        retire_after_runs=task.retire_after_runs,
        similarity_hash=task.similarity_hash,
        canary_token=task.canary_token,
        official=task.official,
        # Policy + prompts.
        query_difficulty=task.query_difficulty,
        query_weight=task.query_weight,
        artifact_type=task.artifact_type,
        preconditions=list(task.preconditions),
        source_dataset=task.source_dataset,
        prompt_variants=list(task.prompt_variants),
        pass_threshold=task.pass_threshold,
        # Canonical body.
        assets=canonical_assets,
        phases=canonical_phases,
        verifier=verifier_contract,
        budgets=budget_spec,
        interaction=interaction_policy,
        deliverables=[],
        # Adapter gating.
        required_adapter_capabilities=adapter_capabilities,
    )
|
||||
296
clawbench/canonical/schema.py
Normal file
296
clawbench/canonical/schema.py
Normal file
@ -0,0 +1,296 @@
|
||||
"""Canonical task schema — agent-agnostic intent.
|
||||
|
||||
This is the Phase-4 split of `TaskDefinition` (see CLAWBENCH_V0_4_SPEC.md
|
||||
§"Canonical Task Schema"). The canonical layer expresses **what** a task
|
||||
is — its identity, prompts, assets, and verification contract — without
|
||||
saying **how** it gets executed. The "how" (gateway RPCs, session
|
||||
lifecycle, tool-family normalization) lives in per-adapter code under
|
||||
`clawbench/adapters/`.
|
||||
|
||||
The rule of thumb:
|
||||
|
||||
- If a field describes what the user asked for, what files/state the
|
||||
agent is expected to produce, or what the run must satisfy to pass,
|
||||
it belongs here.
|
||||
- If a field describes how OpenClaw's gateway is called to drive the
|
||||
run or read back state, it belongs in the OpenClaw adapter (and the
|
||||
canonical version of that check is a `StateQuery` with a
|
||||
`required_capability`).
|
||||
|
||||
Converting from `TaskDefinition` → `CanonicalTask` is lossless for fields
|
||||
that have a canonical equivalent; OpenClaw-only fields (like
|
||||
`pre_check_gateway` and `gateway_assertions`) survive as `StateQuery`
|
||||
entries tagged with `AdapterCapability.GATEWAY_RPC`, so adapters that
|
||||
support them can still resolve them while adapters that don't can cleanly
|
||||
report a capability gap.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import enum
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import BaseModel, Field, model_validator
|
||||
|
||||
from clawbench.schemas import (
|
||||
ArtifactType,
|
||||
BackgroundService,
|
||||
BehaviorExpectations,
|
||||
CapabilityTag,
|
||||
ExecutionCheck,
|
||||
FileState,
|
||||
JudgeExpectations,
|
||||
PromptVariant,
|
||||
QueryDifficulty,
|
||||
ScenarioDomain,
|
||||
SimulatedUser,
|
||||
TaskFamily,
|
||||
TaskPool,
|
||||
TaskSubset,
|
||||
Tier,
|
||||
TrajectoryExpectations,
|
||||
)
|
||||
|
||||
|
||||
class AdapterCapability(str, enum.Enum):
    """Facilities an adapter can offer to a running task.

    Every `StateQuery` names the capability it needs through
    `required_capability`. When the selected adapter's `capabilities` set
    lacks it, the harness either skips the whole task (strict mode) or
    scores that query as neutral (partial mode), so the leaderboard stays
    honest about what an adapter can actually evaluate.
    """

    FILES = "files"
    EXECUTION = "execution"
    MEMORY = "memory"
    SESSION = "session"
    CRON = "cron"
    BROWSER = "browser"
    GATEWAY_RPC = "gateway_rpc"
    # Ability to deliver extra user turns mid-trajectory when a
    # simulated-user trigger fires (when_tool_family,
    # when_assistant_contains, etc). Single-shot drivers such as Hermes's
    # MiniSWERunner do not provide this.
    MULTI_TURN_INJECTION = "multi_turn_injection"
|
||||
|
||||
|
||||
StateQueryKind = Literal["memory", "session", "cron", "custom"]
|
||||
StateQueryPredicate = Literal["exists", "absent", "equals", "contains"]
|
||||
|
||||
|
||||
class StateQuery(BaseModel):
    """Abstract assertion about state, resolved by whichever adapter is active.

    The canonical layer says *what* must hold, not how it is read. For
    instance, `kind="memory"` with `selector={"key_pattern":"alpha"}` and
    `expected={"value_contains":["foo"]}` means "some memory entry has a
    key matching /alpha/ and a value containing 'foo'". OpenClaw's adapter
    answers that through the `memory.search` gateway RPC, while a
    filesystem-memory adapter (e.g. Hermes) scans `MEMORY.md` /
    `memory/notes.md` in the workspace.

    The harness compares `required_capability` against the adapter's
    declared capability set.
    """

    # What kind of state is being asserted on.
    kind: StateQueryKind
    # How `expected` is compared with the resolved state.
    predicate: StateQueryPredicate = "exists"
    # Adapter-interpreted description of which state entries to look at.
    selector: dict[str, Any] = Field(default_factory=dict)
    # Adapter-interpreted description of what those entries must look like.
    expected: dict[str, Any] = Field(default_factory=dict)
    required_capability: AdapterCapability
    description: str = ""
|
||||
|
||||
|
||||
class SeedEntry(BaseModel):
    """One item of pre-task state to place into the workspace.

    `kind="file"`: the adapter writes `content` to `path` inside the
    workspace, or copies a bundled asset named by `asset_pack`.
    `kind="memory"`: the adapter seeds a memory entry from `key` and
    `content`. Adapters without memory support fall back to writing the
    seed as a file (see `environment_files.verify_memory_fallback`).
    """

    kind: Literal["file", "memory"]
    path: str | None = None
    content: str | None = None
    key: str | None = None
    asset_pack: str = ""
    metadata: dict[str, Any] = Field(default_factory=dict)

    @model_validator(mode="after")
    def _validate_shape(self) -> SeedEntry:
        """Reject entries missing the locator their `kind` requires."""
        file_has_target = bool(self.path) or bool(self.asset_pack)
        if self.kind == "file" and not file_has_target:
            raise ValueError("SeedEntry(kind='file') requires `path` or `asset_pack`.")
        memory_has_key = bool(self.key)
        if self.kind == "memory" and not memory_has_key:
            raise ValueError("SeedEntry(kind='memory') requires `key`.")
        return self
|
||||
|
||||
|
||||
class Deliverable(BaseModel):
    """An artifact, visible to the user, that the task should produce."""

    kind: ArtifactType
    paths: list[str] = Field(default_factory=list)
    description: str = ""
|
||||
|
||||
|
||||
class BudgetSpec(BaseModel):
    """Execution budgets applied to a single task.

    `timeout_seconds` covers the whole run (all phases) in wall-clock
    time. `max_tool_calls=0` means no limit within that timeout. Adapters
    are expected to treat these as soft caps; the harness additionally
    enforces the timeout as a hard deadline.
    """

    timeout_seconds: int = 180
    max_tool_calls: int = 0
    per_turn_timeout_seconds: int = 0
|
||||
|
||||
|
||||
class InteractionPolicy(BaseModel):
    """Rules for how the canonical phases drive the agent."""

    max_turns: int = 20
    allow_multi_phase: bool = True
    # True when the simulated user sends follow-up turns in response to
    # trajectory triggers rather than plain turn counts. Adapters lacking
    # MULTI_TURN_INJECTION cannot deliver these dynamically.
    uses_dynamic_user_triggers: bool = False
|
||||
|
||||
|
||||
class VerifierContract(BaseModel):
    """All inputs required to score a run, regardless of how it was executed.

    File and execution checks are fully agent-agnostic: `environment_files`
    evaluates them directly against the workspace. State queries are
    resolved through `adapter.verify_state_query`. Trajectory and behavior
    expectations are evaluated against the `Transcript`, which is already
    agent-agnostic. The optional judge rubric is evaluated against
    artifacts + transcript + completion feedback.
    """

    file_states: list[FileState] = Field(default_factory=list)
    execution_checks: list[ExecutionCheck] = Field(default_factory=list)
    state_queries: list[StateQuery] = Field(default_factory=list)
    pre_run_queries: list[StateQuery] = Field(default_factory=list)
    trajectory: TrajectoryExpectations = Field(default_factory=TrajectoryExpectations)
    behavior: BehaviorExpectations = Field(default_factory=BehaviorExpectations)
    judge: JudgeExpectations | None = None
|
||||
|
||||
|
||||
class CanonicalAssets(BaseModel):
    """Workspace contents and seed state realized by the harness before any phase runs.

    `workspace_files` lists relative paths, resolved against the task's
    assets/ dir, that are copied into the workspace.
    `background_services` is already canonical (subprocess + readiness
    probe, no OpenClaw coupling). `seed_state` is a uniform per-entry list
    that replaces `asset_packs` + `memory_seed`.
    """

    workspace_files: list[str] = Field(default_factory=list)
    background_services: list[BackgroundService] = Field(default_factory=list)
    seed_state: list[SeedEntry] = Field(default_factory=list)
|
||||
|
||||
|
||||
class CanonicalPhase(BaseModel):
    """A single simulated-user phase of a (possibly multi-phase) task.

    `user` is `clawbench.schemas.SimulatedUser` reused as-is, since it is
    already agent-agnostic (turn text + canonical trigger predicates).
    Whether a given trigger actually fires on a given adapter depends on
    whether tool-family tags are populated, which is the adapter's
    responsibility.
    """

    name: str
    user: SimulatedUser
    timeout_seconds: int | None = None
|
||||
|
||||
|
||||
class CanonicalTask(BaseModel):
    """Task definition expressed without reference to any particular agent.

    `convert.from_task_definition` produces it from an existing
    `TaskDefinition`. Adapters consume it via `AdapterContext`, as do the
    scorer and the trajectory/judge layers. Nothing here is
    OpenClaw-specific; OpenClaw-only semantics are carried as `StateQuery`
    entries with `required_capability=GATEWAY_RPC`.
    """

    # --- Identity and taxonomy (already canonical in TaskDefinition) ---
    id: str
    name: str
    tier: Tier
    family: TaskFamily
    surface: str
    scenario: ScenarioDomain | None = None
    subscenario: str = ""
    capabilities: list[CapabilityTag] = Field(default_factory=list)
    atomic_capabilities: list[str] = Field(default_factory=list)

    # --- Pool / rotation / provenance ---
    pool: TaskPool = TaskPool.PUBLIC_DEV
    subsets: list[TaskSubset] = Field(default_factory=list)
    variant_group: str = ""
    variant_id: str = "main"
    template_id: str = ""
    release_id: str = ""
    source_kind: str = ""
    provenance_ids: list[str] = Field(default_factory=list)
    privacy_tier: str = ""
    contamination_risk: str = ""
    freshness_epoch: str = ""
    category: str = ""
    domain: str = ""
    functionality: list[str] = Field(default_factory=list)
    trace_distribution: list[str] = Field(default_factory=list)
    tool_surface: list[str] = Field(default_factory=list)
    risk_tags: list[str] = Field(default_factory=list)
    first_used_at: str = ""
    retire_after_runs: int = 0
    similarity_hash: str = ""
    canary_token: str = ""
    official: bool = False

    # --- Policy + prompts ---
    query_difficulty: QueryDifficulty | None = None
    query_weight: float = 1.0
    artifact_type: ArtifactType | None = None
    preconditions: list[str] = Field(default_factory=list)
    source_dataset: str = ""
    prompt_variants: list[PromptVariant] = Field(default_factory=lambda: [PromptVariant.CLEAR])
    pass_threshold: float = 0.7

    # --- Canonical body ---
    assets: CanonicalAssets = Field(default_factory=CanonicalAssets)
    phases: list[CanonicalPhase]
    verifier: VerifierContract = Field(default_factory=VerifierContract)
    budgets: BudgetSpec = Field(default_factory=BudgetSpec)
    interaction: InteractionPolicy = Field(default_factory=InteractionPolicy)
    deliverables: list[Deliverable] = Field(default_factory=list)

    # --- Adapter gating ---
    required_adapter_capabilities: set[AdapterCapability] = Field(default_factory=set)

    # Forward-compat: allows this schema to evolve while hidden / external
    # task manifests continue to validate.
    schema_version: str = "1"

    @model_validator(mode="after")
    def _defaults(self) -> CanonicalTask:
        """Fill in derived defaults once field validation has finished.

        An empty `variant_group` falls back to the task id. An empty
        `prompt_variants` becomes `[PromptVariant.CLEAR]`; otherwise
        repeated variants are dropped, keeping first-occurrence order.
        """
        if not self.variant_group:
            self.variant_group = self.id

        requested = self.prompt_variants
        if requested:
            # Keep an entry only if no equal entry precedes it.
            self.prompt_variants = [
                variant
                for position, variant in enumerate(requested)
                if variant not in requested[:position]
            ]
        else:
            self.prompt_variants = [PromptVariant.CLEAR]
        return self
|
||||
@ -46,9 +46,16 @@ def cli(verbose: bool) -> None:
|
||||
type=click.Choice(KNOWN_ADAPTERS),
|
||||
default="openclaw",
|
||||
show_default=True,
|
||||
help="Agent harness adapter. OpenClaw is executable today; other adapters are tracked targets.",
|
||||
help="Agent harness adapter. OpenClaw uses the gateway; Hermes runs hermes-agent locally.",
|
||||
)
|
||||
@click.option("--gateway-token", envvar="OPENCLAW_GATEWAY_TOKEN", default="", help="Gateway auth token")
|
||||
@click.option(
|
||||
"--gateway-url",
|
||||
envvar="OPENCLAW_GATEWAY_URL",
|
||||
default="ws://localhost:18789",
|
||||
show_default=True,
|
||||
help="OpenClaw gateway websocket URL",
|
||||
)
|
||||
@click.option(
|
||||
"--judge-model",
|
||||
envvar="CLAWBENCH_JUDGE_MODEL",
|
||||
@ -116,6 +123,11 @@ def cli(verbose: bool) -> None:
|
||||
"completes the v0.5 Configuration Diagnostic Report is generated and "
|
||||
"the run is recorded in the historical profile database.",
|
||||
)
|
||||
@click.option(
|
||||
"--tool-profile",
|
||||
default=None,
|
||||
help="Optional label for the tool/profile axis recorded in result metadata.",
|
||||
)
|
||||
@click.option(
|
||||
"--insights-dir",
|
||||
type=click.Path(path_type=Path),
|
||||
@ -132,6 +144,7 @@ def run(
|
||||
model: str,
|
||||
adapter: str,
|
||||
gateway_token: str,
|
||||
gateway_url: str,
|
||||
judge_model: str,
|
||||
runs: int,
|
||||
tier: str | None,
|
||||
@ -149,10 +162,11 @@ def run(
|
||||
concurrency: int,
|
||||
browser_concurrency: int,
|
||||
profile: Path | None,
|
||||
tool_profile: str | None,
|
||||
insights_dir: Path,
|
||||
dynamics: bool,
|
||||
) -> None:
|
||||
gateway_config = GatewayConfig(token=gateway_token)
|
||||
gateway_config = GatewayConfig(url=gateway_url, token=gateway_token)
|
||||
harness = BenchmarkHarness(
|
||||
gateway_config=gateway_config,
|
||||
model=model,
|
||||
@ -171,6 +185,7 @@ def run(
|
||||
randomize_order=not no_randomize,
|
||||
concurrency=concurrency,
|
||||
browser_concurrency=browser_concurrency,
|
||||
tool_profile_name=tool_profile,
|
||||
)
|
||||
|
||||
result = asyncio.run(harness.run())
|
||||
@ -198,6 +213,40 @@ def run(
|
||||
asyncio.run(upload_result(result))
|
||||
|
||||
|
||||
@cli.command("compare-results")
@click.argument("results", nargs=-1, type=click.Path(exists=True, path_type=Path), required=True)
@click.option("--json-out", is_flag=True, help="Print machine-readable comparison JSON.")
def compare_results_cmd(results: tuple[Path, ...], json_out: bool) -> None:
    """Compare BenchmarkResult JSON files with fairness checks.

    Each file is labelled by its file name without the extension. When
    several files share that name, the path without the extension is used
    instead so that every input appears in the comparison.
    """
    from collections import Counter

    from clawbench.ablation import compare_results
    from clawbench.schemas import BenchmarkResult

    # Keying purely by stem would let `runA/result.json` silently replace
    # `runB/result.json`; fall back to the path whenever a stem is shared.
    stem_counts = Counter(path.stem for path in results)
    loaded: dict[str, BenchmarkResult] = {}
    for index, path in enumerate(results):
        if stem_counts[path.stem] == 1:
            label = path.stem
        else:
            label = path.with_suffix("").as_posix()
        if label in loaded:
            # The very same path was passed more than once; keep both rows.
            label = f"{label}#{index}"
        with path.open(encoding="utf-8") as handle:
            loaded[label] = BenchmarkResult(**json.load(handle))

    comparison = compare_results(loaded)
    if json_out:
        click.echo(json.dumps(comparison, indent=2, default=str))
        return

    click.echo(f"Task/verifier fair: {comparison['task_verifier_fair']}")
    click.echo(f"Controlled ablation: {comparison['controlled_ablation']}")
    click.echo(f"Same model: {comparison['same_model']}")
    click.echo(f"Same task set: {comparison['same_task_set']}")
    click.echo(f"Same task snapshot: {comparison['same_task_snapshot']}")
    click.echo(f"Same prompt variant: {comparison['same_prompt_variant']}")
    for label, row in comparison["rows"].items():
        click.echo(
            f"{label}: model={row['model']} adapter={row['adapter']} "
            f"tasks={row['task_count']} score={row['score']:.3f} "
            f"C={row['completion']:.3f} T={row['trajectory']:.3f} "
            f"B={row['behavior']:.3f} R={row['reliability']:.3f}"
        )
    for label, delta in comparison["deltas"].items():
        click.echo(f"{label}: {delta:+.3f}")
|
||||
|
||||
|
||||
@cli.command("dynamics-report")
|
||||
@click.option(
|
||||
"--archive-dir",
|
||||
@ -797,6 +846,20 @@ def show(result_file: str) -> None:
|
||||
)
|
||||
console.print(f" [bold]pass^k reliability: {result.overall_pass_hat_k:.0%}[/]\n")
|
||||
|
||||
for label, dimension_items in (
|
||||
("Category", result.category_results),
|
||||
("Domain", result.domain_results),
|
||||
):
|
||||
if not dimension_items:
|
||||
continue
|
||||
summary = ", ".join(
|
||||
f"{item.value}={item.weighted_score:.3f}"
|
||||
for item in sorted(dimension_items, key=lambda item: item.value)
|
||||
)
|
||||
console.print(f" [bold]{label}:[/] {summary}")
|
||||
if result.category_results or result.domain_results:
|
||||
console.print()
|
||||
|
||||
for task in result.task_results:
|
||||
color = "green" if task.mean_task_score >= 0.7 else "yellow" if task.mean_task_score >= 0.4 else "red"
|
||||
top_failure = max(task.failure_mode_counts.items(), key=lambda item: item[1])[0] if task.failure_mode_counts else "-"
|
||||
|
||||
@ -226,14 +226,81 @@ class GatewayClient:
|
||||
attempt += 1
|
||||
try:
|
||||
remaining = max(1.0, deadline - asyncio.get_running_loop().time())
|
||||
attempt_timeout = min(30.0, remaining)
|
||||
self._ws = await websockets.connect(
|
||||
self.config.url,
|
||||
max_size=10 * 1024 * 1024,
|
||||
open_timeout=min(self.config.connect_timeout, remaining),
|
||||
open_timeout=attempt_timeout,
|
||||
additional_headers={"Origin": host},
|
||||
# The benchmark uses loopback gateway sockets and can issue
|
||||
# long-lived RPCs (notably agent.wait while a provider call
|
||||
# is in flight). Python websockets' default keepalive can
|
||||
# close the connection before the gateway surfaces the
|
||||
# actual model/provider result, contaminating runs as infra
|
||||
# timeouts. The gateway already owns run-level timeouts.
|
||||
ping_interval=None,
|
||||
ping_timeout=None,
|
||||
)
|
||||
break
|
||||
self._listen_task = asyncio.create_task(self._listener())
|
||||
challenge = await self._wait_event(
|
||||
"connect.challenge", timeout=attempt_timeout
|
||||
)
|
||||
challenge_payload = challenge.get("payload", {})
|
||||
nonce = ""
|
||||
if isinstance(challenge_payload, dict):
|
||||
raw_nonce = challenge_payload.get("nonce", "")
|
||||
if isinstance(raw_nonce, str):
|
||||
nonce = raw_nonce.strip()
|
||||
|
||||
role = "operator"
|
||||
scopes = [
|
||||
"operator.admin",
|
||||
"operator.read",
|
||||
"operator.write",
|
||||
"operator.approvals",
|
||||
"operator.pairing",
|
||||
]
|
||||
client_info = {
|
||||
"id": "openclaw-control-ui",
|
||||
"version": __version__,
|
||||
"platform": "linux",
|
||||
"mode": "ui",
|
||||
}
|
||||
connect_params: dict[str, Any] = {
|
||||
"minProtocol": PROTOCOL_VERSION,
|
||||
"maxProtocol": PROTOCOL_VERSION,
|
||||
"client": client_info,
|
||||
"role": role,
|
||||
"scopes": scopes,
|
||||
"caps": [],
|
||||
"commands": [],
|
||||
"permissions": {},
|
||||
"auth": {"token": self.config.token} if self.config.token else {},
|
||||
}
|
||||
device = _build_connect_device(
|
||||
nonce=nonce,
|
||||
token=self.config.token,
|
||||
client_id=str(client_info["id"]),
|
||||
client_mode=str(client_info["mode"]),
|
||||
role=role,
|
||||
scopes=scopes,
|
||||
platform=str(client_info["platform"]),
|
||||
)
|
||||
if device:
|
||||
connect_params["device"] = device
|
||||
|
||||
response = await self._rpc(
|
||||
"connect",
|
||||
connect_params,
|
||||
timeout=attempt_timeout,
|
||||
)
|
||||
payload = response.get("payload", {})
|
||||
if payload.get("type") != "hello-ok":
|
||||
raise ConnectionError(f"Expected hello-ok, got: {payload}")
|
||||
logger.info("Connected to gateway (protocol v%s)", payload.get("protocol", "?"))
|
||||
return
|
||||
except Exception as exc:
|
||||
await self.close()
|
||||
if not _is_transient_gateway_connect_error(exc):
|
||||
raise
|
||||
if asyncio.get_running_loop().time() >= deadline:
|
||||
@ -245,60 +312,6 @@ class GatewayClient:
|
||||
delay,
|
||||
)
|
||||
await asyncio.sleep(delay)
|
||||
self._listen_task = asyncio.create_task(self._listener())
|
||||
challenge = await self._wait_event("connect.challenge", timeout=self.config.connect_timeout)
|
||||
challenge_payload = challenge.get("payload", {})
|
||||
nonce = ""
|
||||
if isinstance(challenge_payload, dict):
|
||||
raw_nonce = challenge_payload.get("nonce", "")
|
||||
if isinstance(raw_nonce, str):
|
||||
nonce = raw_nonce.strip()
|
||||
|
||||
role = "operator"
|
||||
scopes = [
|
||||
"operator.admin",
|
||||
"operator.read",
|
||||
"operator.write",
|
||||
"operator.approvals",
|
||||
"operator.pairing",
|
||||
]
|
||||
client_info = {
|
||||
"id": "openclaw-control-ui",
|
||||
"version": __version__,
|
||||
"platform": "linux",
|
||||
"mode": "ui",
|
||||
}
|
||||
connect_params: dict[str, Any] = {
|
||||
"minProtocol": PROTOCOL_VERSION,
|
||||
"maxProtocol": PROTOCOL_VERSION,
|
||||
"client": client_info,
|
||||
"role": role,
|
||||
"scopes": scopes,
|
||||
"caps": [],
|
||||
"commands": [],
|
||||
"permissions": {},
|
||||
"auth": {"token": self.config.token} if self.config.token else {},
|
||||
}
|
||||
device = _build_connect_device(
|
||||
nonce=nonce,
|
||||
token=self.config.token,
|
||||
client_id=str(client_info["id"]),
|
||||
client_mode=str(client_info["mode"]),
|
||||
role=role,
|
||||
scopes=scopes,
|
||||
platform=str(client_info["platform"]),
|
||||
)
|
||||
if device:
|
||||
connect_params["device"] = device
|
||||
|
||||
response = await self._rpc(
|
||||
"connect",
|
||||
connect_params,
|
||||
)
|
||||
payload = response.get("payload", {})
|
||||
if payload.get("type") != "hello-ok":
|
||||
raise ConnectionError(f"Expected hello-ok, got: {payload}")
|
||||
logger.info("Connected to gateway (protocol v%s)", payload.get("protocol", "?"))
|
||||
|
||||
async def close(self) -> None:
|
||||
if self._listen_task and not self._listen_task.done():
|
||||
@ -394,6 +407,15 @@ class GatewayClient:
|
||||
except Exception as exc:
|
||||
logger.warning("Failed to delete session %s: %s", session_key, exc)
|
||||
|
||||
async def abort_session(self, session_key: str, *, run_id: str | None = None) -> None:
|
||||
params: dict[str, Any] = {"key": session_key}
|
||||
if run_id:
|
||||
params["runId"] = run_id
|
||||
try:
|
||||
await self._rpc("sessions.abort", params, timeout=min(self.config.request_timeout, 10.0))
|
||||
except Exception as exc:
|
||||
logger.warning("Failed to abort session %s run %s: %s", session_key, run_id or "-", exc)
|
||||
|
||||
async def get_effective_tools(self, session_key: str) -> dict[str, Any]:
|
||||
response = await self._rpc("tools.effective", {"sessionKey": session_key})
|
||||
return response.get("payload", {})
|
||||
@ -413,15 +435,27 @@ class GatewayClient:
|
||||
msg_queue: asyncio.Queue[dict[str, Any]] = asyncio.Queue()
|
||||
self._event_queues[chat_queue_key] = chat_queue
|
||||
self._event_queues[msg_queue_key] = msg_queue
|
||||
timeout_ms = max(1, min(int(timeout * 1000), 2_147_483_647))
|
||||
|
||||
await self._rpc(
|
||||
send_response = await self._rpc(
|
||||
"sessions.send",
|
||||
{
|
||||
"key": session_key,
|
||||
"message": message,
|
||||
"idempotencyKey": idempotency_key,
|
||||
"timeoutMs": timeout_ms,
|
||||
},
|
||||
)
|
||||
send_payload = send_response.get("payload", {})
|
||||
run_id = idempotency_key
|
||||
if isinstance(send_payload, dict):
|
||||
raw_run_id = send_payload.get("runId")
|
||||
if isinstance(raw_run_id, str) and raw_run_id.strip():
|
||||
run_id = raw_run_id.strip()
|
||||
|
||||
wait_task = asyncio.create_task(
|
||||
self._wait_for_agent_run(run_id, timeout_ms=timeout_ms)
|
||||
)
|
||||
|
||||
collected_messages: list[TranscriptMessage] = []
|
||||
done = False
|
||||
@ -430,8 +464,31 @@ class GatewayClient:
|
||||
while not done:
|
||||
remaining = deadline - asyncio.get_running_loop().time()
|
||||
if remaining <= 0:
|
||||
logger.warning("Timeout waiting for final state on session %s", session_key)
|
||||
logger.warning(
|
||||
"Timeout waiting for final state on session %s run %s",
|
||||
session_key,
|
||||
run_id,
|
||||
)
|
||||
break
|
||||
if wait_task.done():
|
||||
wait_payload = _task_result_or_empty(wait_task)
|
||||
status = str(wait_payload.get("status", ""))
|
||||
if status and status != "timeout":
|
||||
logger.info(
|
||||
"agent.wait observed terminal status for session %s run %s: %s",
|
||||
session_key,
|
||||
run_id,
|
||||
status,
|
||||
)
|
||||
done = True
|
||||
break
|
||||
if status == "timeout":
|
||||
logger.warning(
|
||||
"agent.wait timed out for session %s run %s",
|
||||
session_key,
|
||||
run_id,
|
||||
)
|
||||
break
|
||||
try:
|
||||
event = await asyncio.wait_for(chat_queue.get(), timeout=min(0.5, remaining))
|
||||
state = event.get("payload", {}).get("state", "")
|
||||
@ -440,6 +497,9 @@ class GatewayClient:
|
||||
except asyncio.TimeoutError:
|
||||
pass
|
||||
|
||||
if not done:
|
||||
await self.abort_session(session_key, run_id=run_id)
|
||||
|
||||
collected_messages.extend(
|
||||
await _drain_message_queue(
|
||||
msg_queue,
|
||||
@ -464,11 +524,30 @@ class GatewayClient:
|
||||
):
|
||||
collected_messages = history_messages
|
||||
finally:
|
||||
if not wait_task.done():
|
||||
wait_task.cancel()
|
||||
try:
|
||||
await wait_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
self._event_queues.pop(chat_queue_key, None)
|
||||
self._event_queues.pop(msg_queue_key, None)
|
||||
|
||||
return _correlate_transcript(Transcript(messages=collected_messages))
|
||||
|
||||
async def _wait_for_agent_run(self, run_id: str, *, timeout_ms: int) -> dict[str, Any]:
    """Issue `agent.wait` for a run and return its payload.

    The RPC's own timeout is the requested wait plus 10 seconds
    (presumably so the gateway's reply can arrive before the client gives
    up; confirm against the gateway contract).

    Returns:
        The response payload when it is a dict. An empty dict when the
        RPC raised (logged as a warning) or the payload is not a dict.
    """
    try:
        reply = await self._rpc(
            "agent.wait",
            {"runId": run_id, "timeoutMs": timeout_ms},
            timeout=(timeout_ms / 1000.0) + 10.0,
        )
    except Exception as exc:
        logger.warning("agent.wait failed for run %s: %s", run_id, exc)
        return {}

    body = reply.get("payload", {})
    if isinstance(body, dict):
        return body
    return {}
|
||||
|
||||
async def get_session_messages(self, session_key: str) -> list[TranscriptMessage]:
|
||||
try:
|
||||
response = await self._rpc("sessions.get", {"key": session_key})
|
||||
@ -574,6 +653,13 @@ def _build_connect_device(
|
||||
platform: str,
|
||||
device_family: str | None = None,
|
||||
) -> dict[str, Any] | None:
|
||||
if os.environ.get("CLAWBENCH_DISABLE_GATEWAY_DEVICE_IDENTITY", "").strip().lower() in {
|
||||
"1",
|
||||
"true",
|
||||
"yes",
|
||||
"on",
|
||||
}:
|
||||
return None
|
||||
if not nonce:
|
||||
return None
|
||||
|
||||
@ -643,6 +729,10 @@ def _resolve_node_executable() -> str | None:
|
||||
|
||||
|
||||
def _is_transient_gateway_connect_error(exc: Exception) -> bool:
|
||||
if isinstance(exc, (TimeoutError, asyncio.TimeoutError)):
|
||||
return True
|
||||
if isinstance(exc, websockets.exceptions.ConnectionClosed):
|
||||
return True
|
||||
if isinstance(exc, InvalidStatus):
|
||||
return exc.response.status_code in {502, 503, 504}
|
||||
if isinstance(exc, InvalidMessage):
|
||||
@ -658,6 +748,13 @@ def _describe_connect_error(exc: Exception) -> str:
|
||||
return exc.__class__.__name__
|
||||
|
||||
|
||||
def _task_result_or_empty(task: asyncio.Task[dict[str, Any]]) -> dict[str, Any]:
    """Return the task's result, or an empty dict when none is available.

    An empty dict is returned when the task raised, was cancelled, or has
    not finished yet (`Task.result()` raises `InvalidStateError` then).

    `asyncio.CancelledError` derives from `BaseException`, so it has to be
    caught explicitly. Here it can only describe the state of `task`,
    because `Task.result()` is a synchronous call, so swallowing it does
    not hide cancellation of the caller.
    """
    try:
        return task.result()
    except (Exception, asyncio.CancelledError):
        return {}
|
||||
|
||||
|
||||
def _parse_single_message(message_data: dict[str, Any]) -> TranscriptMessage | None:
|
||||
role = message_data.get("role", "")
|
||||
if not role:
|
||||
|
||||
@ -1,17 +1,44 @@
|
||||
"""Completion verification for ClawBench v0.3."""
|
||||
"""Completion verification — OpenClaw-aware entry point.
|
||||
|
||||
Historically this module contained both agent-agnostic verification
|
||||
primitives (file states, execution checks, workspace memory scans, JSON
|
||||
path resolution) and OpenClaw-specific verifiers that reach into the
|
||||
gateway via RPCs (`memory.search`, `sessions.resolve`, `cron.list`,
|
||||
arbitrary `_rpc(method)`).
|
||||
|
||||
Phase-4 splits them:
|
||||
|
||||
- The agent-agnostic primitives now live in `clawbench.environment_files`
|
||||
and are used by every adapter.
|
||||
- The OpenClaw-specific primitives stay here for now and will move into
|
||||
`clawbench/adapters/openclaw.py` once the adapter wiring lands in a
|
||||
later step.
|
||||
|
||||
The public surface — `verify_completion`, `run_execution_check`, module-
|
||||
level helpers — stays unchanged so existing callers (harness, scorer,
|
||||
tests) keep working. Function bodies that used to do real work now
|
||||
delegate to `environment_files` to keep behavior identical.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import shlex
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from clawbench.client import GatewayClient
|
||||
from clawbench.render import render_template, render_value
|
||||
from clawbench.environment_files import (
|
||||
MEMORY_FILE_CANDIDATES,
|
||||
evaluate_execution_result as _evaluate_execution_result_impl,
|
||||
memory_visible_in_transcript as _memory_visible_in_transcript_impl,
|
||||
read_workspace_memory_text,
|
||||
resolve_json_path,
|
||||
run_execution_check as _run_execution_check_impl,
|
||||
verify_file_state as _verify_file_state_impl,
|
||||
verify_memory_fallback,
|
||||
)
|
||||
from clawbench.schemas import (
|
||||
CompletionResult,
|
||||
CompletionSpec,
|
||||
@ -52,7 +79,9 @@ async def verify_completion(
|
||||
failures.append(f"FILE {spec.path}: {reason}")
|
||||
|
||||
for spec in completion.memory:
|
||||
ok, reason = await _verify_memory(spec, client, session_key, agent_id=agent_id, transcript=transcript)
|
||||
ok, reason = await _verify_memory(
|
||||
spec, client, session_key, agent_id=agent_id, transcript=transcript, workspace=workspace
|
||||
)
|
||||
total += 1
|
||||
if ok:
|
||||
passed += 1
|
||||
@ -102,82 +131,20 @@ async def verify_completion(
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Agent-agnostic primitives — re-exported via delegates so historical
|
||||
# callers that import from `clawbench.environment` keep working.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def run_execution_check(
|
||||
spec: ExecutionCheck,
|
||||
*,
|
||||
workspace: Path,
|
||||
runtime_values: dict[str, Any],
|
||||
) -> ExecutionCheckResult:
|
||||
rendered_command = render_template(spec.command, runtime_values)
|
||||
rendered_cwd = workspace / render_template(spec.cwd, runtime_values)
|
||||
rendered_env = render_value(spec.env, runtime_values)
|
||||
import os
|
||||
import sys
|
||||
|
||||
full_env = {
|
||||
**os.environ,
|
||||
**{key: str(value) for key, value in rendered_env.items()},
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
}
|
||||
python_bin_dir = str(Path(sys.executable).parent)
|
||||
full_env["PATH"] = f"{python_bin_dir}:{full_env.get('PATH', '')}"
|
||||
python_path_parts = [str(rendered_cwd), str(workspace)]
|
||||
existing_pythonpath = full_env.get("PYTHONPATH")
|
||||
if existing_pythonpath:
|
||||
python_path_parts.append(existing_pythonpath)
|
||||
full_env["PYTHONPATH"] = ":".join(python_path_parts)
|
||||
|
||||
try:
|
||||
if spec.shell:
|
||||
process = await asyncio.create_subprocess_shell(
|
||||
rendered_command,
|
||||
cwd=str(rendered_cwd),
|
||||
env=full_env,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
else:
|
||||
process = await asyncio.create_subprocess_exec(
|
||||
*shlex.split(rendered_command),
|
||||
cwd=str(rendered_cwd),
|
||||
env=full_env,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
stdout_bytes, stderr_bytes = await asyncio.wait_for(
|
||||
process.communicate(),
|
||||
timeout=spec.timeout_seconds,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
process.kill()
|
||||
await process.communicate()
|
||||
return ExecutionCheckResult(
|
||||
name=spec.name,
|
||||
command=rendered_command,
|
||||
exit_code=-1,
|
||||
passed=False,
|
||||
reason=f"Timed out after {spec.timeout_seconds}s",
|
||||
)
|
||||
except Exception as exc:
|
||||
return ExecutionCheckResult(
|
||||
name=spec.name,
|
||||
command=rendered_command,
|
||||
exit_code=-1,
|
||||
passed=False,
|
||||
reason=str(exc),
|
||||
)
|
||||
|
||||
stdout = stdout_bytes.decode("utf-8", errors="replace")
|
||||
stderr = stderr_bytes.decode("utf-8", errors="replace")
|
||||
passed, reason = _evaluate_execution_result(spec, workspace, runtime_values, process.returncode, stdout, stderr)
|
||||
return ExecutionCheckResult(
|
||||
name=spec.name,
|
||||
command=rendered_command,
|
||||
exit_code=process.returncode,
|
||||
stdout=stdout,
|
||||
stderr=stderr,
|
||||
passed=passed,
|
||||
reason=reason,
|
||||
return await _run_execution_check_impl(
|
||||
spec, workspace=workspace, runtime_values=runtime_values
|
||||
)
|
||||
|
||||
|
||||
@ -189,92 +156,27 @@ def _evaluate_execution_result(
|
||||
stdout: str,
|
||||
stderr: str,
|
||||
) -> tuple[bool, str]:
|
||||
if exit_code != spec.expected_exit_code:
|
||||
return False, f"Exit code {exit_code} != expected {spec.expected_exit_code}"
|
||||
|
||||
for token in spec.stdout_contains:
|
||||
rendered = render_template(token, runtime_values)
|
||||
if rendered not in stdout:
|
||||
return False, f"stdout missing '{rendered}'"
|
||||
|
||||
for token in spec.stdout_not_contains:
|
||||
rendered = render_template(token, runtime_values)
|
||||
if rendered in stdout:
|
||||
return False, f"stdout unexpectedly contains '{rendered}'"
|
||||
|
||||
for token in spec.stderr_contains:
|
||||
rendered = render_template(token, runtime_values)
|
||||
if rendered not in stderr:
|
||||
return False, f"stderr missing '{rendered}'"
|
||||
|
||||
if spec.stdout_matches and not re.search(render_template(spec.stdout_matches, runtime_values), stdout, re.MULTILINE | re.DOTALL):
|
||||
return False, f"stdout does not match {spec.stdout_matches}"
|
||||
|
||||
if spec.stderr_matches and not re.search(render_template(spec.stderr_matches, runtime_values), stderr, re.MULTILINE | re.DOTALL):
|
||||
return False, f"stderr does not match {spec.stderr_matches}"
|
||||
|
||||
if spec.expected_stdout is not None:
|
||||
rendered = render_template(spec.expected_stdout, runtime_values).strip()
|
||||
if stdout.strip() != rendered:
|
||||
return False, "stdout did not match expected text"
|
||||
|
||||
if spec.expected_stdout_file:
|
||||
expected_path = workspace / render_template(spec.expected_stdout_file, runtime_values)
|
||||
if stdout.strip() != expected_path.read_text(encoding="utf-8").strip():
|
||||
return False, f"stdout did not match {spec.expected_stdout_file}"
|
||||
|
||||
if spec.expected_json is not None:
|
||||
try:
|
||||
parsed = json.loads(stdout)
|
||||
except json.JSONDecodeError as exc:
|
||||
return False, f"stdout was not valid JSON: {exc}"
|
||||
if parsed != render_value(spec.expected_json, runtime_values):
|
||||
return False, "stdout JSON did not match expected JSON"
|
||||
|
||||
if spec.expected_json_file:
|
||||
expected_path = workspace / render_template(spec.expected_json_file, runtime_values)
|
||||
try:
|
||||
parsed = json.loads(stdout)
|
||||
except json.JSONDecodeError as exc:
|
||||
return False, f"stdout was not valid JSON: {exc}"
|
||||
expected_json = json.loads(expected_path.read_text(encoding="utf-8"))
|
||||
if parsed != expected_json:
|
||||
return False, f"stdout JSON did not match {spec.expected_json_file}"
|
||||
|
||||
return True, "OK"
|
||||
return _evaluate_execution_result_impl(
|
||||
spec, workspace, runtime_values, exit_code, stdout, stderr
|
||||
)
|
||||
|
||||
|
||||
def _verify_file(spec: FileState, workspace: Path, runtime_values: dict[str, Any]) -> tuple[bool, str]:
|
||||
path = workspace / render_template(spec.path, runtime_values)
|
||||
exists = path.exists() and path.is_file()
|
||||
return _verify_file_state_impl(spec, workspace, runtime_values)
|
||||
|
||||
if not spec.exists:
|
||||
return (not exists, "Correctly absent" if not exists else "File should not exist")
|
||||
if not exists:
|
||||
return False, "File does not exist"
|
||||
|
||||
content = path.read_text(encoding="utf-8", errors="replace")
|
||||
if spec.min_size_bytes > 0 and path.stat().st_size < spec.min_size_bytes:
|
||||
return False, f"File too small: {path.stat().st_size} < {spec.min_size_bytes}"
|
||||
def _memory_visible_in_transcript(spec: MemoryState, transcript: Transcript) -> bool:
    """Back-compat shim: forward to the shared transcript memory-write scan."""
    visible = _memory_visible_in_transcript_impl(spec, transcript)
    return visible
|
||||
|
||||
for token in spec.content_contains:
|
||||
rendered = render_template(token, runtime_values)
|
||||
if rendered not in content:
|
||||
return False, f"Missing expected content '{rendered}'"
|
||||
|
||||
for token in spec.content_not_contains:
|
||||
rendered = render_template(token, runtime_values)
|
||||
if rendered in content:
|
||||
return False, f"Contains forbidden content '{rendered}'"
|
||||
def _resolve_path(payload: Any, path: str) -> Any:
    """Back-compat shim: forward to the shared dotted JSON-path resolver."""
    resolved = resolve_json_path(payload, path)
    return resolved
|
||||
|
||||
if spec.content_matches and not re.search(
|
||||
render_template(spec.content_matches, runtime_values),
|
||||
content,
|
||||
re.MULTILINE | re.DOTALL,
|
||||
):
|
||||
return False, f"Content does not match {spec.content_matches}"
|
||||
|
||||
return True, "OK"
|
||||
# ---------------------------------------------------------------------------
|
||||
# OpenClaw-tied verifiers. These call `GatewayClient` RPCs; they will
|
||||
# migrate into `adapters/openclaw.py` once the adapter wiring lands.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def _verify_memory(
|
||||
@ -284,6 +186,7 @@ async def _verify_memory(
|
||||
*,
|
||||
agent_id: str | None = None,
|
||||
transcript: Transcript | None = None,
|
||||
workspace: Path | None = None,
|
||||
) -> tuple[bool, str]:
|
||||
try:
|
||||
response = await client._rpc(
|
||||
@ -305,16 +208,42 @@ async def _verify_memory(
|
||||
return False, f"Memory value missing '{token}'"
|
||||
return True, "OK"
|
||||
except Exception as exc:
|
||||
logger.info("memory.search unavailable for verification, falling back to agent memory files: %s", exc)
|
||||
logger.info(
|
||||
"memory.search unavailable for verification, falling back to agent memory files: %s",
|
||||
exc,
|
||||
)
|
||||
|
||||
# Fallback path: pull the same set of memory files the agent would
|
||||
# produce (MEMORY.md, memory/notes.md, …) via the gateway, then hand
|
||||
# the resulting text to the shared filesystem-fallback resolver in
|
||||
# `environment_files`. If no gateway is available (agent_id is None
|
||||
# or the calls error) and a workspace was supplied, fall back further
|
||||
# to scanning the workspace filesystem directly.
|
||||
|
||||
extra_memory_text = ""
|
||||
if agent_id:
|
||||
try:
|
||||
extra_memory_text = await _read_agent_memory_text(client, agent_id)
|
||||
except Exception:
|
||||
extra_memory_text = ""
|
||||
|
||||
if workspace is not None:
|
||||
return verify_memory_fallback(
|
||||
spec,
|
||||
workspace,
|
||||
transcript=transcript,
|
||||
extra_memory_text=extra_memory_text,
|
||||
)
|
||||
|
||||
if not agent_id:
|
||||
return False, "memory.search unavailable and no agent id was provided for fallback verification"
|
||||
|
||||
fallback_text = await _read_agent_memory_text(client, agent_id)
|
||||
normalized = fallback_text.lower()
|
||||
# Legacy pre-workspace path: agent_id is set but we don't have a
|
||||
# workspace handle. Resolve using only the gateway-sourced text +
|
||||
# transcript scan to preserve the exact prior behavior.
|
||||
normalized = extra_memory_text.lower()
|
||||
needle = spec.key_pattern.lower()
|
||||
found = needle in normalized
|
||||
|
||||
if not spec.exists:
|
||||
return (not found, "Correctly absent" if not found else "Memory entry exists")
|
||||
if found:
|
||||
@ -322,23 +251,17 @@ async def _verify_memory(
|
||||
if token.lower() not in normalized:
|
||||
return False, f"Memory value missing '{token}'"
|
||||
return True, "OK"
|
||||
|
||||
if transcript and _memory_visible_in_transcript(spec, transcript):
|
||||
return True, "Verified from transcript fallback"
|
||||
return False, "No matching memory content found in persisted memory files or transcript fallback"
|
||||
return (
|
||||
False,
|
||||
"No matching memory content found in persisted memory files or transcript fallback",
|
||||
)
|
||||
|
||||
|
||||
async def _read_agent_memory_text(client: GatewayClient, agent_id: str) -> str:
|
||||
contents: list[str] = []
|
||||
for file_name in (
|
||||
"MEMORY.md",
|
||||
"memory.md",
|
||||
"memory/MEMORY.md",
|
||||
"memory/memory.md",
|
||||
"memory/notes.md",
|
||||
"memory/NOTES.md",
|
||||
"notes.md",
|
||||
):
|
||||
for file_name in MEMORY_FILE_CANDIDATES:
|
||||
try:
|
||||
payload = await client.get_agent_file(agent_id, file_name)
|
||||
except Exception:
|
||||
@ -350,30 +273,6 @@ async def _read_agent_memory_text(client: GatewayClient, agent_id: str) -> str:
|
||||
return "\n".join(contents)
|
||||
|
||||
|
||||
def _memory_visible_in_transcript(spec: MemoryState, transcript: Transcript) -> bool:
|
||||
needle = spec.key_pattern.lower()
|
||||
for call in transcript.tool_call_sequence:
|
||||
family = (call.family or "").lower()
|
||||
name = call.name.lower()
|
||||
path = str(call.input.get("path", "")).lower()
|
||||
if family != "memory" and "memory" not in path:
|
||||
continue
|
||||
if family == "memory" and "search" in name and "write" not in name and "store" not in name and "save" not in name:
|
||||
continue
|
||||
|
||||
serialized_bits = [call.output, call.error]
|
||||
try:
|
||||
serialized_bits.append(json.dumps(call.input, sort_keys=True))
|
||||
except TypeError:
|
||||
serialized_bits.append(str(call.input))
|
||||
haystack = " ".join(bit for bit in serialized_bits if bit).lower()
|
||||
if needle not in haystack:
|
||||
continue
|
||||
if all(token.lower() in haystack for token in spec.value_contains):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
async def _verify_session(
|
||||
spec: SessionState,
|
||||
client: GatewayClient,
|
||||
@ -404,8 +303,7 @@ async def _verify_cron(spec: CronState, client: GatewayClient) -> tuple[bool, st
|
||||
if not jobs:
|
||||
return False, "No cron jobs found"
|
||||
if spec.description_contains and not any(
|
||||
spec.description_contains.lower() in json.dumps(job).lower()
|
||||
for job in jobs
|
||||
spec.description_contains.lower() in json.dumps(job).lower() for job in jobs
|
||||
):
|
||||
return False, f"No cron job matched '{spec.description_contains}'"
|
||||
return True, "OK"
|
||||
@ -420,7 +318,7 @@ async def _verify_gateway_assertion(
|
||||
try:
|
||||
response = await client._rpc(spec.method, spec.params)
|
||||
payload = response.get("payload", {})
|
||||
value = _resolve_path(payload, spec.assert_path)
|
||||
value = resolve_json_path(payload, spec.assert_path)
|
||||
if not spec.assert_exists:
|
||||
return (value is None, "Correctly absent" if value is None else "Path exists")
|
||||
if value is None:
|
||||
@ -434,28 +332,13 @@ async def _verify_gateway_assertion(
|
||||
return False, str(exc)
|
||||
|
||||
|
||||
def _resolve_path(payload: Any, path: str) -> Any:
|
||||
if path == "$":
|
||||
return payload
|
||||
current = payload
|
||||
for part in path.lstrip("$").lstrip(".").split("."):
|
||||
if not part:
|
||||
continue
|
||||
match = re.fullmatch(r"([^\[]+)\[(\d+)\]", part)
|
||||
if match:
|
||||
key, index = match.groups()
|
||||
if not isinstance(current, dict) or key not in current:
|
||||
return None
|
||||
current = current[key]
|
||||
if not isinstance(current, list):
|
||||
return None
|
||||
idx = int(index)
|
||||
if idx >= len(current):
|
||||
return None
|
||||
current = current[idx]
|
||||
continue
|
||||
if isinstance(current, dict) and part in current:
|
||||
current = current[part]
|
||||
continue
|
||||
return None
|
||||
return current
|
||||
# Backward-compatible names for any external users that imported the
|
||||
# private delegates directly. The old symbols resolve to the new ones.
|
||||
_verify_file_state = _verify_file
|
||||
_verify_execution = _evaluate_execution_result_impl
|
||||
|
||||
|
||||
__all__ = [
|
||||
"run_execution_check",
|
||||
"verify_completion",
|
||||
]
|
||||
|
||||
403
clawbench/environment_files.py
Normal file
403
clawbench/environment_files.py
Normal file
@ -0,0 +1,403 @@
|
||||
"""Agent-agnostic workspace verification primitives.
|
||||
|
||||
This is the half of `environment.py` that does not touch the OpenClaw
|
||||
gateway: file-state checks, execution-check subprocessing, stdout/JSON
|
||||
assertions, JSON path resolution, and the filesystem/transcript-based
|
||||
memory fallback readers.
|
||||
|
||||
Adapters (OpenClaw, Hermes, future) consume these primitives directly.
|
||||
`environment.py` re-exports them for back-compat so existing callers
|
||||
keep working while the gateway-tied halves (`_verify_memory` primary
|
||||
path, `_verify_session`, `_verify_cron`, `_verify_gateway_assertion`)
|
||||
stay where they are and move to `adapters/openclaw.py` in a later step.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shlex
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from clawbench.render import render_template, render_value
|
||||
from clawbench.schemas import (
|
||||
ExecutionCheck,
|
||||
ExecutionCheckResult,
|
||||
FileState,
|
||||
MemoryState,
|
||||
Transcript,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# File-state verification
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def verify_file_state(
    spec: FileState,
    workspace: Path,
    runtime_values: dict[str, Any],
) -> tuple[bool, str]:
    """Check one `FileState` expectation against the workspace on disk.

    The path and every content token are rendered with `runtime_values`
    before use.

    Returns:
        `(passed, reason)`. `reason` is "OK" / "Correctly absent" on
        success, otherwise a short description of the first failed check.
    """

    target = workspace / render_template(spec.path, runtime_values)
    present = target.exists() and target.is_file()

    # Absence expectation: only a regular file counts as "present".
    if not spec.exists:
        if present:
            return False, "File should not exist"
        return True, "Correctly absent"
    if not present:
        return False, "File does not exist"

    text = target.read_text(encoding="utf-8", errors="replace")
    if spec.min_size_bytes > 0 and target.stat().st_size < spec.min_size_bytes:
        return False, f"File too small: {target.stat().st_size} < {spec.min_size_bytes}"

    for raw_token in spec.content_contains:
        wanted = render_template(raw_token, runtime_values)
        if wanted not in text:
            return False, f"Missing expected content '{wanted}'"

    for raw_token in spec.content_not_contains:
        forbidden = render_template(raw_token, runtime_values)
        if forbidden in text:
            return False, f"Contains forbidden content '{forbidden}'"

    if spec.content_matches:
        pattern = render_template(spec.content_matches, runtime_values)
        if re.search(pattern, text, re.MULTILINE | re.DOTALL) is None:
            return False, f"Content does not match {spec.content_matches}"

    return True, "OK"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Execution checks
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def run_execution_check(
    spec: ExecutionCheck,
    *,
    workspace: Path,
    runtime_values: dict[str, Any],
) -> ExecutionCheckResult:
    """Run a single `ExecutionCheck` subprocess and evaluate its output.

    The command, working directory and extra environment variables are
    rendered with `runtime_values`. The child inherits the current process
    environment, with the running interpreter's directory prepended to
    `PATH` and the rendered cwd plus the workspace prepended to
    `PYTHONPATH`.

    Args:
        spec: The check to run (command, cwd, env, shell flag, timeout and
            output assertions).
        workspace: Root directory that `spec.cwd` is resolved against.
        runtime_values: Values substituted into the templated fields.

    Returns:
        An `ExecutionCheckResult`. On timeout or on any error while
        launching or reading the process, `exit_code` is -1, `passed` is
        False and `reason` describes the failure.
    """

    rendered_command = render_template(spec.command, runtime_values)
    rendered_cwd = workspace / render_template(spec.cwd, runtime_values)
    rendered_env = render_value(spec.env, runtime_values)

    full_env = {
        **os.environ,
        **{key: str(value) for key, value in rendered_env.items()},
        "PYTHONUNBUFFERED": "1",
    }

    # Join search paths with os.pathsep (":" on POSIX, ";" on Windows) and
    # drop empty entries so a missing PATH does not leave a trailing
    # separator, which POSIX shells treat as the current directory.
    python_bin_dir = str(Path(sys.executable).parent)
    full_env["PATH"] = os.pathsep.join(
        part for part in (python_bin_dir, full_env.get("PATH", "")) if part
    )
    python_path_parts = [str(rendered_cwd), str(workspace)]
    existing_pythonpath = full_env.get("PYTHONPATH")
    if existing_pythonpath:
        python_path_parts.append(existing_pythonpath)
    full_env["PYTHONPATH"] = os.pathsep.join(python_path_parts)

    try:
        if spec.shell:
            process = await asyncio.create_subprocess_shell(
                rendered_command,
                cwd=str(rendered_cwd),
                env=full_env,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
        else:
            process = await asyncio.create_subprocess_exec(
                *shlex.split(rendered_command),
                cwd=str(rendered_cwd),
                env=full_env,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
        stdout_bytes, stderr_bytes = await asyncio.wait_for(
            process.communicate(),
            timeout=spec.timeout_seconds,
        )
    except asyncio.TimeoutError:
        try:
            process.kill()
        except ProcessLookupError:
            # The child exited between the timeout firing and the kill.
            pass
        # Reap the child and drain its pipes before reporting the timeout.
        await process.communicate()
        return ExecutionCheckResult(
            name=spec.name,
            command=rendered_command,
            exit_code=-1,
            passed=False,
            reason=f"Timed out after {spec.timeout_seconds}s",
        )
    except Exception as exc:
        # Launch failures (bad cwd, missing binary, unparsable command, ...)
        # are reported as a failed check rather than raised to the caller.
        return ExecutionCheckResult(
            name=spec.name,
            command=rendered_command,
            exit_code=-1,
            passed=False,
            reason=str(exc),
        )

    stdout = stdout_bytes.decode("utf-8", errors="replace")
    stderr = stderr_bytes.decode("utf-8", errors="replace")
    passed, reason = evaluate_execution_result(
        spec, workspace, runtime_values, process.returncode, stdout, stderr
    )
    return ExecutionCheckResult(
        name=spec.name,
        command=rendered_command,
        exit_code=process.returncode,
        stdout=stdout,
        stderr=stderr,
        passed=passed,
        reason=reason,
    )
|
||||
|
||||
|
||||
def evaluate_execution_result(
    spec: ExecutionCheck,
    workspace: Path,
    runtime_values: dict[str, Any],
    exit_code: int,
    stdout: str,
    stderr: str,
) -> tuple[bool, str]:
    """Apply every assertion declared on an `ExecutionCheck`.

    Assertions are evaluated in a fixed order (exit code, substring checks,
    regex checks, exact stdout, stdout file, JSON, JSON file) and the first
    failure wins. Templated fields are rendered with `runtime_values`.

    Args:
        spec: The check whose expectations are applied.
        workspace: Root directory that expected-output file paths are
            resolved against.
        runtime_values: Values substituted into templated expectations.
        exit_code: Exit status of the finished process.
        stdout: Decoded standard output of the process.
        stderr: Decoded standard error of the process.

    Returns:
        `(True, "OK")` when every assertion holds, otherwise
        `(False, reason)` for the first one that does not. An expected-output
        file that cannot be read or parsed is reported as a failed check
        instead of raising.
    """

    if exit_code != spec.expected_exit_code:
        return False, f"Exit code {exit_code} != expected {spec.expected_exit_code}"

    for token in spec.stdout_contains:
        rendered = render_template(token, runtime_values)
        if rendered not in stdout:
            return False, f"stdout missing '{rendered}'"

    for token in spec.stdout_not_contains:
        rendered = render_template(token, runtime_values)
        if rendered in stdout:
            return False, f"stdout unexpectedly contains '{rendered}'"

    for token in spec.stderr_contains:
        rendered = render_template(token, runtime_values)
        if rendered not in stderr:
            return False, f"stderr missing '{rendered}'"

    if spec.stdout_matches and not re.search(
        render_template(spec.stdout_matches, runtime_values), stdout, re.MULTILINE | re.DOTALL
    ):
        return False, f"stdout does not match {spec.stdout_matches}"

    if spec.stderr_matches and not re.search(
        render_template(spec.stderr_matches, runtime_values), stderr, re.MULTILINE | re.DOTALL
    ):
        return False, f"stderr does not match {spec.stderr_matches}"

    if spec.expected_stdout is not None:
        rendered = render_template(spec.expected_stdout, runtime_values).strip()
        if stdout.strip() != rendered:
            return False, "stdout did not match expected text"

    if spec.expected_stdout_file:
        expected_path = workspace / render_template(spec.expected_stdout_file, runtime_values)
        try:
            expected_text = expected_path.read_text(encoding="utf-8")
        except (OSError, ValueError) as exc:
            # ValueError covers UnicodeDecodeError for a non-UTF-8 file.
            return False, f"Could not read {spec.expected_stdout_file}: {exc}"
        if stdout.strip() != expected_text.strip():
            return False, f"stdout did not match {spec.expected_stdout_file}"

    if spec.expected_json is not None or spec.expected_json_file:
        # Both JSON assertions compare against the same parsed stdout.
        try:
            parsed = json.loads(stdout)
        except json.JSONDecodeError as exc:
            return False, f"stdout was not valid JSON: {exc}"

        if spec.expected_json is not None and parsed != render_value(
            spec.expected_json, runtime_values
        ):
            return False, "stdout JSON did not match expected JSON"

        if spec.expected_json_file:
            expected_path = workspace / render_template(spec.expected_json_file, runtime_values)
            try:
                expected_json = json.loads(expected_path.read_text(encoding="utf-8"))
            except (OSError, ValueError) as exc:
                # ValueError covers both JSONDecodeError and UnicodeDecodeError.
                return False, f"Could not load {spec.expected_json_file}: {exc}"
            if parsed != expected_json:
                return False, f"stdout JSON did not match {spec.expected_json_file}"

    return True, "OK"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Memory fallback: read well-known files from the workspace directly.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
# Workspace-relative file names probed for agent memory, in lookup order.
# Both upper- and lower-case spellings are listed explicitly.
MEMORY_FILE_CANDIDATES: tuple[str, ...] = (
    # Top-level memory files.
    "MEMORY.md",
    "memory.md",
    # Files under the `memory/` directory.
    "memory/MEMORY.md",
    "memory/memory.md",
    "memory/notes.md",
    "memory/NOTES.md",
    # Top-level notes file.
    "notes.md",
)
|
||||
|
||||
|
||||
def read_workspace_memory_text(workspace: Path) -> str:
    """Return the newline-joined text of every memory file in `workspace`.

    Each name in `MEMORY_FILE_CANDIDATES` is looked up relative to
    `workspace`. Only regular files with non-blank content contribute;
    a candidate that cannot be read is skipped. The result is an empty
    string when nothing was found.
    """

    chunks: list[str] = []
    for candidate in MEMORY_FILE_CANDIDATES:
        location = workspace / candidate
        try:
            if not location.is_file():
                continue
            body = location.read_text(encoding="utf-8", errors="replace")
        except Exception:
            # Best effort: an unreadable candidate must not fail the scan.
            continue
        if body.strip():
            chunks.append(body)
    return "\n".join(chunks)
|
||||
|
||||
|
||||
def memory_visible_in_transcript(spec: MemoryState, transcript: Transcript) -> bool:
    """Report whether some tool call in `transcript` shows the memory in `spec`.

    A call is considered when its family is "memory" or its `path` input
    contains "memory". Memory-family calls whose name contains "search"
    but none of "write", "store" or "save" are ignored. A considered call
    matches when `spec.key_pattern` and every `spec.value_contains` token
    occur (case-insensitively) in its output, error and serialized input.
    """

    key = spec.key_pattern.lower()
    write_markers = ("write", "store", "save")

    for call in transcript.tool_call_sequence:
        family = (call.family or "").lower()
        tool_name = call.name.lower()
        target_path = str(call.input.get("path", "")).lower()

        in_memory_family = family == "memory"
        if not in_memory_family and "memory" not in target_path:
            continue

        # A pure memory search only reads; it is not evidence of a write.
        looks_like_write = any(marker in tool_name for marker in write_markers)
        if in_memory_family and "search" in tool_name and not looks_like_write:
            continue

        try:
            input_blob = json.dumps(call.input, sort_keys=True)
        except TypeError:
            # Input that is not JSON-serializable falls back to its str() form.
            input_blob = str(call.input)

        pieces = (call.output, call.error, input_blob)
        haystack = " ".join(piece for piece in pieces if piece).lower()

        if key in haystack and all(
            token.lower() in haystack for token in spec.value_contains
        ):
            return True

    return False
|
||||
|
||||
|
||||
def verify_memory_fallback(
    spec: MemoryState,
    workspace: Path,
    *,
    transcript: Transcript | None = None,
    extra_memory_text: str = "",
) -> tuple[bool, str]:
    """Check a `MemoryState` using workspace memory files and the transcript.

    The workspace memory files and `extra_memory_text` are concatenated and
    searched case-insensitively:

    - When `spec.exists` is false, the check passes only if the key pattern
      is absent from that text.
    - When the key pattern is present, every `value_contains` token must
      also appear somewhere in the text.
    - When the key pattern is absent, a matching memory write in
      `transcript` (if one was given) still satisfies the check.

    Returns:
        `(passed, reason)`.
    """

    combined = read_workspace_memory_text(workspace) + "\n" + extra_memory_text
    haystack = combined.lower()
    key_present = spec.key_pattern.lower() in haystack

    if not spec.exists:
        if key_present:
            return False, "Memory entry exists"
        return True, "Correctly absent"

    if key_present:
        # Report the first expected value token that is missing.
        missing = next(
            (token for token in spec.value_contains if token.lower() not in haystack),
            None,
        )
        if missing is not None:
            return False, f"Memory value missing '{missing}'"
        return True, "OK"

    if transcript is not None and memory_visible_in_transcript(spec, transcript):
        return True, "Verified from transcript fallback"

    failure_reason = (
        "No matching memory content found in persisted memory files or transcript fallback"
    )
    return False, failure_reason
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# JSON-path resolver (pure function over dict/list payloads)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def resolve_json_path(payload: Any, path: str) -> Any:
    """Look up a dotted path such as `$.foo.bar[0].baz` inside `payload`.

    A leading `$` and leading dots are ignored, and empty segments are
    skipped. A plain segment must be a key of the current dict. A
    `key[N]` segment must name a list under that key with an element at
    index N. The bare path `$` returns `payload` itself.

    Returns:
        The value at `path`, or None if any segment cannot be followed.
    """

    if path == "$":
        return payload

    node = payload
    segments = path.lstrip("$").lstrip(".").split(".")
    for segment in segments:
        if not segment:
            continue

        indexed = re.fullmatch(r"([^\[]+)\[(\d+)\]", segment)
        if indexed is None:
            # Plain key: only dicts can be descended into.
            if not (isinstance(node, dict) and segment in node):
                return None
            node = node[segment]
            continue

        key, raw_index = indexed.groups()
        if not isinstance(node, dict) or key not in node:
            return None
        items = node[key]
        if not isinstance(items, list):
            return None
        position = int(raw_index)
        if position >= len(items):
            return None
        node = items[position]

    return node
|
||||
|
||||
|
||||
__all__ = [
|
||||
"MEMORY_FILE_CANDIDATES",
|
||||
"evaluate_execution_result",
|
||||
"memory_visible_in_transcript",
|
||||
"read_workspace_memory_text",
|
||||
"resolve_json_path",
|
||||
"run_execution_check",
|
||||
"verify_file_state",
|
||||
"verify_memory_fallback",
|
||||
]
|
||||
@ -8,21 +8,35 @@ import hashlib
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
import uuid
|
||||
from collections.abc import Awaitable, Callable
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
from clawbench import __version__
|
||||
from clawbench.ablation import build_ablation_profile, git_head
|
||||
from clawbench.adapters import get_adapter
|
||||
from clawbench.adapters.base import AdapterContext
|
||||
from clawbench.adapters.hermes import HermesAdapterConfig
|
||||
from clawbench.adapters.openclaw import OpenClawAdapterConfig
|
||||
from clawbench.canonical.convert import from_task_definition
|
||||
from clawbench.client import GatewayClient, GatewayConfig
|
||||
from clawbench.environment_files import run_execution_check, verify_file_state
|
||||
from clawbench.judge import judge_task_run
|
||||
from clawbench.releases import compute_task_snapshot_fingerprint, load_active_release
|
||||
from clawbench.schemas import (
|
||||
BenchmarkResult,
|
||||
CompletionResult,
|
||||
DimensionResult,
|
||||
DeliveryOutcome,
|
||||
EfficiencyResult,
|
||||
JudgeResult,
|
||||
ScenarioResult,
|
||||
TaskDefinition,
|
||||
TaskRunResult,
|
||||
@ -30,18 +44,38 @@ from clawbench.schemas import (
|
||||
TierResult,
|
||||
Transcript,
|
||||
)
|
||||
from clawbench.scorer import classify_error_failure_mode, score_task_run
|
||||
from clawbench.session_labels import unique_session_label
|
||||
from clawbench.scorer import (
|
||||
classify_delivery_outcome,
|
||||
classify_error_failure_mode,
|
||||
classify_failure_mode,
|
||||
combine_run_score,
|
||||
evaluate_behavior,
|
||||
)
|
||||
from clawbench.services import build_runtime_values, start_background_services, stop_background_services
|
||||
from clawbench.simulated_user import UserSimulator
|
||||
from clawbench.stats import bootstrap_ci, summarize_task_runs
|
||||
from clawbench.tasks import get_assets_dir, load_all_tasks
|
||||
from clawbench.trajectory import annotate_transcript_tool_calls, evaluate_trajectory
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
console = Console()
|
||||
|
||||
KNOWN_ADAPTERS = ("openclaw", "hermes", "codex", "claude-code")
|
||||
EXECUTABLE_ADAPTERS = {"openclaw"}
|
||||
EXECUTABLE_ADAPTERS = {"openclaw", "hermes"}
|
||||
|
||||
|
||||
def _command_version(command: list[str]) -> str:
    """Return the first line printed by ``command``, or ``""`` if unavailable.

    The command is run with stderr merged into stdout and a 5 second timeout.
    This is a best-effort probe used for reporting only, so any failure to
    launch or complete the command yields an empty string instead of raising.

    Args:
        command: Argument vector to execute (no shell is involved).

    Returns:
        The first line of the whitespace-stripped combined output, or ``""``
        when the command cannot be run, times out, or prints nothing but
        whitespace.
    """
    try:
        completed = subprocess.run(
            command,
            check=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            timeout=5,
        )
    except Exception:
        # Deliberately broad: a missing binary, a timeout, or a permission
        # error must never break the benchmark run that asked for a version.
        return ""
    # Whitespace-only output strips down to "" and splits into an empty list,
    # so guard the index instead of assuming at least one line exists.
    lines = (completed.stdout or "").strip().splitlines()
    return lines[0] if lines else ""
|
||||
|
||||
|
||||
class _NullCtx:
|
||||
@ -83,6 +117,7 @@ class BenchmarkHarness:
|
||||
concurrency: int = 1,
|
||||
browser_concurrency: int = 1,
|
||||
adapter: str = "openclaw",
|
||||
tool_profile_name: str | None = None,
|
||||
) -> None:
|
||||
self.gateway_config = gateway_config
|
||||
self.model = model
|
||||
@ -107,6 +142,7 @@ class BenchmarkHarness:
|
||||
self.concurrency = max(1, int(concurrency))
|
||||
self.browser_concurrency = max(1, int(browser_concurrency))
|
||||
self.adapter = adapter
|
||||
self.tool_profile_name = tool_profile_name
|
||||
self.repo_root = Path(__file__).parent.parent
|
||||
self.last_task_runs: dict[str, list[TaskRunResult]] = {}
|
||||
|
||||
@ -136,6 +172,8 @@ class BenchmarkHarness:
|
||||
if not tasks:
|
||||
raise ValueError("No tasks to run")
|
||||
|
||||
tasks = self._filter_tasks_for_adapter(tasks)
|
||||
|
||||
if self.randomize_order:
|
||||
import random
|
||||
|
||||
@ -261,66 +299,168 @@ class BenchmarkHarness:
|
||||
console.print(f" [red]! {failure}[/]")
|
||||
|
||||
async def _run_single(self, task: TaskDefinition, run_index: int) -> TaskRunResult:
|
||||
# Per-turn timeout cap: prevents a single send_and_wait from burning the entire task
|
||||
# timeout (often 300-600s). Default 180s is enough for any reasonable single-turn
|
||||
# response and fails fast on stuck models. Override with env var if needed.
|
||||
per_turn_cap = float(os.environ.get("CLAWBENCH_PER_TURN_TIMEOUT_SECONDS", "180"))
|
||||
# Per-run hard budget: total wall time a single (task, run) is allowed to consume.
|
||||
# Default 300s (5 min) bounds the worst case to 5min * 120 = 10h/model if fully
|
||||
# serial, and <3h/model at lanes=4. Env override available for longer slower models.
|
||||
per_run_budget = float(os.environ.get("CLAWBENCH_PER_RUN_BUDGET_SECONDS", "300"))
|
||||
return await self._run_single_with_agent_adapter(task, run_index)
|
||||
|
||||
# Per-run result cache: allows a failed job to resume from previously completed
|
||||
# (task, run) pairs on resubmit. Keyed by model + task + run_index so the same
|
||||
# model's runs are reused, but different models stay isolated. The cache is
|
||||
# written AFTER successful score_task_run and read at the start of this method.
|
||||
# Set CLAWBENCH_RUN_CACHE_DIR="" to disable.
|
||||
def _filter_tasks_for_adapter(self, tasks: list[TaskDefinition]) -> list[TaskDefinition]:
    """Return only the tasks that the selected adapter is able to execute.

    Every task is converted to its canonical form and passed to the adapter
    class's ``missing_capabilities_for`` together with the adapter config.
    Tasks reporting any missing capability are dropped. Unless ``self.quiet``
    is set, a summary line and up to five dropped task ids are printed.

    Raises:
        ValueError: if no task remains after filtering.
    """
    adapter_type = get_adapter(self.adapter)
    config = self._adapter_config()

    runnable: list[TaskDefinition] = []
    rejected: list[tuple[str, str]] = []
    for candidate in tasks:
        gaps = adapter_type.missing_capabilities_for(from_task_definition(candidate), config)
        if not gaps:
            runnable.append(candidate)
        else:
            gap_names = sorted(cap.value for cap in gaps)
            rejected.append((candidate.id, ", ".join(gap_names)))

    if rejected and not self.quiet:
        # Only a short preview is listed; the remainder is summarized as a count.
        preview_limit = 5
        console.print(
            f"[yellow]Adapter '{self.adapter}' skipped {len(rejected)} incompatible task(s).[/]"
        )
        for task_id, caps in rejected[:preview_limit]:
            console.print(f" [yellow]- {task_id}: missing {caps}[/]")
        hidden = len(rejected) - preview_limit
        if hidden > 0:
            console.print(f" [yellow]- ... {hidden} more[/]")

    if runnable:
        return runnable
    raise ValueError(
        f"No selected tasks are compatible with adapter '{self.adapter}'. "
        "Try a files/execution task such as t1-bugfix-discount, or use adapter 'openclaw'."
    )
|
||||
|
||||
def _adapter_config(self) -> object:
|
||||
if self.adapter == "openclaw":
|
||||
per_turn_cap = float(os.environ.get("CLAWBENCH_PER_TURN_TIMEOUT_SECONDS", "180"))
|
||||
return OpenClawAdapterConfig(
|
||||
gateway=self.gateway_config,
|
||||
prompt_variant=self.prompt_variant,
|
||||
turn_timeout_seconds=per_turn_cap,
|
||||
)
|
||||
if self.adapter == "hermes":
|
||||
provider = os.environ.get("HERMES_PROVIDER") or None
|
||||
base_url = os.environ.get("HERMES_BASE_URL") or None
|
||||
api_mode = os.environ.get("HERMES_API_MODE") or None
|
||||
api_key = (
|
||||
os.environ.get("HERMES_API_KEY")
|
||||
or os.environ.get("OPENROUTER_API_KEY")
|
||||
or os.environ.get("OPENAI_API_KEY")
|
||||
or None
|
||||
)
|
||||
if provider:
|
||||
base_url = None
|
||||
api_key = None
|
||||
elif provider is None and self.model.startswith("openai/"):
|
||||
base_url = (
|
||||
base_url
|
||||
or os.environ.get("OPENAI_BASE_URL")
|
||||
or ("https://api.openai.com/v1" if os.environ.get("OPENAI_API_KEY") else None)
|
||||
)
|
||||
host = ""
|
||||
try:
|
||||
host = urlparse(base_url or "").hostname or ""
|
||||
except Exception:
|
||||
host = ""
|
||||
if host == "api.openai.com":
|
||||
api_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("HERMES_API_KEY") or None
|
||||
if api_mode is None and self.model.split("/", 1)[1].lower().startswith("gpt-5"):
|
||||
api_mode = "codex_responses"
|
||||
elif provider is None and self.model.startswith("anthropic/"):
|
||||
provider = "anthropic"
|
||||
base_url = None
|
||||
api_key = None
|
||||
elif (
|
||||
base_url is None
|
||||
and os.environ.get("OPENAI_API_KEY")
|
||||
and not os.environ.get("HERMES_API_KEY")
|
||||
and not os.environ.get("OPENROUTER_API_KEY")
|
||||
):
|
||||
base_url = "https://api.openai.com/v1"
|
||||
enabled_toolsets = [
|
||||
item.strip()
|
||||
for item in os.environ.get("HERMES_TOOLSETS", "hermes-api-server").split(",")
|
||||
if item.strip()
|
||||
]
|
||||
disabled_toolsets = [
|
||||
item.strip()
|
||||
for item in os.environ.get("HERMES_DISABLED_TOOLSETS", "").split(",")
|
||||
if item.strip()
|
||||
] or None
|
||||
return HermesAdapterConfig(
|
||||
model=self.model,
|
||||
env_type=os.environ.get("HERMES_ENV_TYPE", "local"),
|
||||
max_iterations=int(os.environ.get("HERMES_MAX_ITERATIONS", "15")),
|
||||
timeout_seconds=int(os.environ.get("HERMES_STEP_TIMEOUT_SECONDS", "60")),
|
||||
base_url=base_url,
|
||||
api_key=api_key,
|
||||
provider=provider,
|
||||
api_mode=api_mode,
|
||||
prompt_variant=self.prompt_variant,
|
||||
driver_mode=os.environ.get("HERMES_DRIVER", "ai_agent"),
|
||||
enabled_toolsets=enabled_toolsets,
|
||||
disabled_toolsets=disabled_toolsets,
|
||||
hermes_home=os.environ.get("HERMES_HOME_BASE") or None,
|
||||
)
|
||||
raise ValueError(f"No config builder for adapter '{self.adapter}'")
|
||||
|
||||
async def _run_single_with_agent_adapter(
|
||||
self,
|
||||
task: TaskDefinition,
|
||||
run_index: int,
|
||||
) -> TaskRunResult:
|
||||
per_run_budget = float(os.environ.get("CLAWBENCH_PER_RUN_BUDGET_SECONDS", "300"))
|
||||
cache_dir_env = os.environ.get("CLAWBENCH_RUN_CACHE_DIR", "/data/run_cache")
|
||||
cache_path: Path | None = None
|
||||
if cache_dir_env:
|
||||
safe_model = self.model.replace("/", "_").replace(":", "_")
|
||||
cache_path = Path(cache_dir_env) / safe_model / task.id / f"run{run_index}.json"
|
||||
cache_path = (
|
||||
Path(cache_dir_env)
|
||||
/ f"{self.adapter}-{safe_model}"
|
||||
/ task.id
|
||||
/ f"run{run_index}.json"
|
||||
)
|
||||
if cache_path.exists():
|
||||
try:
|
||||
cached = TaskRunResult.model_validate_json(cache_path.read_text(encoding="utf-8"))
|
||||
cached.run_index = run_index
|
||||
logger.info(
|
||||
"TIMING %s/run%s total=cached score=%.2f C=%.2f T=%.2f B=%.2f J=%.2f (resumed from %s)",
|
||||
task.id, run_index,
|
||||
cached.run_score,
|
||||
cached.completion_result.score,
|
||||
cached.trajectory_result.score,
|
||||
cached.behavior_result.score,
|
||||
cached.judge_result.score if cached.judge_result.enabled else 0.0,
|
||||
cache_path,
|
||||
cached = TaskRunResult.model_validate_json(
|
||||
cache_path.read_text(encoding="utf-8")
|
||||
)
|
||||
cached.run_index = run_index
|
||||
return cached
|
||||
except Exception as exc:
|
||||
logger.warning("Cache load failed for %s/run%s: %s (will re-run)", task.id, run_index, exc)
|
||||
logger.warning(
|
||||
"Adapter cache load failed for %s/run%s: %s (will re-run)",
|
||||
task.id,
|
||||
run_index,
|
||||
exc,
|
||||
)
|
||||
|
||||
workspace = self._create_run_workspace(task, run_index)
|
||||
services = []
|
||||
session_keys: list[str] = []
|
||||
agent_id: str | None = None
|
||||
|
||||
# Per-phase timings so we can see where slow runs are spending their wall time.
|
||||
timings: dict[str, float] = {}
|
||||
|
||||
def _tick(label: str, since: float) -> float:
|
||||
now = time.monotonic()
|
||||
timings[label] = round(now - since, 2)
|
||||
return now
|
||||
|
||||
t_run_start = time.monotonic()
|
||||
try:
|
||||
t_phase = t_run_start
|
||||
self._setup_workspace(task, workspace)
|
||||
t_phase = _tick("workspace_setup", t_phase)
|
||||
transcript = Transcript()
|
||||
canonical = from_task_definition(task)
|
||||
ctx = AdapterContext(
|
||||
task=canonical,
|
||||
workspace=workspace,
|
||||
runtime_values={},
|
||||
run_index=run_index,
|
||||
model=self.model,
|
||||
transcript=transcript,
|
||||
)
|
||||
|
||||
try:
|
||||
self._setup_workspace(task, workspace)
|
||||
runtime_values = build_runtime_values(
|
||||
workspace=workspace,
|
||||
repo_root=self.repo_root,
|
||||
extra={"task_id": task.id, "model": self.model, "prompt_variant": self.prompt_variant},
|
||||
extra={
|
||||
"task_id": task.id,
|
||||
"model": self.model,
|
||||
"prompt_variant": self.prompt_variant,
|
||||
},
|
||||
)
|
||||
services, runtime_values = await start_background_services(
|
||||
task.setup.background_services,
|
||||
@ -328,118 +468,65 @@ class BenchmarkHarness:
|
||||
repo_root=self.repo_root,
|
||||
runtime_values=runtime_values,
|
||||
)
|
||||
t_phase = _tick("bg_services_start", t_phase)
|
||||
ctx.runtime_values = runtime_values
|
||||
|
||||
transcript = Transcript()
|
||||
adapter_cls = get_adapter(self.adapter)
|
||||
adapter = adapter_cls(self._adapter_config()) # type: ignore[arg-type]
|
||||
phase_errors: list[str] = []
|
||||
start_ms = _now_ms()
|
||||
async with adapter:
|
||||
try:
|
||||
await adapter.setup(ctx)
|
||||
pre_run_failures = ctx.adapter_state.get("pre_run_failures") or []
|
||||
if pre_run_failures:
|
||||
raise RuntimeError("; ".join(str(item) for item in pre_run_failures))
|
||||
|
||||
async with GatewayClient(self.gateway_config) as client:
|
||||
t_phase = _tick("gateway_connect", t_phase)
|
||||
agent_id = await self._create_run_agent(
|
||||
client,
|
||||
task=task,
|
||||
workspace=workspace,
|
||||
run_index=run_index,
|
||||
)
|
||||
t_phase = _tick("agent_create", t_phase)
|
||||
for phase_index, phase in enumerate(task.normalized_phases()):
|
||||
session_key = await client.create_session(
|
||||
model=self.model,
|
||||
agent_id=agent_id,
|
||||
label=unique_session_label(
|
||||
f"clawbench-{task.id}-run{run_index}-phase{phase_index}"
|
||||
),
|
||||
)
|
||||
session_keys.append(session_key)
|
||||
await client.subscribe(session_key)
|
||||
if task.family.value == "browser":
|
||||
await self._assert_browser_support(client, session_key)
|
||||
t_phase = _tick(f"phase{phase_index}_session_setup", t_phase)
|
||||
|
||||
simulator = UserSimulator(
|
||||
phase.user,
|
||||
runtime_values,
|
||||
prompt_variant=self.prompt_variant,
|
||||
)
|
||||
turn_index = 0
|
||||
phase_raw_timeout = float(phase.timeout_seconds or task.timeout_seconds)
|
||||
turn_timeout = min(phase_raw_timeout, per_turn_cap)
|
||||
while not simulator.is_done:
|
||||
# Enforce per-run budget: if we've already burned our whole budget
|
||||
# on previous turns of this run, bail out and score whatever we have.
|
||||
for phase in canonical.phases:
|
||||
elapsed = time.monotonic() - t_run_start
|
||||
if elapsed >= per_run_budget:
|
||||
logger.warning(
|
||||
"Run %s/%s hit per-run budget (%.0fs); stopping user simulator",
|
||||
task.id,
|
||||
run_index,
|
||||
per_run_budget,
|
||||
remaining_budget = per_run_budget - elapsed
|
||||
if remaining_budget <= 0:
|
||||
phase_errors.append(
|
||||
f"Adapter run hit per-run budget ({per_run_budget:.0f}s)"
|
||||
)
|
||||
break
|
||||
remaining_budget = per_run_budget - elapsed
|
||||
effective_timeout = min(turn_timeout, remaining_budget)
|
||||
|
||||
user_message = await simulator.next_message(transcript)
|
||||
if user_message is None:
|
||||
try:
|
||||
phase_result = await asyncio.wait_for(
|
||||
adapter.run_phase(phase, ctx),
|
||||
timeout=remaining_budget,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
phase_errors.append(
|
||||
f"Adapter run hit per-run budget ({per_run_budget:.0f}s)"
|
||||
)
|
||||
break
|
||||
if phase_result.error:
|
||||
phase_errors.append(phase_result.error)
|
||||
break
|
||||
t_turn_start = time.monotonic()
|
||||
phase_transcript = await client.send_and_wait(
|
||||
session_key,
|
||||
user_message,
|
||||
timeout=effective_timeout,
|
||||
)
|
||||
timings[f"phase{phase_index}_turn{turn_index}"] = round(
|
||||
time.monotonic() - t_turn_start, 2
|
||||
)
|
||||
transcript.messages.extend(phase_transcript.messages)
|
||||
turn_index += 1
|
||||
t_phase = _tick(f"phase{phase_index}_total", t_phase)
|
||||
|
||||
duration_ms = _now_ms() - start_ms
|
||||
last_session_key = session_keys[-1] if session_keys else ""
|
||||
t_score_start = time.monotonic()
|
||||
result = await score_task_run(
|
||||
task=task,
|
||||
transcript=transcript,
|
||||
workspace=workspace,
|
||||
client=client,
|
||||
session_key=last_session_key,
|
||||
agent_id=agent_id,
|
||||
duration_ms=duration_ms,
|
||||
runtime_values=runtime_values,
|
||||
judge_model=self.judge_model,
|
||||
)
|
||||
timings["score"] = round(time.monotonic() - t_score_start, 2)
|
||||
timings["total"] = round(time.monotonic() - t_run_start, 2)
|
||||
result.run_index = run_index
|
||||
duration_ms = _now_ms() - start_ms
|
||||
result = await self._score_adapter_task_run(
|
||||
task=task,
|
||||
canonical_task=canonical,
|
||||
ctx=ctx,
|
||||
duration_ms=duration_ms,
|
||||
adapter=adapter,
|
||||
error="; ".join(phase_errors) if phase_errors else None,
|
||||
)
|
||||
finally:
|
||||
await adapter.teardown(ctx)
|
||||
result.run_index = run_index
|
||||
|
||||
# Write per-run cache so a future resume of this job can skip this run.
|
||||
if cache_path is not None:
|
||||
try:
|
||||
cache_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
tmp_path = cache_path.with_suffix(".json.tmp")
|
||||
tmp_path.write_text(
|
||||
result.model_dump_json(indent=2), encoding="utf-8"
|
||||
)
|
||||
tmp_path.replace(cache_path)
|
||||
except Exception as exc:
|
||||
logger.warning("Cache write failed for %s/run%s: %s", task.id, run_index, exc)
|
||||
|
||||
logger.info(
|
||||
"TIMING %s/run%s total=%.1fs score=%.2f C=%.2f T=%.2f B=%.2f J=%.2f %s",
|
||||
task.id,
|
||||
run_index,
|
||||
timings["total"],
|
||||
result.run_score,
|
||||
result.completion_result.score,
|
||||
result.trajectory_result.score,
|
||||
result.behavior_result.score,
|
||||
result.judge_result.score if (result.judge_result.enabled and not result.judge_result.error) else 0.0,
|
||||
" ".join(f"{k}={v}s" for k, v in timings.items() if k != "total"),
|
||||
)
|
||||
return result
|
||||
if cache_path is not None:
|
||||
try:
|
||||
cache_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
tmp_path = cache_path.with_suffix(".json.tmp")
|
||||
tmp_path.write_text(result.model_dump_json(indent=2), encoding="utf-8")
|
||||
tmp_path.replace(cache_path)
|
||||
except Exception as exc:
|
||||
logger.warning("Adapter cache write failed for %s/run%s: %s", task.id, run_index, exc)
|
||||
return result
|
||||
except Exception as exc:
|
||||
logger.exception("Run %s/%s failed", task.id, run_index)
|
||||
logger.exception("Adapter run %s/%s failed", task.id, run_index)
|
||||
return TaskRunResult(
|
||||
task_id=task.id,
|
||||
tier=task.tier.value,
|
||||
@ -461,30 +548,171 @@ class BenchmarkHarness:
|
||||
privacy_tier=task.privacy_tier,
|
||||
contamination_risk=task.contamination_risk,
|
||||
freshness_epoch=task.freshness_epoch,
|
||||
category=task.category,
|
||||
domain=task.domain,
|
||||
functionality=list(task.functionality),
|
||||
trace_distribution=list(task.trace_distribution),
|
||||
tool_surface=list(task.tool_surface),
|
||||
risk_tags=list(task.risk_tags),
|
||||
similarity_hash=task.similarity_hash,
|
||||
official=task.official,
|
||||
run_index=run_index,
|
||||
run_score=0.0,
|
||||
transcript=Transcript(),
|
||||
duration_ms=0,
|
||||
transcript=transcript,
|
||||
duration_ms=round((time.monotonic() - t_run_start) * 1000),
|
||||
delivery_outcome=DeliveryOutcome.FAIL,
|
||||
failure_mode=classify_error_failure_mode(task, str(exc)),
|
||||
error=str(exc),
|
||||
)
|
||||
finally:
|
||||
await stop_background_services(services)
|
||||
if session_keys or agent_id:
|
||||
try:
|
||||
async with GatewayClient(self.gateway_config) as cleanup_client:
|
||||
for session_key in session_keys:
|
||||
await cleanup_client.delete_session(session_key)
|
||||
if agent_id:
|
||||
await cleanup_client.delete_agent(agent_id, delete_files=False)
|
||||
except Exception as exc:
|
||||
logger.warning("Session cleanup failed: %s", exc)
|
||||
if os.environ.get("CLAWBENCH_KEEP_WORKSPACES") != "1":
|
||||
shutil.rmtree(workspace, ignore_errors=True)
|
||||
|
||||
async def _score_adapter_task_run(
    self,
    *,
    task: TaskDefinition,
    canonical_task,
    ctx: AdapterContext,
    duration_ms: int,
    adapter,
    error: str | None,
) -> TaskRunResult:
    """Score one adapter-driven run and package it as a ``TaskRunResult``.

    The canonical task's verifier is evaluated in three passes: file-state
    specs, adapter state queries, then execution checks. Each evaluated check
    counts as one assertion; state queries the adapter reports as
    ``capability_missing`` are noted in the failure list with a ``SKIP``
    prefix but are not counted. Trajectory and behavior are evaluated from the
    transcript, and the judge is invoked only when ``self.judge_model`` is set.

    Args:
        task: Original task definition; supplies the result metadata.
        canonical_task: Canonical form of ``task``; supplies the verifier.
        ctx: Adapter context carrying transcript, workspace, and runtime values.
        duration_ms: Duration recorded on the result and efficiency figures.
        adapter: Adapter instance used to answer state queries.
        error: Error text gathered by the caller, or ``None``.

    Returns:
        A ``TaskRunResult`` with ``run_index`` set to ``0``.
    """
    annotate_transcript_tool_calls(ctx.transcript)

    verifier = canonical_task.verifier
    verdicts: list[bool] = []  # one entry per counted assertion
    failures: list[str] = []
    execution_results = []

    # Pass 1: expected file states in the workspace.
    for file_spec in verifier.file_states:
        ok, reason = verify_file_state(file_spec, ctx.workspace, ctx.runtime_values)
        verdicts.append(bool(ok))
        if not ok:
            failures.append(f"FILE {file_spec.path}: {reason}")

    # Pass 2: adapter-answered state queries; unsupported ones are not counted.
    for query in verifier.state_queries:
        state = await adapter.verify_state_query(query, ctx)
        if state.capability_missing:
            failures.append(f"SKIP {query.kind}: {state.detail}")
            continue
        verdicts.append(bool(state.ok))
        if not state.ok:
            failures.append(f"{query.kind.upper()}: {state.detail or query.description}")

    # Pass 3: execution checks run against the workspace.
    for exec_spec in verifier.execution_checks:
        outcome = await run_execution_check(
            exec_spec,
            workspace=ctx.workspace,
            runtime_values=ctx.runtime_values,
        )
        execution_results.append(outcome)
        verdicts.append(bool(outcome.passed))
        if not outcome.passed:
            failures.append(f"EXEC {exec_spec.name}: {outcome.reason}")

    total = len(verdicts)
    passed = sum(verdicts)
    completion_result = CompletionResult(
        total_assertions=total,
        passed_assertions=passed,
        failed_assertions=failures,
        execution_results=execution_results,
        # With nothing to verify, completion is treated as fully satisfied.
        score=round(passed / total if total else 1.0, 4),
    )
    trajectory_result = evaluate_trajectory(ctx.transcript, verifier.trajectory)
    behavior_result = evaluate_behavior(verifier.behavior, ctx.transcript)

    if not self.judge_model:
        judge_result = JudgeResult()
    else:
        async with GatewayClient(self.gateway_config) as judge_client:
            judge_result = await judge_task_run(
                task=task,
                transcript=ctx.transcript,
                workspace=ctx.workspace,
                client=judge_client,
                judge_model=self.judge_model,
                completion_result=completion_result,
            )

    token_usage = ctx.transcript.total_usage
    efficiency_result = EfficiencyResult.from_usage(
        duration_ms=duration_ms,
        usage=token_usage,
    )

    # The judge contributes only when it was enabled and finished without error.
    if judge_result.enabled and not judge_result.error:
        judge_score = judge_result.score
    else:
        judge_score = None
    run_score = combine_run_score(
        completion=completion_result.score,
        trajectory=trajectory_result.score,
        behavior=behavior_result.score,
        judge=judge_score,
        has_deterministic_verifier=completion_result.total_assertions > 0,
    )
    delivery_outcome = classify_delivery_outcome(
        task=task,
        completion_result=completion_result,
        run_score=run_score,
    )
    failure_mode = classify_failure_mode(
        task=task,
        transcript=ctx.transcript,
        completion_result=completion_result,
        trajectory_result=trajectory_result,
        behavior_result=behavior_result,
        error=error,
    )

    # Descriptive fields copied from the task definition onto the result.
    task_metadata = {
        "task_id": task.id,
        "tier": task.tier.value,
        "family": task.family.value,
        "scenario": task.scenario.value if task.scenario else "",
        "subscenario": task.subscenario,
        "artifact_type": task.artifact_type.value if task.artifact_type else "",
        "prompt_variant": self.prompt_variant,
        "query_difficulty": task.query_difficulty.value if task.query_difficulty else "",
        "query_weight": task.query_weight,
        "pool": task.pool.value,
        "subsets": [subset.value for subset in task.subsets],
        "capabilities": [capability.value for capability in task.capabilities],
        "variant_group": task.variant_group,
        "variant_id": task.variant_id,
        "template_id": task.template_id,
        "release_id": task.release_id,
        "source_kind": task.source_kind,
        "privacy_tier": task.privacy_tier,
        "contamination_risk": task.contamination_risk,
        "freshness_epoch": task.freshness_epoch,
        "category": task.category,
        "domain": task.domain,
        "functionality": list(task.functionality),
        "trace_distribution": list(task.trace_distribution),
        "tool_surface": list(task.tool_surface),
        "risk_tags": list(task.risk_tags),
        "similarity_hash": task.similarity_hash,
        "official": task.official,
    }
    return TaskRunResult(
        **task_metadata,
        run_index=0,
        completion_result=completion_result,
        trajectory_result=trajectory_result,
        behavior_result=behavior_result,
        judge_result=judge_result,
        run_score=round(run_score, 4),
        transcript=ctx.transcript,
        duration_ms=duration_ms,
        token_usage=token_usage,
        efficiency_result=efficiency_result,
        delivery_outcome=delivery_outcome,
        failure_mode=failure_mode,
        error=error,
    )
|
||||
|
||||
async def _create_run_agent(
|
||||
self,
|
||||
client: GatewayClient,
|
||||
@ -606,6 +834,12 @@ class BenchmarkHarness:
|
||||
privacy_tier=task.privacy_tier,
|
||||
contamination_risk=task.contamination_risk,
|
||||
freshness_epoch=task.freshness_epoch,
|
||||
category=task.category,
|
||||
domain=task.domain,
|
||||
functionality=list(task.functionality),
|
||||
trace_distribution=list(task.trace_distribution),
|
||||
tool_surface=list(task.tool_surface),
|
||||
risk_tags=list(task.risk_tags),
|
||||
similarity_hash=task.similarity_hash,
|
||||
official=task.official,
|
||||
runs=len(runs),
|
||||
@ -712,6 +946,45 @@ class BenchmarkHarness:
|
||||
)
|
||||
)
|
||||
|
||||
category_results = _dimension_results(
|
||||
task_stats,
|
||||
dimension="category",
|
||||
values_for=lambda stat: [stat.category] if stat.category else [],
|
||||
)
|
||||
domain_results = _dimension_results(
|
||||
task_stats,
|
||||
dimension="domain",
|
||||
values_for=lambda stat: [stat.domain] if stat.domain else [],
|
||||
)
|
||||
functionality_results = _dimension_results(
|
||||
task_stats,
|
||||
dimension="functionality",
|
||||
values_for=lambda stat: stat.functionality,
|
||||
)
|
||||
trace_distribution_results = _dimension_results(
|
||||
task_stats,
|
||||
dimension="trace_distribution",
|
||||
values_for=lambda stat: stat.trace_distribution,
|
||||
)
|
||||
tool_surface_results = _dimension_results(
|
||||
task_stats,
|
||||
dimension="tool_surface",
|
||||
values_for=lambda stat: stat.tool_surface,
|
||||
)
|
||||
risk_tag_results = _dimension_results(
|
||||
task_stats,
|
||||
dimension="risk_tag",
|
||||
values_for=lambda stat: stat.risk_tags,
|
||||
)
|
||||
dimension_results = {
|
||||
"category": category_results,
|
||||
"domain": domain_results,
|
||||
"functionality": functionality_results,
|
||||
"trace_distribution": trace_distribution_results,
|
||||
"tool_surface": tool_surface_results,
|
||||
"risk_tag": risk_tag_results,
|
||||
}
|
||||
|
||||
overall_ci = bootstrap_ci([stat.mean_task_score for stat in task_stats])
|
||||
total_weight = sum(stat.query_weight for stat in task_stats)
|
||||
overall_failure_mode_counts = _count_values(
|
||||
@ -727,6 +1000,7 @@ class BenchmarkHarness:
|
||||
for _ in range(count)
|
||||
)
|
||||
active_release = load_active_release()
|
||||
ablation_profile = self._ablation_profile()
|
||||
result = BenchmarkResult(
|
||||
submission_id=str(uuid.uuid4()),
|
||||
model=self.model,
|
||||
@ -743,10 +1017,17 @@ class BenchmarkHarness:
|
||||
"prompt_variant": self.prompt_variant,
|
||||
"judge_model": self.judge_model,
|
||||
"adapter": self.adapter,
|
||||
"ablation_profile": ablation_profile.model_dump(),
|
||||
"tool_profile": ablation_profile.tool_profile.model_dump(),
|
||||
"harness": ablation_profile.harness.model_dump(),
|
||||
"known_adapters": list(KNOWN_ADAPTERS),
|
||||
"executable_adapters": sorted(EXECUTABLE_ADAPTERS),
|
||||
"subsets": self.subsets,
|
||||
"capabilities": self.capabilities,
|
||||
"dimension_coverage": {
|
||||
key: len(value)
|
||||
for key, value in dimension_results.items()
|
||||
},
|
||||
"official_only": self.official_only,
|
||||
**(environment_extra or {}),
|
||||
},
|
||||
@ -803,6 +1084,13 @@ class BenchmarkHarness:
|
||||
overall_pass_hat_k=_mean([1.0 if stat.pass_hat_k else 0.0 for stat in task_stats]),
|
||||
tier_results=tier_results,
|
||||
scenario_results=scenario_results,
|
||||
category_results=category_results,
|
||||
domain_results=domain_results,
|
||||
functionality_results=functionality_results,
|
||||
trace_distribution_results=trace_distribution_results,
|
||||
tool_surface_results=tool_surface_results,
|
||||
risk_tag_results=risk_tag_results,
|
||||
dimension_results=dimension_results,
|
||||
task_results=task_stats,
|
||||
environment_checksum=self._benchmark_checksum(tasks),
|
||||
task_snapshot_fingerprint=compute_task_snapshot_fingerprint(tasks),
|
||||
@ -823,6 +1111,48 @@ class BenchmarkHarness:
|
||||
completion_passed = completion.score >= 0.9999
|
||||
return completion_passed and result.run_score >= task.pass_threshold
|
||||
|
||||
def _ablation_profile(self):
|
||||
config = self._adapter_config()
|
||||
driver = ""
|
||||
enabled_toolsets: list[str] = []
|
||||
disabled_toolsets: list[str] = []
|
||||
if isinstance(config, HermesAdapterConfig):
|
||||
driver = config.driver_mode
|
||||
enabled_toolsets = list(config.enabled_toolsets or [])
|
||||
disabled_toolsets = list(config.disabled_toolsets or [])
|
||||
elif isinstance(config, OpenClawAdapterConfig):
|
||||
driver = "gateway"
|
||||
|
||||
source = ""
|
||||
sha = ""
|
||||
version = ""
|
||||
if self.adapter == "hermes":
|
||||
repo = os.environ.get("HERMES_AGENT_REPO") or os.environ.get("HERMES_INSTALL_DIR")
|
||||
if repo:
|
||||
source = str(Path(repo).expanduser())
|
||||
sha, version = git_head(Path(source))
|
||||
elif self.adapter == "openclaw":
|
||||
candidate = Path(os.environ.get("OPENCLAW_REPO", self.repo_root.parent / "openclaw"))
|
||||
if candidate.exists():
|
||||
source = str(candidate)
|
||||
sha, version = git_head(candidate)
|
||||
if not version:
|
||||
version = _command_version(["openclaw", "--version"])
|
||||
|
||||
return build_ablation_profile(
|
||||
model=self.model,
|
||||
adapter=self.adapter,
|
||||
config=config, # type: ignore[arg-type]
|
||||
prompt_profile=self.prompt_variant,
|
||||
harness_version=version,
|
||||
harness_git_sha=sha,
|
||||
harness_source=source,
|
||||
driver=driver,
|
||||
tool_profile_name=self.tool_profile_name,
|
||||
enabled_toolsets=enabled_toolsets,
|
||||
disabled_toolsets=disabled_toolsets,
|
||||
)
|
||||
|
||||
def _print_report(self, result: BenchmarkResult) -> None:
|
||||
console.print(f"\n[bold]{'=' * 60}[/]")
|
||||
console.print(f"[bold]Results — {result.model}[/]")
|
||||
@ -909,6 +1239,47 @@ def _mean(values: list[float]) -> float:
|
||||
return sum(values) / len(values) if values else 0.0
|
||||
|
||||
|
||||
def _dimension_results(
    task_stats: list[TaskStats],
    *,
    dimension: str,
    values_for: Callable[[TaskStats], list[str]],
) -> list[DimensionResult]:
    """Aggregate per-task statistics into one result per value of a dimension.

    ``values_for`` yields the labels a task carries for the dimension. Labels
    are stripped, blank ones are ignored, and duplicates within a task count
    once. A task with several labels contributes to every matching group.

    Args:
        task_stats: Per-task statistics to aggregate.
        dimension: Name stored on every produced ``DimensionResult``.
        values_for: Callback returning the labels of a task for this dimension.

    Returns:
        One ``DimensionResult`` per distinct label, ordered by label.
    """
    buckets: dict[str, list[TaskStats]] = {}
    for stat in task_stats:
        labels = {raw.strip() for raw in values_for(stat) if raw.strip()}
        for label in sorted(labels):
            buckets.setdefault(label, []).append(stat)

    def _summarize(label: str, members: list[TaskStats]) -> DimensionResult:
        # Build the aggregate record for one label from its member tasks.
        weight_sum = sum(member.query_weight for member in members)
        if weight_sum:
            weighted = (
                sum(member.mean_task_score * member.query_weight for member in members)
                / weight_sum
            )
        else:
            # No usable weights: fall back to the unweighted mean.
            weighted = _mean([member.mean_task_score for member in members])
        return DimensionResult(
            dimension=dimension,
            value=label,
            mean_task_score=_mean([member.mean_task_score for member in members]),
            weighted_score=weighted,
            mean_completion=_mean([member.mean_completion_score for member in members]),
            mean_trajectory=_mean([member.mean_trajectory_score for member in members]),
            mean_behavior=_mean([member.mean_behavior_score for member in members]),
            # Only tasks with at least one judged run feed the judge mean.
            mean_judge=_mean(
                [member.mean_judge_score for member in members if member.judged_runs > 0]
            ),
            mean_reliability=_mean([member.reliability_score for member in members]),
            pass_hat_k_rate=_mean([1.0 if member.pass_hat_k else 0.0 for member in members]),
            task_count=len(members),
            total_weight=weight_sum,
            task_ids=[member.task_id for member in members],
        )

    return [_summarize(label, buckets[label]) for label in sorted(buckets)]
|
||||
|
||||
|
||||
def _percentile(values: list[float], percentile: float) -> float:
|
||||
if not values:
|
||||
return 0.0
|
||||
|
||||
@ -26,8 +26,14 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
HF_TOKEN = os.environ.get("HF_TOKEN", "")
|
||||
|
||||
# Local fallback when HF is unavailable
|
||||
LOCAL_QUEUE_DIR = Path("/data/queue") if Path("/data").exists() else Path("data/queue")
|
||||
# Local fallback when HF is unavailable. Containerized sweeps run several
|
||||
# independent workers against the same /data mount, so callers may isolate this.
|
||||
LOCAL_QUEUE_DIR = Path(
|
||||
os.environ.get(
|
||||
"CLAWBENCH_LOCAL_QUEUE_DIR",
|
||||
"/data/queue" if Path("/data").exists() else "data/queue",
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
class JobStatus(str, Enum):
|
||||
@ -49,6 +55,7 @@ class SubmissionRequest(BaseModel):
|
||||
max_parallel_lanes: int = Field(default=1, ge=1, le=8)
|
||||
tier: str | None = None # Filter to a specific tier
|
||||
scenario: str | None = None
|
||||
task_ids: list[str] = Field(default_factory=list)
|
||||
prompt_variant: str = "clear"
|
||||
submitter: str = "" # HF username
|
||||
notes: str = ""
|
||||
|
||||
@ -63,13 +63,21 @@ def get_hidden_release_dir(release_id: str, *, private_tasks_root: Path | None =
|
||||
|
||||
|
||||
def compute_task_snapshot_fingerprint(tasks: list[TaskDefinition]) -> str:
|
||||
payload = "|".join(
|
||||
sorted(
|
||||
f"{task.id}:{task.pool.value}:{task.variant_group}:{task.variant_id}:{task.release_id}"
|
||||
for task in tasks
|
||||
payload = [
|
||||
task.model_dump(mode="json", exclude_none=False)
|
||||
for task in sorted(
|
||||
tasks,
|
||||
key=lambda task: (
|
||||
task.id,
|
||||
task.pool.value,
|
||||
task.variant_group,
|
||||
task.variant_id,
|
||||
task.release_id,
|
||||
),
|
||||
)
|
||||
)
|
||||
return hashlib.sha256(payload.encode("utf-8")).hexdigest()
|
||||
]
|
||||
encoded = json.dumps(payload, sort_keys=True, separators=(",", ":"))
|
||||
return hashlib.sha256(encoded.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def load_active_release(path: Path | None = None) -> ActiveReleaseManifest | None:
|
||||
|
||||
@ -390,6 +390,12 @@ class TaskDefinition(BaseModel):
|
||||
privacy_tier: str = ""
|
||||
contamination_risk: str = ""
|
||||
freshness_epoch: str = ""
|
||||
category: str = ""
|
||||
domain: str = ""
|
||||
functionality: list[str] = Field(default_factory=list)
|
||||
trace_distribution: list[str] = Field(default_factory=list)
|
||||
tool_surface: list[str] = Field(default_factory=list)
|
||||
risk_tags: list[str] = Field(default_factory=list)
|
||||
first_used_at: str = ""
|
||||
retire_after_runs: int = 0
|
||||
similarity_hash: str = ""
|
||||
@ -542,6 +548,12 @@ class TaskRunResult(BaseModel):
|
||||
privacy_tier: str = ""
|
||||
contamination_risk: str = ""
|
||||
freshness_epoch: str = ""
|
||||
category: str = ""
|
||||
domain: str = ""
|
||||
functionality: list[str] = Field(default_factory=list)
|
||||
trace_distribution: list[str] = Field(default_factory=list)
|
||||
tool_surface: list[str] = Field(default_factory=list)
|
||||
risk_tags: list[str] = Field(default_factory=list)
|
||||
similarity_hash: str = ""
|
||||
official: bool = False
|
||||
run_index: int
|
||||
@ -627,6 +639,12 @@ class TaskStats(BaseModel):
|
||||
privacy_tier: str = ""
|
||||
contamination_risk: str = ""
|
||||
freshness_epoch: str = ""
|
||||
category: str = ""
|
||||
domain: str = ""
|
||||
functionality: list[str] = Field(default_factory=list)
|
||||
trace_distribution: list[str] = Field(default_factory=list)
|
||||
tool_surface: list[str] = Field(default_factory=list)
|
||||
risk_tags: list[str] = Field(default_factory=list)
|
||||
similarity_hash: str = ""
|
||||
official: bool = False
|
||||
runs: int
|
||||
@ -740,6 +758,22 @@ class ScenarioResult(BaseModel):
|
||||
task_stats: list[TaskStats] = Field(default_factory=list)
|
||||
|
||||
|
||||
class DimensionResult(BaseModel):
    """Aggregate scores for the tasks sharing one value of one metadata dimension.

    One instance describes a single ``(dimension, value)`` group. The
    aggregation code shown in this change builds it from the per-task stats
    of every task in that group.
    """

    # Name of the axis being aggregated and the group key within that axis.
    # NOTE(review): presumably axes such as category/domain/functionality,
    # matching the ``*_results`` lists on BenchmarkResult -- confirm.
    dimension: str
    value: str
    # Unweighted mean of the per-task mean_task_score values in the group.
    mean_task_score: float
    # Mean of per-task mean_task_score weighted by each task's query_weight;
    # the aggregator falls back to the unweighted mean when the weights sum to 0.
    weighted_score: float
    # Unweighted means of the matching per-task component scores.
    mean_completion: float
    mean_trajectory: float
    mean_behavior: float
    # The aggregator averages judge scores only over tasks with judged_runs > 0.
    mean_judge: float = 0.0
    mean_reliability: float
    # Share of tasks in the group whose pass_hat_k flag is true (0.0 to 1.0).
    pass_hat_k_rate: float
    # Number of tasks in the group, the sum of their query_weight values, and their ids.
    task_count: int = 0
    total_weight: float = 0.0
    task_ids: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class BenchmarkResult(BaseModel):
|
||||
submission_id: str
|
||||
model: str
|
||||
@ -788,6 +822,13 @@ class BenchmarkResult(BaseModel):
|
||||
|
||||
tier_results: list[TierResult] = Field(default_factory=list)
|
||||
scenario_results: list[ScenarioResult] = Field(default_factory=list)
|
||||
category_results: list[DimensionResult] = Field(default_factory=list)
|
||||
domain_results: list[DimensionResult] = Field(default_factory=list)
|
||||
functionality_results: list[DimensionResult] = Field(default_factory=list)
|
||||
trace_distribution_results: list[DimensionResult] = Field(default_factory=list)
|
||||
tool_surface_results: list[DimensionResult] = Field(default_factory=list)
|
||||
risk_tag_results: list[DimensionResult] = Field(default_factory=list)
|
||||
dimension_results: dict[str, list[DimensionResult]] = Field(default_factory=dict)
|
||||
task_results: list[TaskStats] = Field(default_factory=list)
|
||||
|
||||
certified: bool = False
|
||||
|
||||
@ -163,6 +163,12 @@ async def score_task_run(
|
||||
privacy_tier=task.privacy_tier,
|
||||
contamination_risk=task.contamination_risk,
|
||||
freshness_epoch=task.freshness_epoch,
|
||||
category=task.category,
|
||||
domain=task.domain,
|
||||
functionality=list(task.functionality),
|
||||
trace_distribution=list(task.trace_distribution),
|
||||
tool_surface=list(task.tool_surface),
|
||||
risk_tags=list(task.risk_tags),
|
||||
similarity_hash=task.similarity_hash,
|
||||
official=task.official,
|
||||
run_index=0,
|
||||
|
||||
@ -40,12 +40,20 @@ def build_runtime_values(
|
||||
repo_root: Path,
|
||||
extra: dict[str, Any] | None = None,
|
||||
) -> dict[str, Any]:
|
||||
openclaw_repo = os.environ.get("OPENCLAW_REPO")
|
||||
openclaw_node_path = os.environ.get("OPENCLAW_NODE_PATH")
|
||||
if not openclaw_node_path and openclaw_repo:
|
||||
openclaw_node_path = str(Path(openclaw_repo) / "node_modules")
|
||||
benchmark_node_parts = [str(repo_root / "node_modules")]
|
||||
global_node_path = os.environ.get("NODE_PATH")
|
||||
if global_node_path:
|
||||
benchmark_node_parts.append(global_node_path)
|
||||
values = {
|
||||
"workspace": str(workspace),
|
||||
"workspace_name": workspace.name,
|
||||
"repo_root": str(repo_root),
|
||||
"benchmark_node_path": str(repo_root / "node_modules"),
|
||||
"openclaw_node_path": "/openclaw/node_modules",
|
||||
"benchmark_node_path": ":".join(benchmark_node_parts),
|
||||
"openclaw_node_path": openclaw_node_path or "/openclaw/node_modules",
|
||||
"python_exe": sys.executable,
|
||||
}
|
||||
if extra:
|
||||
|
||||
@ -1,18 +1,30 @@
|
||||
"""Upload benchmark results to a Hugging Face Dataset.
|
||||
|
||||
Each submission is written as its own parquet shard. This avoids the
|
||||
read-modify-write race caused by rewriting the single `submissions`
|
||||
split file for every completed job.
|
||||
IMPORTANT — why this file calls `load_dataset` before `push_to_hub`:
|
||||
|
||||
`datasets.Dataset.push_to_hub(repo, split="submissions")` writes a single
|
||||
parquet shard to `data/submissions-00000-of-00001.parquet`, REPLACING
|
||||
whatever was there. If you push N submissions in sequence without
|
||||
reading first, only the Nth row survives — the previous N-1 are lost.
|
||||
|
||||
`upload_result()` therefore:
|
||||
1. Loads the existing `submissions` split if it exists
|
||||
2. Appends the new row
|
||||
3. Deduplicates by `submission_id` (so a retried upload of the same
|
||||
run doesn't create two rows)
|
||||
4. Pushes the combined dataset as a fresh parquet shard
|
||||
|
||||
At ClawBench's current submission rate (1-2 concurrent jobs) the read-
|
||||
then-write race window is negligible. If cross-worker concurrency ever
|
||||
becomes material we should move to an actually append-only format
|
||||
(e.g. write per-submission parquet shards under `data/submission-<id>-
|
||||
of-NNNNN.parquet` instead of overwriting a single shard).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from clawbench.hub import ensure_dataset_repo, resolve_dataset_repo
|
||||
from clawbench.schemas import BenchmarkResult
|
||||
@ -67,15 +79,15 @@ async def upload_result(
|
||||
"official_hidden_score": result.official_hidden_score,
|
||||
"clear_prompt_score": result.clear_prompt_score,
|
||||
"ambiguous_prompt_score": result.ambiguous_prompt_score,
|
||||
"overall_delivery_outcome_counts": _json_column(result.overall_delivery_outcome_counts),
|
||||
"overall_failure_mode_counts": _json_column(result.overall_failure_mode_counts),
|
||||
"overall_delivery_outcome_counts": result.overall_delivery_outcome_counts,
|
||||
"overall_failure_mode_counts": result.overall_failure_mode_counts,
|
||||
"overall_pass_hat_k": result.overall_pass_hat_k,
|
||||
"overall_ci_lower": result.overall_ci_lower,
|
||||
"overall_ci_upper": result.overall_ci_upper,
|
||||
"certified": result.certified,
|
||||
"environment_checksum": result.environment_checksum,
|
||||
"environment": _json_column(result.environment),
|
||||
"tier_scores": _json_column({
|
||||
"environment": str(result.environment),
|
||||
"tier_scores": {
|
||||
tier_result.tier: {
|
||||
"mean_task_score": tier_result.mean_task_score,
|
||||
"mean_completion": tier_result.mean_completion,
|
||||
@ -87,8 +99,8 @@ async def upload_result(
|
||||
"ci_upper": tier_result.ci_upper,
|
||||
}
|
||||
for tier_result in result.tier_results
|
||||
}),
|
||||
"scenario_scores": _json_column({
|
||||
},
|
||||
"scenario_scores": {
|
||||
scenario_result.scenario: {
|
||||
"mean_task_score": scenario_result.mean_task_score,
|
||||
"weighted_score": scenario_result.weighted_score,
|
||||
@ -101,8 +113,27 @@ async def upload_result(
|
||||
"total_weight": scenario_result.total_weight,
|
||||
}
|
||||
for scenario_result in result.scenario_results
|
||||
}),
|
||||
"task_results": _json_column([
|
||||
},
|
||||
"dimension_scores": {
|
||||
dimension: {
|
||||
item.value: {
|
||||
"mean_task_score": item.mean_task_score,
|
||||
"weighted_score": item.weighted_score,
|
||||
"mean_completion": item.mean_completion,
|
||||
"mean_trajectory": item.mean_trajectory,
|
||||
"mean_behavior": item.mean_behavior,
|
||||
"mean_judge": item.mean_judge,
|
||||
"mean_reliability": item.mean_reliability,
|
||||
"pass_hat_k_rate": item.pass_hat_k_rate,
|
||||
"task_count": item.task_count,
|
||||
"total_weight": item.total_weight,
|
||||
"task_ids": item.task_ids,
|
||||
}
|
||||
for item in dimension_results
|
||||
}
|
||||
for dimension, dimension_results in result.dimension_results.items()
|
||||
},
|
||||
"task_results": [
|
||||
{
|
||||
"task_id": task.task_id,
|
||||
"tier": task.tier,
|
||||
@ -116,6 +147,12 @@ async def upload_result(
|
||||
"pool": task.pool,
|
||||
"subsets": task.subsets,
|
||||
"capabilities": task.capabilities,
|
||||
"category": task.category,
|
||||
"domain": task.domain,
|
||||
"functionality": task.functionality,
|
||||
"trace_distribution": task.trace_distribution,
|
||||
"tool_surface": task.tool_surface,
|
||||
"risk_tags": task.risk_tags,
|
||||
"mean_task_score": task.mean_task_score,
|
||||
"mean_run_score": task.mean_run_score,
|
||||
"mean_completion_score": task.mean_completion_score,
|
||||
@ -143,36 +180,50 @@ async def upload_result(
|
||||
"runs": task.runs,
|
||||
}
|
||||
for task in result.task_results
|
||||
]),
|
||||
],
|
||||
}
|
||||
|
||||
api = HfApi(token=hf_token)
|
||||
ensure_dataset_repo(api, resolved_repo)
|
||||
|
||||
ds = Dataset.from_list([row])
|
||||
shard_name = _submission_shard_name(result.submission_id)
|
||||
with tempfile.TemporaryDirectory(prefix="clawbench-upload-") as tmp_dir:
|
||||
local_path = Path(tmp_dir) / shard_name
|
||||
ds.to_parquet(str(local_path))
|
||||
api.upload_file(
|
||||
path_or_fileobj=str(local_path),
|
||||
path_in_repo=f"data/submissions/{shard_name}",
|
||||
repo_id=resolved_repo,
|
||||
repo_type="dataset",
|
||||
# Read-then-append: load the existing submissions split, add the
|
||||
# new row, deduplicate by submission_id, push the combined dataset
|
||||
# so we never clobber prior rows.
|
||||
combined_rows: list[dict] = []
|
||||
try:
|
||||
from datasets import load_dataset
|
||||
|
||||
existing = load_dataset(
|
||||
resolved_repo,
|
||||
split="submissions",
|
||||
token=hf_token,
|
||||
)
|
||||
combined_rows = [dict(r) for r in existing]
|
||||
logger.info(
|
||||
"Read %d existing submission row(s) from %s",
|
||||
len(combined_rows),
|
||||
resolved_repo,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.info(
|
||||
"No existing submissions split to append to (%s); starting fresh",
|
||||
exc,
|
||||
)
|
||||
|
||||
new_submission_id = row.get("submission_id")
|
||||
if new_submission_id:
|
||||
combined_rows = [
|
||||
r for r in combined_rows
|
||||
if r.get("submission_id") != new_submission_id
|
||||
]
|
||||
combined_rows.append(row)
|
||||
|
||||
ds = Dataset.from_list(combined_rows)
|
||||
ds.push_to_hub(resolved_repo, split="submissions", token=hf_token)
|
||||
url = f"https://huggingface.co/datasets/{resolved_repo}"
|
||||
logger.info(
|
||||
"Result uploaded to %s as append-only shard %s",
|
||||
"Results uploaded to %s (%d total submission rows)",
|
||||
url,
|
||||
shard_name,
|
||||
len(combined_rows),
|
||||
)
|
||||
return url
|
||||
|
||||
|
||||
def _submission_shard_name(submission_id: str) -> str:
|
||||
safe_id = re.sub(r"[^A-Za-z0-9_.-]+", "-", submission_id.strip()).strip(".-")
|
||||
return f"{safe_id or 'submission'}.parquet"
|
||||
|
||||
|
||||
def _json_column(value: object) -> str:
|
||||
return json.dumps(value, default=str, sort_keys=True, separators=(",", ":"))
|
||||
|
||||
@ -34,6 +34,7 @@ STALE_EVALUATION_SECONDS = max(
|
||||
JOB_HEARTBEAT_INTERVAL_SECONDS * 4,
|
||||
int(os.environ.get("CLAWBENCH_STALE_EVALUATION_SECONDS", "1800")),
|
||||
)
|
||||
OPENCLAW_EVAL_EXEC_HOSTS = {"auto", "gateway", "sandbox", "node"}
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -46,6 +47,12 @@ class ParallelLane:
|
||||
state_dir: Path | None = None
|
||||
log_path: Path | None = None
|
||||
|
||||
@property
|
||||
def home_dir(self) -> Path | None:
|
||||
if self.state_dir is None:
|
||||
return None
|
||||
return self.state_dir.parent / "home"
|
||||
|
||||
@property
def ws_url(self) -> str:
    """Return the localhost WebSocket URL for this lane's port."""
    port = self.port
    return f"ws://localhost:{port}"
|
||||
@ -300,6 +307,7 @@ class EvalWorker:
|
||||
prompt_variant=job.request.prompt_variant,
|
||||
prepare_run=prepare_run,
|
||||
progress_callback=progress_callback,
|
||||
tool_profile_name=os.environ.get("CLAWBENCH_TOOL_PROFILE_NAME", "") or None,
|
||||
)
|
||||
return await harness.run()
|
||||
|
||||
@ -369,6 +377,7 @@ class EvalWorker:
|
||||
tier=job.request.tier,
|
||||
scenario=job.request.scenario,
|
||||
prompt_variant=job.request.prompt_variant,
|
||||
tool_profile_name=os.environ.get("CLAWBENCH_TOOL_PROFILE_NAME", "") or None,
|
||||
)
|
||||
return summary_harness.compose_result_from_task_stats(
|
||||
ordered_stats,
|
||||
@ -382,7 +391,8 @@ class EvalWorker:
|
||||
)
|
||||
finally:
|
||||
self._stop_parallel_gateways()
|
||||
shutil.rmtree(job_root, ignore_errors=True)
|
||||
if os.environ.get("CLAWBENCH_KEEP_PARALLEL_LANE_ROOT", "").strip() != "1":
|
||||
shutil.rmtree(job_root, ignore_errors=True)
|
||||
|
||||
async def _run_parallel_lane(self, job, lane: ParallelLane, progress: JobProgressTracker):
|
||||
gateway_cmd = self._find_gateway_cmd()
|
||||
@ -430,6 +440,7 @@ class EvalWorker:
|
||||
progress_callback=progress_callback,
|
||||
print_report=False,
|
||||
quiet=True,
|
||||
tool_profile_name=os.environ.get("CLAWBENCH_TOOL_PROFILE_NAME", "") or None,
|
||||
)
|
||||
result = await harness.run()
|
||||
await self._sync_job_progress(job.job_id, progress.clear_lane(lane.index))
|
||||
@ -444,6 +455,9 @@ class EvalWorker:
|
||||
return load_all_tasks(
|
||||
tier=job.request.tier,
|
||||
scenario=job.request.scenario,
|
||||
task_ids=list(getattr(job.request, "task_ids", []) or None)
|
||||
if getattr(job.request, "task_ids", None)
|
||||
else None,
|
||||
prompt_variant=job.request.prompt_variant,
|
||||
)
|
||||
|
||||
@ -503,10 +517,36 @@ class EvalWorker:
|
||||
def _materialize_lane_runtime(self, lane: ParallelLane, job_root: Path) -> None:
    """Assign the lane its on-disk layout and port, then seed its state dir.

    Everything lives under ``job_root / "lane-<index>"``: ``state`` for the
    state directory, ``gateway.log`` for the gateway log, and (via
    ``lane.home_dir``) a home directory whose ``.config`` folder is created
    here. The port is ``GATEWAY_PORT`` offset by ``GATEWAY_PORT_SPACING``
    per lane index.
    """
    root = job_root / f"lane-{lane.index}"

    # state_dir must be set first: home_dir is derived from it.
    lane.state_dir = root / "state"
    home = lane.home_dir
    if home is not None:
        home.joinpath(".config").mkdir(parents=True, exist_ok=True)

    lane.log_path = root / "gateway.log"
    lane.port = GATEWAY_PORT + (lane.index * GATEWAY_PORT_SPACING)

    self._seed_lane_state_dir(lane.state_dir)
|
||||
|
||||
def _run_lane_prepare_hook(self, lane: ParallelLane) -> None:
    """Run the optional ``CLAWBENCH_LANE_PREPARE_CMD`` hook for one lane.

    Nothing happens when the variable is unset or blank. Otherwise the
    command runs with the worker's environment overlaid by lane-specific
    values: ``HOME``/``OPENCLAW_HOME`` point at the lane home,
    ``OPENCLAW_STATE_DIR``/``OPENCLAW_CONFIG_PATH`` at the lane state dir,
    ``XDG_CONFIG_HOME`` at ``<home>/.config``, and
    ``CLAWBENCH_LANE_INDEX``/``CLAWBENCH_LANE_PORT`` identify the lane.

    Args:
        lane: Lane whose ``state_dir`` (and therefore ``home_dir``) is set.

    Raises:
        RuntimeError: If the lane has no state dir or home dir yet.
        subprocess.CalledProcessError: If the hook exits non-zero.
    """
    import shlex

    hook = os.environ.get("CLAWBENCH_LANE_PREPARE_CMD", "").strip()
    if not hook:
        return
    if lane.state_dir is None:
        raise RuntimeError(f"Lane {lane.index + 1} state dir missing before prepare hook")
    lane_home = lane.home_dir
    if lane_home is None:
        raise RuntimeError(f"Lane {lane.index + 1} home dir missing before prepare hook")
    (lane_home / ".config").mkdir(parents=True, exist_ok=True)
    hook_env = {
        **os.environ,
        "HOME": str(lane_home),
        "OPENCLAW_HOME": str(lane_home),
        "OPENCLAW_STATE_DIR": str(lane.state_dir),
        "OPENCLAW_CONFIG_PATH": str(lane.state_dir / "openclaw.json"),
        "XDG_CONFIG_HOME": str(lane_home / ".config"),
        "CLAWBENCH_LANE_INDEX": str(lane.index),
        "CLAWBENCH_LANE_PORT": str(lane.port),
    }
    # A value that names an existing file is executed as-is, so paths that
    # contain spaces keep working. Anything else is tokenized shell-style so
    # the hook may carry arguments ("prepare.sh --fast") without a shell.
    argv = [hook] if Path(hook).exists() else shlex.split(hook)
    logger.info("Running lane %d prepare hook", lane.index + 1)
    subprocess.run(argv, env=hook_env, check=True)
|
||||
|
||||
def _seed_lane_state_dir(self, target_state_dir: Path) -> None:
|
||||
source_state_dir = Path(os.environ.get("OPENCLAW_STATE_DIR", os.path.expanduser("~/.openclaw")))
|
||||
shutil.rmtree(target_state_dir, ignore_errors=True)
|
||||
@ -625,6 +665,10 @@ class EvalWorker:
|
||||
_set_nested(data, "browser.headless", True)
|
||||
_set_nested(data, "browser.noSandbox", True)
|
||||
_set_nested(data, "agents.defaults.skipBootstrap", True)
|
||||
_set_nested(data, "tools.exec.host", self._openclaw_eval_exec_host())
|
||||
_set_nested(data, "tools.exec.security", "full")
|
||||
_set_nested(data, "tools.exec.ask", "off")
|
||||
_set_nested(data, "approvals.exec.enabled", False)
|
||||
if self._active_model:
|
||||
_set_nested(data, "agents.defaults.model.primary", self._active_model)
|
||||
_set_nested(data, "agents.defaults.subagents.model.primary", self._active_model)
|
||||
@ -632,6 +676,7 @@ class EvalWorker:
|
||||
tmp_path = cfg_path.with_suffix(".json.tmp")
|
||||
tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
|
||||
tmp_path.replace(cfg_path)
|
||||
self._write_eval_exec_approvals(lane_state_dir)
|
||||
|
||||
def _order_task_stats(self, tasks: list[TaskDefinition], combined_stats: list) -> list:
|
||||
stats_by_id = {}
|
||||
@ -724,6 +769,7 @@ class EvalWorker:
|
||||
"token",
|
||||
"--token",
|
||||
gateway_token,
|
||||
"--compact",
|
||||
],
|
||||
stdout=open("/tmp/gateway.log", "a", encoding="utf-8"),
|
||||
stderr=subprocess.STDOUT,
|
||||
@ -760,6 +806,12 @@ class EvalWorker:
|
||||
f"Gateway /health did not respond within {health_deadline_sec}s. Log:\n{self._read_gateway_log()}"
|
||||
)
|
||||
|
||||
await self._wait_for_gateway_ready_marker(
|
||||
process=self._gateway_process,
|
||||
log_reader=lambda: self._read_gateway_log(limit=20_000),
|
||||
description="Gateway",
|
||||
)
|
||||
|
||||
# Phase B: control-plane probe with retries (see the parallel
|
||||
# variant in _ensure_parallel_gateway for the detailed rationale).
|
||||
gateway_config = GatewayConfig(url=GATEWAY_WS_URL, token=GATEWAY_TOKEN)
|
||||
@ -809,21 +861,30 @@ class EvalWorker:
|
||||
# Re-inject the host config's env + plugins before every restart.
|
||||
if lane.state_dir is not None:
|
||||
self._reinject_host_env_to_lane(lane.state_dir)
|
||||
self._run_lane_prepare_hook(lane)
|
||||
if lane.state_dir is None or lane.log_path is None:
|
||||
raise RuntimeError(f"Lane {lane.index + 1} runtime was not materialized before gateway startup")
|
||||
lane_home = lane.home_dir
|
||||
if lane_home is None:
|
||||
raise RuntimeError(f"Lane {lane.index + 1} home was not materialized before gateway startup")
|
||||
(lane_home / ".config").mkdir(parents=True, exist_ok=True)
|
||||
|
||||
logger.info("Starting lane %d gateway on port %d", lane.index + 1, lane.port)
|
||||
gateway_token = os.environ.get("OPENCLAW_GATEWAY_TOKEN", "clawbench-internal-token")
|
||||
gateway_env = {
|
||||
**os.environ,
|
||||
"OPENCLAW_HOME": os.environ.get("OPENCLAW_HOME", os.path.expanduser("~")),
|
||||
"HOME": str(lane_home),
|
||||
"OPENCLAW_HOME": str(lane_home),
|
||||
"OPENCLAW_STATE_DIR": str(lane.state_dir),
|
||||
"OPENCLAW_CONFIG_PATH": str(lane.state_dir / "openclaw.json"),
|
||||
"XDG_CONFIG_HOME": str(lane_home / ".config"),
|
||||
"OPENCLAW_SKIP_GMAIL_WATCHER": "1",
|
||||
"OPENCLAW_SKIP_CANVAS_HOST": "1",
|
||||
"OPENCLAW_NO_RESPAWN": "1",
|
||||
}
|
||||
self._configure_browser_runtime(gateway_cmd, gateway_env)
|
||||
lane.log_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
lane.log_path.write_text("", encoding="utf-8")
|
||||
log_handle = lane.log_path.open("a", encoding="utf-8")
|
||||
try:
|
||||
process = subprocess.Popen(
|
||||
@ -841,6 +902,7 @@ class EvalWorker:
|
||||
"token",
|
||||
"--token",
|
||||
gateway_token,
|
||||
"--compact",
|
||||
],
|
||||
stdout=log_handle,
|
||||
stderr=subprocess.STDOUT,
|
||||
@ -883,6 +945,12 @@ class EvalWorker:
|
||||
f"Log:\n{self._read_parallel_gateway_log(lane)}"
|
||||
)
|
||||
|
||||
await self._wait_for_gateway_ready_marker(
|
||||
process=process,
|
||||
log_reader=lambda: self._read_parallel_gateway_log(lane, limit=20_000),
|
||||
description=f"Lane {lane.index + 1} gateway",
|
||||
)
|
||||
|
||||
# Phase B: control-plane probe with explicit retries. A healthy
|
||||
# /health response does not guarantee sessions.create works
|
||||
# immediately — plugin registration races can leave the gateway
|
||||
@ -994,6 +1062,10 @@ class EvalWorker:
|
||||
("agents.defaults.skipBootstrap", True),
|
||||
("browser.headless", True),
|
||||
("browser.noSandbox", True),
|
||||
("tools.exec.host", self._openclaw_eval_exec_host()),
|
||||
("tools.exec.security", "full"),
|
||||
("tools.exec.ask", "off"),
|
||||
("approvals.exec.enabled", False),
|
||||
]
|
||||
if self._active_model:
|
||||
config_pairs.extend(
|
||||
@ -1004,9 +1076,50 @@ class EvalWorker:
|
||||
)
|
||||
try:
|
||||
self._patch_openclaw_config(config_pairs)
|
||||
state_dir = Path(
|
||||
gateway_env.get("OPENCLAW_STATE_DIR")
|
||||
or os.environ.get("OPENCLAW_STATE_DIR")
|
||||
or os.path.expanduser("~/.openclaw")
|
||||
)
|
||||
self._write_eval_exec_approvals(state_dir)
|
||||
except Exception as exc:
|
||||
logger.warning("Direct openclaw.json patch failed: %s", exc)
|
||||
|
||||
@staticmethod
def _openclaw_eval_exec_host() -> str:
    """Return the exec host to configure, taken from ``OPENCLAW_EXEC_HOST``.

    The value is trimmed and lower-cased and defaults to ``gateway`` when the
    variable is unset. A value outside ``OPENCLAW_EVAL_EXEC_HOSTS`` is logged
    as a warning and replaced by ``gateway``.
    """
    requested = os.environ.get("OPENCLAW_EXEC_HOST", "gateway").strip().lower()
    if requested not in OPENCLAW_EVAL_EXEC_HOSTS:
        logger.warning("Invalid OPENCLAW_EXEC_HOST=%r; using gateway", requested)
        return "gateway"
    return requested
|
||||
|
||||
@staticmethod
|
||||
def _write_eval_exec_approvals(state_dir: Path) -> None:
|
||||
state_dir.mkdir(parents=True, exist_ok=True)
|
||||
approvals_path = state_dir / "exec-approvals.json"
|
||||
approvals = {
|
||||
"version": 1,
|
||||
"socket": {
|
||||
"path": str(approvals_path.with_suffix(".sock")),
|
||||
"token": "clawbench-eval-token",
|
||||
},
|
||||
"defaults": {
|
||||
"security": "full",
|
||||
"ask": "off",
|
||||
"askFallback": "full",
|
||||
},
|
||||
"agents": {
|
||||
"*": {
|
||||
"security": "full",
|
||||
"ask": "off",
|
||||
"askFallback": "full",
|
||||
}
|
||||
},
|
||||
}
|
||||
tmp_path = approvals_path.with_suffix(".json.tmp")
|
||||
tmp_path.write_text(json.dumps(approvals, indent=2), encoding="utf-8")
|
||||
tmp_path.replace(approvals_path)
|
||||
|
||||
@staticmethod
|
||||
def _patch_openclaw_config(pairs: list[tuple[str, object]]) -> None:
|
||||
state_dir = Path(os.environ.get("OPENCLAW_STATE_DIR") or os.path.expanduser("~/.openclaw"))
|
||||
@ -1051,13 +1164,15 @@ class EvalWorker:
|
||||
# Use a generous dedicated config for the probe. A healthy gateway
|
||||
# usually responds to sessions.create in under a second, but plugin
|
||||
# initialization (especially OpenRouter model list fetch) can add
|
||||
# 10-30s after /health reports 200. The 60s outer bound ensures we
|
||||
# don't give up during a cold-start scenario.
|
||||
# 10-30s after /health reports 200. On cold Docker lanes OpenClaw may
|
||||
# also install provider runtime SDKs during the first sessions.create,
|
||||
# so keep this bound configurable and separate from steady-state RPCs.
|
||||
probe_timeout = float(os.environ.get("CLAWBENCH_GATEWAY_PROBE_TIMEOUT_SECONDS", "180"))
|
||||
probe_config = GatewayConfig(
|
||||
url=gateway_config.url,
|
||||
token=gateway_config.token,
|
||||
connect_timeout=gateway_config.connect_timeout,
|
||||
request_timeout=30.0,
|
||||
request_timeout=probe_timeout,
|
||||
)
|
||||
|
||||
async def _probe() -> None:
|
||||
@ -1068,25 +1183,67 @@ class EvalWorker:
|
||||
await client.delete_session(session_key)
|
||||
|
||||
try:
|
||||
await asyncio.wait_for(_probe(), timeout=60.0)
|
||||
await asyncio.wait_for(_probe(), timeout=probe_timeout + 10.0)
|
||||
except asyncio.TimeoutError as exc:
|
||||
raise RuntimeError(
|
||||
"Gateway control-plane probe timed out after 60s "
|
||||
f"Gateway control-plane probe timed out after {probe_timeout:.0f}s "
|
||||
"(sessions.create hung on a freshly-started gateway); "
|
||||
"lane will be retried by the queue."
|
||||
) from exc
|
||||
|
||||
def _read_gateway_log(self) -> str:
|
||||
async def _wait_for_gateway_ready_marker(self, process: subprocess.Popen, log_reader, description: str) -> None:
|
||||
# OpenClaw 2026.4.26 can answer /health before channels and sidecars
|
||||
# finish startup. Probing sessions.create during that window can hold the
|
||||
# session write lock for minutes. Some lane gateway modes do not emit
|
||||
# the final ready marker, so wait for it briefly after sidecar startup
|
||||
# and then let the bounded control-plane probe decide.
|
||||
ready_deadline_sec = int(os.environ.get("CLAWBENCH_GATEWAY_READY_TIMEOUT_SECONDS", "420"))
|
||||
marker_grace_sec = int(os.environ.get("CLAWBENCH_GATEWAY_READY_MARKER_GRACE_SECONDS", "90"))
|
||||
saw_sidecar_start = False
|
||||
sidecar_start_elapsed: int | None = None
|
||||
for elapsed in range(ready_deadline_sec):
|
||||
if process.poll() is not None:
|
||||
raise RuntimeError(
|
||||
f"{description} exited with code {process.returncode}. Log:\n{log_reader()[-4_000:]}"
|
||||
)
|
||||
|
||||
log_text = log_reader()
|
||||
if "[gateway] ready" in log_text:
|
||||
logger.info("%s ready after %ss", description, elapsed)
|
||||
return
|
||||
if "[gateway] starting channels and sidecars" in log_text:
|
||||
saw_sidecar_start = True
|
||||
if sidecar_start_elapsed is None:
|
||||
sidecar_start_elapsed = elapsed
|
||||
if sidecar_start_elapsed is not None and elapsed - sidecar_start_elapsed >= marker_grace_sec:
|
||||
logger.info(
|
||||
"%s did not emit ready marker %ss after sidecar startup; probing control plane",
|
||||
description,
|
||||
marker_grace_sec,
|
||||
)
|
||||
return
|
||||
if not saw_sidecar_start and elapsed >= 15:
|
||||
return
|
||||
await asyncio.sleep(1)
|
||||
|
||||
logger.warning(
|
||||
"%s did not log ready within %ss; probing control plane anyway. Log:\n%s",
|
||||
description,
|
||||
ready_deadline_sec,
|
||||
log_reader()[-4_000:],
|
||||
)
|
||||
|
||||
def _read_gateway_log(self, limit: int = 4_000) -> str:
|
||||
try:
|
||||
return Path("/tmp/gateway.log").read_text(encoding="utf-8", errors="replace")[-4_000:]
|
||||
return Path("/tmp/gateway.log").read_text(encoding="utf-8", errors="replace")[-limit:]
|
||||
except Exception:
|
||||
return "(no gateway log)"
|
||||
|
||||
def _read_parallel_gateway_log(self, lane: ParallelLane) -> str:
|
||||
def _read_parallel_gateway_log(self, lane: ParallelLane, limit: int = 4_000) -> str:
|
||||
if lane.log_path is None:
|
||||
return "(no gateway log)"
|
||||
try:
|
||||
return lane.log_path.read_text(encoding="utf-8", errors="replace")[-4_000:]
|
||||
return lane.log_path.read_text(encoding="utf-8", errors="replace")[-limit:]
|
||||
except Exception:
|
||||
return "(no gateway log)"
|
||||
|
||||
|
||||
168
docs/DOMAIN_PROOF_PLAN.md
Normal file
168
docs/DOMAIN_PROOF_PLAN.md
Normal file
@ -0,0 +1,168 @@
|
||||
# ClawBench Domain Proof Plan
|
||||
|
||||
This plan turns ClawBench from a strong benchmark into an evidence package for
|
||||
the central thesis:
|
||||
|
||||
> Model + general harness + plugins can cover the task domains served by most
|
||||
> agent SaaS products.
|
||||
|
||||
## What Exists Now
|
||||
|
||||
- `tasks-public/`: small public Core v1 task set for reproducibility,
|
||||
examples, and regression tracking.
|
||||
- `tasks-domain/`: domain coverage scaffold for the larger proof corpus.
|
||||
- Deterministic scoring: file, execution, memory, session, cron, gateway, DOM,
|
||||
and structured output assertions.
|
||||
- Process scoring: read-before-write, self-verification, recovery, safety,
|
||||
tool-family fit.
|
||||
- Reliability scoring: repeated runs, pass^k, worst-of-n, variance score,
|
||||
bootstrap confidence intervals.
|
||||
- Dynamics analysis: regime classification, survival, constraint index,
|
||||
variance decomposition, SNR-weighted ranking.
|
||||
- Configuration diagnostics: plugin profile fingerprints, utilization audit,
|
||||
manifest-vs-reality gap, surprise detection, recommendations.
|
||||
- Adapter groundwork: canonical task schema plus OpenClaw and Hermes adapter
|
||||
modules. OpenClaw is the executable harness path today.
|
||||
|
||||
## Ablation Design
|
||||
|
||||
Each domain task should run under four configuration classes.
|
||||
|
||||
| Class | Description | Question Answered |
|
||||
|---|---|---|
|
||||
| `model_only` | Model with minimal shell/filesystem access | What can the raw model do with little scaffolding? |
|
||||
| `model_plus_harness` | Model plus the general OpenClaw-style harness | What does the harness contribute by itself? |
|
||||
| `core_plugins` | Harness plus browser, memory, filesystem, execution plugins | What do common plugins add across domains? |
|
||||
| `domain_plugins` | Harness plus domain-specific state/API plugins | Does the plugin stack close the gap to specialized SaaS agents? |
|
||||
|
||||
Run policy:
|
||||
|
||||
- 3 runs per task per configuration class
|
||||
- same model snapshots across all classes
|
||||
- same OpenClaw/harness build across all classes
|
||||
- same private task variants across all classes
|
||||
- fixed time, token, tool, and approval budgets
|
||||
|
||||
## Primary Metrics
|
||||
|
||||
- hard success: deterministic completion only
|
||||
- reliability: pass^k, pass rate, worst-of-n, variance score
|
||||
- process quality: trace-derived behavior quality
|
||||
- cost efficiency: tokens/pass, cost/pass, p50/p95 latency
|
||||
- failure profile: 13 deterministic failure modes
|
||||
- plugin lift: `domain_plugins - model_plus_harness`
|
||||
- harness lift: `model_plus_harness - model_only`
|
||||
- plugin utilization: loaded vs invoked, tool-family coverage
|
||||
- manifest-reality gap: claimed plugin capabilities vs observed use
|
||||
|
||||
## Proof Criteria
|
||||
|
||||
A domain is considered covered when:
|
||||
|
||||
- `domain_plugins` reaches at least 0.85 hard success on private variants
|
||||
- pass^k is at least 0.75 across 3 runs
|
||||
- worst-of-n is at least 0.65
|
||||
- no dominant failure mode accounts for more than 35 percent of failures
|
||||
- plugin utilization shows the relevant domain plugin was invoked on tasks
|
||||
where it was required
|
||||
|
||||
The broader thesis is credible when:
|
||||
|
||||
- at least 10 of 12 domains meet the domain coverage bar
|
||||
- plugin lift is larger than model-to-model variance on the same task set
|
||||
- holdout variants preserve the same conclusions
|
||||
- SNR analysis shows the ranking is signal-dominant, not seed-noise-dominant
|
||||
- cross-harness adapters reproduce scores within an agreed tolerance
|
||||
|
||||
## Workstream 1: Adapter Execution
|
||||
|
||||
Goal: make OpenClaw, Hermes, Codex, and Claude Code comparable through one
|
||||
canonical task pipeline.
|
||||
|
||||
Near-term:
|
||||
|
||||
- keep `--adapter openclaw` as the executable path
|
||||
- route OpenClaw through the adapter implementation instead of inline gateway
|
||||
code
|
||||
- add compatibility reporting for every task and adapter
|
||||
- implement Codex and Claude Code transcript adapters
|
||||
- promote Hermes from first-turn runner to full compatible runner where possible
|
||||
|
||||
Help wanted:
|
||||
|
||||
- harness owners: SDK or CLI entry points that expose full transcripts
|
||||
- plugin owners: tool-call provenance and registration traces
|
||||
- serving owners: stable model IDs, usage accounting, and reproducible configs
|
||||
|
||||
## Workstream 2: Plugin Provenance
|
||||
|
||||
Goal: attribute score changes to plugins instead of treating the agent as a
|
||||
black box.
|
||||
|
||||
Near-term:
|
||||
|
||||
- capture plugin registration traces at gateway startup
|
||||
- attach plugin owner IDs to every tool call
|
||||
- store transcripts and plugin traces alongside result JSON
|
||||
- include utilization and manifest-reality gaps in every `--profile` run
|
||||
|
||||
Help wanted:
|
||||
|
||||
- OpenClaw plugin registry hooks for runtime trace export
|
||||
- partner plugins with typed manifests and clean provenance
|
||||
- ClawHub metadata sync for manifest cache refresh
|
||||
|
||||
## Workstream 3: Domain Corpus
|
||||
|
||||
Goal: replace a small public task suite with a coverage matrix for real agent
|
||||
SaaS domains.
|
||||
|
||||
Near-term:
|
||||
|
||||
- 12 domains in `tasks-domain/MANIFEST.yaml`
|
||||
- 5 templates per domain
|
||||
- 3 private variants per template
|
||||
- domain-specific plugin requirement declarations
|
||||
- deterministic verifier contracts before any semantic judge
|
||||
|
||||
Help wanted:
|
||||
|
||||
- partner traces that can be transformed into private variants
|
||||
- domain experts to validate task realism and verifier quality
|
||||
- infra for private variant generation and contamination audits
|
||||
|
||||
## Workstream 4: Serving and Cost Rigor
|
||||
|
||||
Goal: compare open and closed models under reproducible serving constraints.
|
||||
|
||||
Near-term:
|
||||
|
||||
- record model snapshot, provider, serving stack, quantization, GPU class,
|
||||
context length, temperature, reasoning settings, and token accounting
|
||||
- report cost/pass and latency/pass alongside capability
|
||||
- run open-weight models through vLLM-backed profiles where available
|
||||
|
||||
Help wanted:
|
||||
|
||||
- vLLM serving recipes for consistent agent-eval runs
|
||||
- Hugging Face model hosting and dataset plumbing
|
||||
- NVIDIA profiling on representative GPU setups
|
||||
|
||||
## Workstream 5: Evidence Package
|
||||
|
||||
Goal: make the conclusion auditable by third parties.
|
||||
|
||||
Near-term:
|
||||
|
||||
- publish public Core v1 results as the reproducibility baseline
|
||||
- publish domain coverage matrix without private task bodies
|
||||
- publish aggregated per-domain scores, confidence intervals, and failure modes
|
||||
- keep private variants for contamination-resistant official scoring
|
||||
- publish scripts that regenerate every report from cached run JSON
|
||||
|
||||
Help wanted:
|
||||
|
||||
- compute credits for multi-model sweeps
|
||||
- review from model serving, benchmark, and infrastructure teams
|
||||
- public hosting for result artifacts and visual dashboards
|
||||
|
||||
108
docs/MEETING_BRIEF_NVIDIA_HF_VLLM_2026-04-24.md
Normal file
108
docs/MEETING_BRIEF_NVIDIA_HF_VLLM_2026-04-24.md
Normal file
@ -0,0 +1,108 @@
|
||||
# Meeting Brief: NVIDIA, Hugging Face, vLLM
|
||||
|
||||
Meeting date: April 24, 2026
|
||||
|
||||
## One-Liner
|
||||
|
||||
ClawBench is a rigorous agent benchmark for measuring whether a model plus a
|
||||
general harness plus plugins can cover the task domains served by most agent
|
||||
SaaS products.
|
||||
|
||||
## What I Built
|
||||
|
||||
- A deterministic, trace-based benchmark for agents, not just models.
|
||||
- A small public Core v1 set for reproducibility and regression tracking.
|
||||
- A larger domain-suite scaffold for CRM, support, docs/sheets/slides, email,
|
||||
calendar, finance ops, analytics, security admin, ecommerce, devtools,
|
||||
research, and personal ops.
|
||||
- A scoring system that separates completion, process quality, behavior,
|
||||
semantic quality, reliability, latency, tokens, cost, and failure modes.
|
||||
- A dynamics-analysis stack that explains how agents fail: trapped, diffusive,
|
||||
convergent, chaotic, limit-cycle, and survival curves.
|
||||
- A plugin-profile diagnostic layer that fingerprints configurations, estimates
|
||||
plugin contribution, detects dead-weight plugins, and recommends changes.
|
||||
- An adapter boundary so OpenClaw can become one harness among several rather
|
||||
than the only execution path.
|
||||
|
||||
## Goal
|
||||
|
||||
Prove, with reproducible data, that specialized agent SaaS can be decomposed
|
||||
into:
|
||||
|
||||
1. a base model,
|
||||
2. a general agent harness,
|
||||
3. a plugin stack,
|
||||
4. domain-specific state/API access,
|
||||
5. deterministic evaluation contracts.
|
||||
|
||||
If the data supports it, the conclusion is that the open plugin ecosystem can
|
||||
subsume a large share of agent SaaS workflows.
|
||||
|
||||
## What The 19 Public Tasks Are
|
||||
|
||||
The 19 public tasks are not the whole proof. They are the public Core v1 set:
|
||||
|
||||
- reproducibility baseline
|
||||
- CI/regression suite
|
||||
- adapter bring-up set
|
||||
- public explanation of methodology
|
||||
|
||||
The proof corpus is the domain suite. That needs more tasks, private variants,
|
||||
and ablations.
|
||||
|
||||
## What Still Needs Help
|
||||
|
||||
- Cross-harness execution: OpenClaw is executable today; Hermes/Codex/Claude
|
||||
Code need end-to-end adapter wiring.
|
||||
- Plugin provenance: tool calls need stable plugin owner IDs and registration
|
||||
traces.
|
||||
- Domain corpus: each domain needs realistic private variants and hardened
|
||||
deterministic verifiers.
|
||||
- Serving reproducibility: open-weight models need pinned serving recipes,
|
||||
GPU profiles, usage accounting, and latency/cost measurement.
|
||||
- Scale: the domain ablations need a lot more runs than the public Core set.
|
||||
|
||||
## What I Want From NVIDIA
|
||||
|
||||
- GPU-backed evaluation capacity for repeated domain sweeps.
|
||||
- Profiling help: latency/pass, tokens/sec, cost/pass, memory pressure, and
|
||||
concurrency behavior for long agent trajectories.
|
||||
- Reference serving profiles for open-weight models on NVIDIA hardware.
|
||||
- Advice on making the benchmark useful for enterprise agent deployment, not
|
||||
just academic ranking.
|
||||
|
||||
## What I Want From Hugging Face
|
||||
|
||||
- Dataset hosting for public results, cached run JSON, and public task metadata.
|
||||
- Private/controlled dataset workflow for holdout variants and partner traces.
|
||||
- Model hosting paths for open-weight baseline runs.
|
||||
- Help making ClawBench results easy to browse, reproduce, and cite.
|
||||
|
||||
## What I Want From vLLM
|
||||
|
||||
- A stable serving recipe for agent-eval workloads with long context and many
|
||||
tool turns.
|
||||
- Usage accounting: prompt, output, reasoning/cache tokens where available.
|
||||
- Throughput and latency guidance for many parallel agent runs.
|
||||
- Integration advice for making model snapshots and serving configs auditable.
|
||||
|
||||
## Proposed Collaboration
|
||||
|
||||
1. Run Core v1 as a public sanity check across agreed open and closed models.
|
||||
2. Build a 12-domain private proof suite from `tasks-domain/`.
|
||||
3. Run four ablation classes: model only, model plus harness, core plugins,
|
||||
domain plugins.
|
||||
4. Publish aggregated domain coverage, reliability, failure modes, and cost.
|
||||
5. Iterate on gaps where specialized SaaS still beats the open stack.
|
||||
|
||||
## The Ask
|
||||
|
||||
Help make the proof hard to dismiss:
|
||||
|
||||
- enough compute to run repetitions,
|
||||
- clean serving recipes,
|
||||
- model and dataset hosting,
|
||||
- infrastructure review,
|
||||
- partner traces or realistic domain workflows,
|
||||
- public artifacts that other teams can reproduce.
|
||||
|
||||
181
patches/patch_openclaw_426_agent_create_queue.mjs
Normal file
181
patches/patch_openclaw_426_agent_create_queue.mjs
Normal file
@ -0,0 +1,181 @@
|
||||
import { readFileSync, writeFileSync } from "node:fs";
|
||||
|
||||
const dist = "/app/dist/server-methods-b3jaTRE_.js";
|
||||
|
||||
/**
 * Replace the first occurrence of `oldValue` in `text` with `newValue`.
 *
 * @param {string} text - Source text to patch.
 * @param {string} oldValue - Exact substring that must be present.
 * @param {string} newValue - Replacement text, inserted verbatim.
 * @returns {string} The patched text.
 * @throws {Error} If `oldValue` does not occur in `text`; the message carries
 *   the first 80 characters of the missing target.
 */
function replaceOnce(text, oldValue, newValue) {
  if (!text.includes(oldValue)) {
    throw new Error(`patch target not found: ${oldValue.slice(0, 80)}`);
  }
  // Use a replacer function so `newValue` is inserted literally. A plain
  // string replacement would expand patterns such as "$&", "$$", "$`" and
  // "$'", silently corrupting payloads that contain a dollar sign.
  return text.replace(oldValue, () => newValue);
}
|
||||
|
||||
let source = readFileSync(dist, "utf8");
|
||||
|
||||
source = replaceOnce(
|
||||
source,
|
||||
"const agentsHandlers = {\n",
|
||||
`let agentConfigMutationQueue = Promise.resolve();
|
||||
async function runAgentConfigMutation(fn) {
|
||||
\tconst previous = agentConfigMutationQueue;
|
||||
\tlet release;
|
||||
\tagentConfigMutationQueue = new Promise((resolve) => {
|
||||
\t\trelease = resolve;
|
||||
\t});
|
||||
\tawait previous.catch(() => {});
|
||||
\ttry {
|
||||
\t\treturn await fn();
|
||||
\t} finally {
|
||||
\t\trelease();
|
||||
\t}
|
||||
}
|
||||
const agentsHandlers = {
|
||||
`,
|
||||
);
|
||||
|
||||
source = replaceOnce(
|
||||
source,
|
||||
`\t\tconst cfg = context.getRuntimeConfig();
|
||||
\t\tconst rawName = params.name.trim();`,
|
||||
`\t\tconst rawName = params.name.trim();`,
|
||||
);
|
||||
|
||||
source = replaceOnce(
|
||||
source,
|
||||
`\t\tif (findAgentEntryIndex(listAgentEntries(cfg), agentId) >= 0) {
|
||||
\t\t\trespond(false, void 0, errorShape(ErrorCodes.INVALID_REQUEST, \`agent "\${agentId}" already exists\`));
|
||||
\t\t\treturn;
|
||||
\t\t}
|
||||
\t\tconst workspaceDir = resolveUserPath(params.workspace.trim());`,
|
||||
`\t\tconst workspaceDir = resolveUserPath(params.workspace.trim());`,
|
||||
);
|
||||
|
||||
source = replaceOnce(
|
||||
source,
|
||||
`\t\tlet nextConfig = applyAgentConfig(cfg, {
|
||||
\t\t\tagentId,
|
||||
\t\t\tname: safeName,
|
||||
\t\t\tworkspace: workspaceDir,
|
||||
\t\t\tmodel,
|
||||
\t\t\tidentity: {
|
||||
\t\t\t\tname: safeName,
|
||||
\t\t\t\t...emoji ? { emoji: sanitizeIdentityLine(emoji) } : {},
|
||||
\t\t\t\t...avatar ? { avatar: sanitizeIdentityLine(avatar) } : {}
|
||||
\t\t\t}
|
||||
\t\t});
|
||||
\t\tconst agentDir = resolveAgentDir(nextConfig, agentId);
|
||||
\t\tnextConfig = applyAgentConfig(nextConfig, {
|
||||
\t\t\tagentId,
|
||||
\t\t\tagentDir
|
||||
\t\t});
|
||||
\t\tawait ensureAgentWorkspace({
|
||||
\t\t\tdir: workspaceDir,
|
||||
\t\t\tensureBootstrapFiles: !Boolean(nextConfig.agents?.defaults?.skipBootstrap)
|
||||
\t\t});
|
||||
\t\tawait fs$1.mkdir(resolveSessionTranscriptsDirForAgent(agentId), { recursive: true });
|
||||
\t\tconst persistedIdentity = normalizeIdentityForFile(resolveAgentIdentity(nextConfig, agentId));
|
||||
\t\tif (persistedIdentity) {
|
||||
\t\t\tconst identityContent = await buildIdentityMarkdownOrRespondUnsafe({
|
||||
\t\t\t\trespond,
|
||||
\t\t\t\tworkspaceDir,
|
||||
\t\t\t\tidentity: persistedIdentity
|
||||
\t\t\t});
|
||||
\t\t\tif (identityContent === null) return;
|
||||
\t\t\tif (!await writeWorkspaceFileOrRespond({
|
||||
\t\t\t\trespond,
|
||||
\t\t\t\tworkspaceDir,
|
||||
\t\t\t\tname: "IDENTITY.md",
|
||||
\t\t\t\tcontent: identityContent
|
||||
\t\t\t})) return;
|
||||
\t\t}
|
||||
\t\tawait replaceConfigFile({
|
||||
\t\t\tnextConfig,
|
||||
\t\t\tafterWrite: { mode: "auto" }
|
||||
\t\t});
|
||||
\t\trespond(true, {
|
||||
\t\t\tok: true,
|
||||
\t\t\tagentId,
|
||||
\t\t\tname: safeName,
|
||||
\t\t\tworkspace: workspaceDir,
|
||||
\t\t\tmodel
|
||||
\t\t}, void 0);`,
|
||||
`\t\tconst result = await runAgentConfigMutation(async () => {
|
||||
\t\t\tconst cfg = context.getRuntimeConfig();
|
||||
\t\t\tif (findAgentEntryIndex(listAgentEntries(cfg), agentId) >= 0) {
|
||||
\t\t\t\trespond(false, void 0, errorShape(ErrorCodes.INVALID_REQUEST, \`agent "\${agentId}" already exists\`));
|
||||
\t\t\t\treturn null;
|
||||
\t\t\t}
|
||||
\t\t\tlet nextConfig = applyAgentConfig(cfg, {
|
||||
\t\t\t\tagentId,
|
||||
\t\t\t\tname: safeName,
|
||||
\t\t\t\tworkspace: workspaceDir,
|
||||
\t\t\t\tmodel,
|
||||
\t\t\t\tidentity: {
|
||||
\t\t\t\t\tname: safeName,
|
||||
\t\t\t\t\t...emoji ? { emoji: sanitizeIdentityLine(emoji) } : {},
|
||||
\t\t\t\t\t...avatar ? { avatar: sanitizeIdentityLine(avatar) } : {}
|
||||
\t\t\t\t}
|
||||
\t\t\t});
|
||||
\t\t\tconst agentDir = resolveAgentDir(nextConfig, agentId);
|
||||
\t\t\tnextConfig = applyAgentConfig(nextConfig, {
|
||||
\t\t\t\tagentId,
|
||||
\t\t\t\tagentDir
|
||||
\t\t\t});
|
||||
\t\t\tawait ensureAgentWorkspace({
|
||||
\t\t\t\tdir: workspaceDir,
|
||||
\t\t\t\tensureBootstrapFiles: !Boolean(nextConfig.agents?.defaults?.skipBootstrap)
|
||||
\t\t\t});
|
||||
\t\t\tawait fs$1.mkdir(resolveSessionTranscriptsDirForAgent(agentId), { recursive: true });
|
||||
\t\t\tconst persistedIdentity = normalizeIdentityForFile(resolveAgentIdentity(nextConfig, agentId));
|
||||
\t\t\tif (persistedIdentity) {
|
||||
\t\t\t\tconst identityContent = await buildIdentityMarkdownOrRespondUnsafe({
|
||||
\t\t\t\t\trespond,
|
||||
\t\t\t\t\tworkspaceDir,
|
||||
\t\t\t\t\tidentity: persistedIdentity
|
||||
\t\t\t\t});
|
||||
\t\t\t\tif (identityContent === null) return null;
|
||||
\t\t\t\tif (!await writeWorkspaceFileOrRespond({
|
||||
\t\t\t\t\trespond,
|
||||
\t\t\t\t\tworkspaceDir,
|
||||
\t\t\t\t\tname: "IDENTITY.md",
|
||||
\t\t\t\t\tcontent: identityContent
|
||||
\t\t\t\t})) return null;
|
||||
\t\t\t}
|
||||
\t\t\tawait replaceConfigFile({
|
||||
\t\t\t\tnextConfig,
|
||||
\t\t\t\tafterWrite: { mode: "auto" }
|
||||
\t\t\t});
|
||||
\t\t\treturn true;
|
||||
\t\t});
|
||||
\t\tif (!result) return;
|
||||
\t\trespond(true, {
|
||||
\t\t\tok: true,
|
||||
\t\t\tagentId,
|
||||
\t\t\tname: safeName,
|
||||
\t\t\tworkspace: workspaceDir,
|
||||
\t\t\tmodel
|
||||
\t\t}, void 0);`,
|
||||
);
|
||||
|
||||
// Rewrite the `afterWrite` option of three `replaceConfigFile` call sites from
// `{ mode: "auto" }` to `{ mode: "none", reason: "clawbench-agent-lifecycle" }`.
// NOTE(review): the first call site is the three-tab-indented one produced by
// the queued agent-create rewrite earlier in this script; the other two are
// matched as they appear in the bundle — confirm against the target build.
const AFTER_WRITE_AUTO = `{ mode: "auto" }`;
const AFTER_WRITE_NONE = `{ mode: "none", reason: "clawbench-agent-lifecycle" }`;

const configWriteCallSites = [
  "\t\t\tawait replaceConfigFile({\n\t\t\t\tnextConfig,\n\t\t\t\tafterWrite: { mode: \"auto\" }\n\t\t\t});",
  "\t\tawait replaceConfigFile({\n\t\t\tnextConfig,\n\t\t\tafterWrite: { mode: \"auto\" }\n\t\t});",
  "\t\tawait replaceConfigFile({\n\t\t\tnextConfig: result.config,\n\t\t\tafterWrite: { mode: \"auto\" }\n\t\t});",
];

configWriteCallSites.forEach((callSite) => {
  const rewritten = callSite.replace(AFTER_WRITE_AUTO, AFTER_WRITE_NONE);
  source = replaceOnce(source, callSite, rewritten);
});

// Persist the patched bundle in place and report which file was touched.
writeFileSync(dist, source);
console.log(`patched ${dist}`);
|
||||
212
patches/patch_opus47.py
Normal file
212
patches/patch_opus47.py
Normal file
@ -0,0 +1,212 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Patch pi-ai and openclaw bundles to recognize claude-opus-4-7 (and sonnet-4-7).
|
||||
|
||||
Runs inside the Docker image as a RUN step. Idempotent: re-running is a no-op.
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
|
||||
PI_AI_CATALOG = "/app/node_modules/@mariozechner/pi-ai/dist/models.generated.js"
|
||||
ANTHROPIC_REGISTER_GLOB = "/app/dist/register.runtime-*.js"
|
||||
|
||||
|
||||
def patch_pi_ai_catalog(path: str) -> bool:
    """Insert claude-opus-4-7 and claude-sonnet-4-7 entries into a model catalog.

    The new entries are spliced in directly after the existing
    ``"claude-opus-4-6"`` entry. The call is idempotent: when the file already
    mentions ``"claude-opus-4-7"`` nothing is written.

    Args:
        path: Path of the generated JavaScript catalog file to patch in place.

    Returns:
        ``True`` if the file was rewritten, ``False`` if it was already patched.

    Raises:
        SystemExit: With status 1 when the ``"claude-opus-4-6"`` anchor is
            missing or its braces never balance.
    """
    with open(path, encoding="utf-8") as fh:
        src = fh.read()
    if '"claude-opus-4-7"' in src:
        print(f"[patch] {path}: claude-opus-4-7 already present, skipping")
        return False

    # Find the claude-opus-4-6 entry and splice in opus-4-7 + sonnet-4-7 right after.
    # Use substring scanning rather than regex because each entry contains a nested
    # `cost: { ... }` object (which breaks naive `[^{}]` patterns).
    start_marker = '"claude-opus-4-6": {'
    start_idx = src.find(start_marker)
    if start_idx == -1:
        print(f"[patch] ERROR: could not locate claude-opus-4-6 anchor in {path}", file=sys.stderr)
        sys.exit(1)

    # Walk forward from the marker counting brace nesting until the entry's
    # opening `{` is balanced; `anchor_end` then points just past its `}`.
    depth = 0
    anchor_end = -1
    for idx in range(start_idx, len(src)):
        ch = src[idx]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                anchor_end = idx + 1
                break
    if anchor_end == -1:
        print(f"[patch] ERROR: unbalanced braces walking claude-opus-4-6 entry in {path}", file=sys.stderr)
        sys.exit(1)

    # Normally the entry is followed by a comma, which is kept ahead of the
    # insertion. When it is missing (for example the entry is the last one in
    # its object) a separator must be supplied, otherwise the patched file
    # would contain two adjacent properties with no comma between them.
    if src.startswith(",", anchor_end):
        anchor_end += 1
        separator = ""
    else:
        separator = ","

    insertion = (
        "\n"
        ' "claude-opus-4-7": {\n'
        ' id: "claude-opus-4-7",\n'
        ' name: "Claude Opus 4.7",\n'
        ' api: "anthropic-messages",\n'
        ' provider: "anthropic",\n'
        ' baseUrl: "https://api.anthropic.com",\n'
        " reasoning: true,\n"
        ' input: ["text", "image"],\n'
        " cost: {\n"
        " input: 5,\n"
        " output: 25,\n"
        " cacheRead: 0.5,\n"
        " cacheWrite: 6.25,\n"
        " },\n"
        " contextWindow: 1000000,\n"
        " maxTokens: 128000,\n"
        " },\n"
        ' "claude-sonnet-4-7": {\n'
        ' id: "claude-sonnet-4-7",\n'
        ' name: "Claude Sonnet 4.7",\n'
        ' api: "anthropic-messages",\n'
        ' provider: "anthropic",\n'
        ' baseUrl: "https://api.anthropic.com",\n'
        " reasoning: true,\n"
        ' input: ["text", "image"],\n'
        " cost: {\n"
        " input: 3,\n"
        " output: 15,\n"
        " cacheRead: 0.3,\n"
        " cacheWrite: 3.75,\n"
        " },\n"
        " contextWindow: 1000000,\n"
        " maxTokens: 128000,\n"
        " },"
    )

    patched = src[:anchor_end] + separator + insertion + src[anchor_end:]
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(patched)
    print(f"[patch] {path}: inserted claude-opus-4-7 and claude-sonnet-4-7")
    return True
|
||||
|
||||
|
||||
def _replace_required_anchor(src: str, anchor: str, replacement: str, label: str, path: str) -> str:
    """Replace *anchor* with *replacement* in *src*, exiting with status 1 if it is absent."""
    if anchor not in src:
        print(f"[patch] ERROR: {label} anchor not found in {path}", file=sys.stderr)
        sys.exit(1)
    return src.replace(anchor, replacement)


def patch_openclaw_anthropic_register(path: str) -> bool:
    """Add claude-opus-4-7 / claude-sonnet-4-7 support to an anthropic register bundle.

    The file is only rewritten once every required anchor has been found, so a
    failed run never leaves a half-patched bundle behind. The call is
    idempotent: a file that already contains ``ANTHROPIC_OPUS_47_MODEL_ID`` is
    left alone.

    Args:
        path: Path of a ``register.runtime-*.js`` bundle to patch in place.

    Returns:
        ``True`` if the file was rewritten; ``False`` if it was already patched
        or is not the anthropic bundle.

    Raises:
        SystemExit: With status 1 when a required anchor is missing from a
            file that was identified as the anthropic bundle.
    """
    with open(path, encoding="utf-8") as fh:
        src = fh.read()
    if "ANTHROPIC_OPUS_47_MODEL_ID" in src:
        print(f"[patch] {path}: 4-7 support already present, skipping")
        return False

    # Skip files that are not the anthropic register.runtime (other plugins
    # share the same `register.runtime-*.js` naming convention).
    if 'PROVIDER_ID = "anthropic"' not in src or "ANTHROPIC_MODERN_MODEL_PREFIXES" not in src:
        print(f"[patch] {path}: not the anthropic register.runtime bundle, skipping")
        return False

    # 1. Inject new constants after the sonnet template constant.
    sonnet_tpl_anchor = 'const ANTHROPIC_SONNET_TEMPLATE_MODEL_IDS = ["claude-sonnet-4-5", "claude-sonnet-4.5"];'
    new_consts = (
        sonnet_tpl_anchor + "\n"
        'const ANTHROPIC_OPUS_47_MODEL_ID = "claude-opus-4-7";\n'
        'const ANTHROPIC_OPUS_47_DOT_MODEL_ID = "claude-opus-4.7";\n'
        'const ANTHROPIC_SONNET_47_MODEL_ID = "claude-sonnet-4-7";\n'
        'const ANTHROPIC_SONNET_47_DOT_MODEL_ID = "claude-sonnet-4.7";'
    )
    src = _replace_required_anchor(src, sonnet_tpl_anchor, new_consts, "sonnet template", path)

    # 2. Extend ANTHROPIC_MODERN_MODEL_PREFIXES.
    prefixes_anchor = 'const ANTHROPIC_MODERN_MODEL_PREFIXES = [\n\t"claude-opus-4-6",\n\t"claude-sonnet-4-6",'
    prefixes_new = 'const ANTHROPIC_MODERN_MODEL_PREFIXES = [\n\t"claude-opus-4-7",\n\t"claude-sonnet-4-7",\n\t"claude-opus-4-6",\n\t"claude-sonnet-4-6",'
    src = _replace_required_anchor(src, prefixes_anchor, prefixes_new, "modern prefixes", path)

    # 3. Add 4-7 forward-compat branches ahead of the 4-6 opus/sonnet branches.
    resolve_anchor = (
        "function resolveAnthropicForwardCompatModel(ctx) {\n"
        "\treturn resolveAnthropic46ForwardCompatModel({\n"
        "\t\tctx,\n"
        "\t\tdashModelId: ANTHROPIC_OPUS_46_MODEL_ID,"
    )
    resolve_new = (
        "function resolveAnthropicForwardCompatModel(ctx) {\n"
        "\treturn resolveAnthropic46ForwardCompatModel({\n"
        "\t\tctx,\n"
        '\t\tdashModelId: ANTHROPIC_OPUS_47_MODEL_ID,\n'
        '\t\tdotModelId: ANTHROPIC_OPUS_47_DOT_MODEL_ID,\n'
        '\t\tdashTemplateId: "claude-opus-4-6",\n'
        '\t\tdotTemplateId: "claude-opus-4.6",\n'
        "\t\tfallbackTemplateIds: ANTHROPIC_OPUS_TEMPLATE_MODEL_IDS\n"
        "\t}) ?? resolveAnthropic46ForwardCompatModel({\n"
        "\t\tctx,\n"
        '\t\tdashModelId: ANTHROPIC_SONNET_47_MODEL_ID,\n'
        '\t\tdotModelId: ANTHROPIC_SONNET_47_DOT_MODEL_ID,\n'
        '\t\tdashTemplateId: "claude-sonnet-4-6",\n'
        '\t\tdotTemplateId: "claude-sonnet-4.6",\n'
        "\t\tfallbackTemplateIds: ANTHROPIC_SONNET_TEMPLATE_MODEL_IDS\n"
        "\t}) ?? resolveAnthropic46ForwardCompatModel({\n"
        "\t\tctx,\n"
        "\t\tdashModelId: ANTHROPIC_OPUS_46_MODEL_ID,"
    )
    src = _replace_required_anchor(src, resolve_anchor, resolve_new, "forward-compat resolver", path)

    # 4. Make adaptive-thinking default cover 4-7 too.
    adaptive_anchor = (
        "function shouldUseAnthropicAdaptiveThinkingDefault(modelId) {\n"
        "\tconst lowerModelId = normalizeLowercaseStringOrEmpty(modelId);\n"
        "\treturn lowerModelId.startsWith(ANTHROPIC_OPUS_46_MODEL_ID) || lowerModelId.startsWith(ANTHROPIC_OPUS_46_DOT_MODEL_ID) || lowerModelId.startsWith(ANTHROPIC_SONNET_46_MODEL_ID) || lowerModelId.startsWith(ANTHROPIC_SONNET_46_DOT_MODEL_ID);\n"
        "}"
    )
    adaptive_new = (
        "function shouldUseAnthropicAdaptiveThinkingDefault(modelId) {\n"
        "\tconst lowerModelId = normalizeLowercaseStringOrEmpty(modelId);\n"
        "\treturn lowerModelId.startsWith(ANTHROPIC_OPUS_47_MODEL_ID) || lowerModelId.startsWith(ANTHROPIC_OPUS_47_DOT_MODEL_ID) || lowerModelId.startsWith(ANTHROPIC_SONNET_47_MODEL_ID) || lowerModelId.startsWith(ANTHROPIC_SONNET_47_DOT_MODEL_ID) || lowerModelId.startsWith(ANTHROPIC_OPUS_46_MODEL_ID) || lowerModelId.startsWith(ANTHROPIC_OPUS_46_DOT_MODEL_ID) || lowerModelId.startsWith(ANTHROPIC_SONNET_46_MODEL_ID) || lowerModelId.startsWith(ANTHROPIC_SONNET_46_DOT_MODEL_ID);\n"
        "}"
    )
    if adaptive_anchor in src:
        src = src.replace(adaptive_anchor, adaptive_new)
    else:
        # This step stays optional (the bundle is still patched without it),
        # but say so instead of silently reporting full support below.
        print(
            f"[patch] WARNING: adaptive-thinking anchor not found in {path}; "
            "4-7 adaptive-thinking default left unpatched",
            file=sys.stderr,
        )

    with open(path, "w", encoding="utf-8") as fh:
        fh.write(src)
    print(f"[patch] {path}: added claude-opus-4-7 / claude-sonnet-4-7 forward-compat support")
    return True
|
||||
|
||||
|
||||
def main() -> None:
    """Patch the pi-ai catalog and every matching register bundle, then summarise.

    Missing targets are reported as warnings on stderr and do not change the
    exit status. The final summary line distinguishes three outcomes: at least
    one file was rewritten, targets existed but none needed changes, or no
    target file was found at all.
    """
    import glob

    any_changed = False
    any_target = False
    if os.path.exists(PI_AI_CATALOG):
        any_target = True
        any_changed |= patch_pi_ai_catalog(PI_AI_CATALOG)
    else:
        print(f"[patch] WARNING: {PI_AI_CATALOG} not found", file=sys.stderr)

    candidates = sorted(glob.glob(ANTHROPIC_REGISTER_GLOB))
    if not candidates:
        print(f"[patch] WARNING: no files match {ANTHROPIC_REGISTER_GLOB}", file=sys.stderr)
    for cand in candidates:
        any_target = True
        any_changed |= patch_openclaw_anthropic_register(cand)

    if any_changed:
        print("[patch] success")
    elif any_target:
        print("[patch] no changes applied (already patched)")
    else:
        # Nothing was inspected, so claiming "already patched" would be wrong.
        print("[patch] no changes applied (no patch targets found)")
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
26
profiles/frontier_deepseek_v4.yaml
Normal file
26
profiles/frontier_deepseek_v4.yaml
Normal file
@ -0,0 +1,26 @@
|
||||
profile:
|
||||
name: frontier-deepseek-v4
|
||||
base_model: deepseek/v4-pro
|
||||
notes: |
|
||||
Frontier agentic coding model comparison: DeepSeek V4.
|
||||
DeepSeek direct API. Plugin stack IDENTICAL across all 10 profiles so the
|
||||
base model is the only structural variable. Any score delta is attributable
|
||||
to the model, not the scaffold.
|
||||
plugins:
|
||||
enabled:
|
||||
- anthropic
|
||||
- id: memory-lancedb
|
||||
config:
|
||||
dimensions: 1536
|
||||
- browser-playwright
|
||||
slots:
|
||||
memory: memory-lancedb
|
||||
contextEngine: builtin
|
||||
tools_allow:
|
||||
- bash
|
||||
- file_read
|
||||
- file_edit
|
||||
- browser_navigate
|
||||
- browser_click
|
||||
- memory_read
|
||||
- memory_write
|
||||
26
profiles/frontier_gpt_5_2.yaml
Normal file
26
profiles/frontier_gpt_5_2.yaml
Normal file
@ -0,0 +1,26 @@
|
||||
profile:
|
||||
name: frontier-gpt-5-2
|
||||
base_model: openai/gpt-5.2
|
||||
notes: |
|
||||
Frontier agentic coding model comparison: GPT-5.2 (closed).
|
||||
OpenAI mid-tier flagship. Plugin stack IDENTICAL across all profiles so the base
|
||||
model is the only structural variable. Any score delta is attributable
|
||||
to the model, not the scaffold.
|
||||
plugins:
|
||||
enabled:
|
||||
- anthropic
|
||||
- id: memory-lancedb
|
||||
config:
|
||||
dimensions: 1536
|
||||
- browser-playwright
|
||||
slots:
|
||||
memory: memory-lancedb
|
||||
contextEngine: builtin
|
||||
tools_allow:
|
||||
- bash
|
||||
- file_read
|
||||
- file_edit
|
||||
- browser_navigate
|
||||
- browser_click
|
||||
- memory_read
|
||||
- memory_write
|
||||
26
profiles/frontier_gpt_5_5.yaml
Normal file
26
profiles/frontier_gpt_5_5.yaml
Normal file
@ -0,0 +1,26 @@
|
||||
profile:
|
||||
name: frontier-gpt-5-5
|
||||
base_model: openai/gpt-5.5
|
||||
notes: |
|
||||
Frontier agentic coding model comparison: GPT-5.5 (closed).
|
||||
OpenAI flagship. Plugin stack IDENTICAL across all frontier profiles so
|
||||
the base model is the only structural variable. Any score delta is
|
||||
attributable to the model, not the scaffold.
|
||||
plugins:
|
||||
enabled:
|
||||
- anthropic
|
||||
- id: memory-lancedb
|
||||
config:
|
||||
dimensions: 1536
|
||||
- browser-playwright
|
||||
slots:
|
||||
memory: memory-lancedb
|
||||
contextEngine: builtin
|
||||
tools_allow:
|
||||
- bash
|
||||
- file_read
|
||||
- file_edit
|
||||
- browser_navigate
|
||||
- browser_click
|
||||
- memory_read
|
||||
- memory_write
|
||||
26
profiles/frontier_kimi_k26.yaml
Normal file
26
profiles/frontier_kimi_k26.yaml
Normal file
@ -0,0 +1,26 @@
|
||||
profile:
|
||||
name: frontier-kimi-k26
|
||||
base_model: openrouter/moonshotai/kimi-k2.6
|
||||
notes: |
|
||||
Frontier agentic coding model comparison: Kimi K2.6 (open).
|
||||
Moonshot AI newer revision. Plugin stack IDENTICAL across profiles so
|
||||
the base model is the only structural variable. Any score delta is
|
||||
attributable to the model, not the scaffold.
|
||||
plugins:
|
||||
enabled:
|
||||
- anthropic
|
||||
- id: memory-lancedb
|
||||
config:
|
||||
dimensions: 1536
|
||||
- browser-playwright
|
||||
slots:
|
||||
memory: memory-lancedb
|
||||
contextEngine: builtin
|
||||
tools_allow:
|
||||
- bash
|
||||
- file_read
|
||||
- file_edit
|
||||
- browser_navigate
|
||||
- browser_click
|
||||
- memory_read
|
||||
- memory_write
|
||||
26
profiles/frontier_opus_4_7.yaml
Normal file
26
profiles/frontier_opus_4_7.yaml
Normal file
@ -0,0 +1,26 @@
|
||||
profile:
|
||||
name: frontier-opus-4-7
|
||||
base_model: anthropic/claude-opus-4-7
|
||||
notes: |
|
||||
Frontier agentic coding model comparison: Claude Opus 4.7 (closed).
|
||||
Anthropic flagship, newer revision. Plugin stack IDENTICAL to opus-4-6
|
||||
and the other frontier profiles so the base model is the only structural
|
||||
variable. Any score delta is attributable to the model, not the scaffold.
|
||||
plugins:
|
||||
enabled:
|
||||
- anthropic
|
||||
- id: memory-lancedb
|
||||
config:
|
||||
dimensions: 1536
|
||||
- browser-playwright
|
||||
slots:
|
||||
memory: memory-lancedb
|
||||
contextEngine: builtin
|
||||
tools_allow:
|
||||
- bash
|
||||
- file_read
|
||||
- file_edit
|
||||
- browser_navigate
|
||||
- browser_click
|
||||
- memory_read
|
||||
- memory_write
|
||||
26
profiles/frontier_sonnet_4_6.yaml
Normal file
26
profiles/frontier_sonnet_4_6.yaml
Normal file
@ -0,0 +1,26 @@
|
||||
profile:
|
||||
name: frontier-sonnet-4-6
|
||||
base_model: anthropic/claude-sonnet-4-6
|
||||
notes: |
|
||||
Frontier agentic coding model comparison: Claude Sonnet 4.6 (closed).
|
||||
Anthropic mid-tier flagship. Plugin stack IDENTICAL across all profiles so the base
|
||||
model is the only structural variable. Any score delta is attributable
|
||||
to the model, not the scaffold.
|
||||
plugins:
|
||||
enabled:
|
||||
- anthropic
|
||||
- id: memory-lancedb
|
||||
config:
|
||||
dimensions: 1536
|
||||
- browser-playwright
|
||||
slots:
|
||||
memory: memory-lancedb
|
||||
contextEngine: builtin
|
||||
tools_allow:
|
||||
- bash
|
||||
- file_read
|
||||
- file_edit
|
||||
- browser_navigate
|
||||
- browser_click
|
||||
- memory_read
|
||||
- memory_write
|
||||
@ -13,7 +13,7 @@ dependencies = [
|
||||
"gradio>=5.0,<6",
|
||||
"httpx>=0.27,<1",
|
||||
"numpy>=1.26,<3",
|
||||
"rich>=13.0,<14",
|
||||
"rich>=13.0,<15",
|
||||
"click>=8.1,<9",
|
||||
# Runtime deps for the task completion verifier. The harness shells out
|
||||
# to `pytest -q` / `pytest-asyncio` inside per-task workspaces as the
|
||||
@ -30,6 +30,9 @@ dev = [
|
||||
"pytest>=8.0,<9",
|
||||
"pytest-asyncio>=0.24,<1",
|
||||
]
|
||||
hermes = [
|
||||
"hermes-agent @ git+https://github.com/NousResearch/hermes-agent.git@main",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
clawbench = "clawbench.cli:main"
|
||||
@ -45,3 +48,6 @@ force-include = { "tasks-public" = "tasks-public", "profiles" = "profiles", "bas
|
||||
[tool.pytest.ini_options]
|
||||
asyncio_mode = "auto"
|
||||
testpaths = ["tests"]
|
||||
|
||||
[tool.hatch.metadata]
|
||||
allow-direct-references = true
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
#!/bin/bash
|
||||
# Shared helper sourced by container_sweep_*.sh scripts to snapshot the
|
||||
# per-model run_cache after a sweep completes. Called at END of each sweep.
|
||||
# Shared helper sourced by container runner scripts to snapshot the per-model
|
||||
# run_cache after a sweep completes. Called at END of each sweep.
|
||||
#
|
||||
# Requires these env vars (already set by parent script):
|
||||
# CLAWBENCH_RUN_CACHE_DIR - e.g. /data/run_cache
|
||||
# CACHE_SUB - e.g. openai_gpt-5.4
|
||||
# SWEEP_OUT_TAG - e.g. v2026-4-18-pr68627-gpt54
|
||||
# SWEEP_OUT_TAG - e.g. core-v1-public
|
||||
# SWEEP_LABEL - e.g. gpt54
|
||||
# SWEEP_LOGDIR - e.g. /data/drift_2026-04-18-pr68627-gpt54
|
||||
# SWEEP_LOGDIR - e.g. /data/core-v1-public
|
||||
#
|
||||
# Writes snapshot to: /data/run_cache_archive/<SWEEP_OUT_TAG>/<CACHE_SUB>/
|
||||
# Also writes a metadata.json with sweep label/model/timestamp for indexing.
|
||||
|
||||
@ -1,255 +0,0 @@
|
||||
"""Per-run 1-to-1 audit across every (model, task, run_idx) triple.
|
||||
|
||||
Flags issues beyond aggregate coverage:
|
||||
- Tasks where ALL models score 0 (task broken / verifier rejects everyone)
|
||||
- Tasks where models produce output but all get C=0 (verifier bug)
|
||||
- Tasks with suspiciously high cross-model infra-failure rates (harness bug)
|
||||
- Specific runs with harness errors (timeout, handshake)
|
||||
- Models with task-specific pathology (e.g., always fails on t3-X)
|
||||
- Judge failures per-task that haven't been rejudged
|
||||
- Missing runs in archive (logged but not cached)
|
||||
|
||||
Usage: python3 scripts/audit_per_run.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parent.parent
|
||||
DRIFT = ROOT / "data" / "drift_2026-04-19-full"
|
||||
ARCH = ROOT / "data" / "run_cache_archive" / "v2026-4-19-full"
|
||||
|
||||
MODEL_MAP = {
|
||||
"opus46": ("anthropic_claude-opus-4-6", "opus-4-6"),
|
||||
"opus47": ("anthropic_claude-opus-4-7", "opus-4-7"),
|
||||
"sonnet46": ("anthropic_claude-sonnet-4-6", "sonnet-4-6"),
|
||||
"gpt54": ("openai_gpt-5.4", "gpt-5.4"),
|
||||
"gemini": ("google_gemini-3.1-pro-preview", "gemini-3.1-pro"),
|
||||
"glm": ("openrouter_z-ai_glm-5.1", "glm-5.1"),
|
||||
"minimax": ("openrouter_minimax_minimax-m2.7", "minimax-m2.7"),
|
||||
"kimi": ("openrouter_moonshotai_kimi-k2.5", "kimi-k2.5"),
|
||||
"qwen": ("openrouter_qwen_qwen3.6-plus", "qwen-3.6-plus"),
|
||||
}
|
||||
|
||||
LOG_LINE = re.compile(
|
||||
r"^\[(\d+)/120\]\s+(\S+)\s+\([^)]+\)\s+run\s+(\d+):\s+([+\-~])\s+([\d.]+)"
|
||||
)
|
||||
HARNESS_ERR = re.compile(r"ERROR clawbench\.harness: Run (\S+)/(\d+) failed")
|
||||
JUDGE_INFRA_PHRASES = [
|
||||
"gateway is restarting", "judge execution failed", "judge failed to run",
|
||||
"judge call failed", "judge timed out",
|
||||
]
|
||||
|
||||
|
||||
def parse_log(log_path: Path):
    """Extract per-run scores and harness failures from one sweep log.

    Returns a ``(runs, errors)`` pair:

    * ``runs`` maps ``(task_id, run_idx)`` to ``{"score", "outcome"}`` for
      every line matching ``LOG_LINE``.  The run number printed in the log
      is shifted down by one before being used as ``run_idx``.
    * ``errors`` maps ``(task_id, run_idx)`` to ``"harness_error"`` for every
      line matching ``HARNESS_ERR``.

    A missing log file yields two empty dicts.  When the same key shows up
    more than once, the last occurrence wins.
    """
    scored = {}
    failed = {}
    if log_path.exists():
        for raw in log_path.read_text(errors="ignore").splitlines():
            hit = LOG_LINE.match(raw.strip())
            if hit is not None:
                _seq, task_id, run_no, outcome, score = hit.groups()
                scored[(task_id, int(run_no) - 1)] = {"score": float(score), "outcome": outcome}
            err = HARNESS_ERR.search(raw)
            if err is not None:
                # NOTE(review): unlike the score lines, this index is used as
                # printed (no "- 1") -- confirm the harness error message is
                # already 0-based, otherwise these keys are off by one.
                failed[(err.group(1), int(err.group(2)))] = "harness_error"
    return scored, failed
|
||||
|
||||
|
||||
def scan_archive(cache_dir: Path):
    """Summarize every archived ``runN.json`` found under *cache_dir*.

    *cache_dir* is expected to contain one sub-directory per task, each
    holding ``run<N>.json`` files.  The result maps ``(task_dir_name, N)`` to
    a flat dict with these keys:

    ``run_score``, ``c``, ``t``, ``b``
        Overall score and the completion / trajectory / behavior scores
        (``0`` when the field or its section is absent).
    ``j``
        Judge score, or ``None`` when the judge was not enabled.
    ``judge_infra_failed``
        True when the judge was enabled, the run has not been rejudged, and
        the judge result looks like an infrastructure failure (a known
        failure phrase in the reason, an ``error`` field, or an empty reason
        together with a zero score).
    ``rejudged``, ``delivery``, ``failure_mode``, ``error``
        Copied from the record.
    ``n_messages``, ``has_assistant_text``
        Transcript size and whether any assistant message carries text.

    Files that cannot be read or parsed, or whose top level is not a JSON
    object, are skipped.  A missing *cache_dir* yields an empty dict.
    """

    def section(record, key):
        # A section stored as JSON null (or any non-object) is treated as absent,
        # so one malformed record cannot abort the whole scan.
        value = record.get(key)
        return value if isinstance(value, dict) else {}

    out = {}
    if not cache_dir.exists():
        return out
    for tdir in cache_dir.iterdir():
        if not tdir.is_dir():
            continue
        for rf in tdir.glob("run*.json"):
            name_match = re.match(r"run(\d+)\.json", rf.name)
            if not name_match:
                continue
            try:
                # Context manager so the handle is closed even when parsing fails.
                with open(rf, encoding="utf-8") as fh:
                    d = json.load(fh)
            except (OSError, ValueError):
                # Unreadable, badly encoded or malformed JSON: best-effort skip.
                continue
            if not isinstance(d, dict):
                continue

            jr = section(d, "judge_result")
            reason = str(jr.get("reason") or "").lower()
            # Don't flag rejudged runs as infra-failed even if reason is empty:
            # a rejudged run has a real judge call behind it (rejudged_at field).
            judge_infra = (
                jr.get("enabled")
                and "rejudged_at" not in jr
                and (
                    any(p in reason for p in JUDGE_INFRA_PHRASES)
                    or jr.get("error")
                    or (not reason.strip() and jr.get("score", 0) == 0)
                )
            )

            messages = section(d, "transcript").get("messages")
            if not isinstance(messages, list):
                messages = []

            out[(tdir.name, int(name_match.group(1)))] = {
                "run_score": d.get("run_score", 0),
                "c": section(d, "completion_result").get("score", 0),
                "t": section(d, "trajectory_result").get("score", 0),
                "b": section(d, "behavior_result").get("score", 0),
                "j": jr.get("score", 0) if jr.get("enabled") else None,
                "judge_infra_failed": bool(judge_infra),
                "rejudged": "rejudged_at" in jr,
                "delivery": d.get("delivery_outcome"),
                "failure_mode": d.get("failure_mode"),
                "error": d.get("error"),
                "n_messages": len(messages),
                "has_assistant_text": any(
                    isinstance(msg, dict)
                    and msg.get("role") == "assistant"
                    and bool(msg.get("text"))
                    for msg in messages
                ),
            }
    return out
|
||||
|
||||
|
||||
def main():
    """Run the per-run audit and print each issue category to stdout.

    For every model in ``MODEL_MAP`` the sweep log and the archived run
    JSONs are loaded, then the three run slots of every task are inspected
    to report: tasks that fail on all models, tasks whose verifier rejects
    produced output, harness-error clusters, judge-infra clusters, per-model
    all-zero tasks, and models with incomplete coverage.
    """
    # Load the log and the archive for each model, keyed by pretty name.
    per_model = {}
    for label, (cache_sub, pretty) in MODEL_MAP.items():
        logged, errors = parse_log(DRIFT / f"docker_{label}_v2026-4-19-full.log")
        per_model[pretty] = {
            "logged": logged,
            "errors": errors,
            "archived": scan_archive(ARCH / cache_sub),
        }

    # Every task id seen in any model's archive or log.
    all_tasks = {
        key[0]
        for data in per_model.values()
        for source in ("archived", "logged")
        for key in data[source]
    }

    issues = defaultdict(list)

    for task in sorted(all_tasks):
        all_scores = []
        all_cs = []
        all_outputs = []  # did the model produce assistant text?
        all_judge_infra = 0
        all_harness_err = 0
        for data in per_model.values():
            for run_idx in range(3):
                key = (task, run_idx)
                archived = data["archived"].get(key)
                logged = data["logged"].get(key)
                if archived:
                    # The archive is preferred over the log when both exist.
                    all_scores.append(archived["run_score"])
                    all_cs.append(archived["c"])
                    all_outputs.append(archived["has_assistant_text"])
                    if archived["judge_infra_failed"]:
                        all_judge_infra += 1
                elif logged:
                    all_scores.append(logged["score"])
                if key in data["errors"]:
                    all_harness_err += 1

        if not all_scores:
            continue
        mean_score = sum(all_scores) / len(all_scores)
        mean_c = sum(all_cs) / len(all_cs) if all_cs else 0
        output_rate = sum(all_outputs) / len(all_outputs) if all_outputs else 0

        if mean_score < 0.1:
            issues["task_fails_all_models"].append((task, mean_score, output_rate))
        if mean_c < 0.05 and output_rate > 0.5:
            issues["verifier_rejects_valid_outputs"].append((task, mean_c, output_rate))
        if all_harness_err >= 5:
            issues["harness_errors_cluster"].append((task, all_harness_err))
        if all_judge_infra >= 5:
            issues["judge_infra_cluster"].append((task, all_judge_infra))

    def banner(title):
        # Section heading framed by two 70-character rules.
        print("=" * 70)
        print(title)
        print("=" * 70)

    banner("ISSUE: Tasks where ALL models score near-zero (broken verifier or task)")
    for task, mean, out_rate in sorted(issues["task_fails_all_models"]):
        print(f" {task:<40} mean_score={mean:.3f} assistant_output_rate={out_rate:.1%}")

    print()
    banner("ISSUE: Verifier rejects valid outputs (model produced text but C=0)")
    for task, mean_c, out_rate in sorted(issues["verifier_rejects_valid_outputs"]):
        print(f" {task:<40} mean_completion={mean_c:.3f} assistant_output_rate={out_rate:.1%}")

    print()
    banner("ISSUE: Harness-error clusters (gateway failures per task)")
    for task, n in sorted(issues["harness_errors_cluster"], key=lambda x: -x[1]):
        print(f" {task:<40} harness_error_count={n}")

    print()
    banner("ISSUE: Judge-infra clusters (judge failing per task)")
    for task, n in sorted(issues["judge_infra_cluster"], key=lambda x: -x[1]):
        print(f" {task:<40} judge_infra_failures={n} (should be rejudged)")

    # Per-model pathologies: a task is listed only when all three run slots
    # have a record and none of them scored above 0.01.
    print()
    banner("ISSUE: Model-specific task pathologies (all 3 runs of a task scored 0 on one model)")
    for pretty, data in per_model.items():
        zero_tasks = []
        for task in sorted(all_tasks):
            for run_idx in range(3):
                key = (task, run_idx)
                record = data["archived"].get(key)
                if record:
                    score = record["run_score"]
                else:
                    record = data["logged"].get(key)
                    if not record:
                        break  # missing slot: all-zero cannot be confirmed
                    score = record["score"]
                if score > 0.01:
                    break
            else:
                zero_tasks.append(task)
        if zero_tasks:
            print(f" {pretty:<18}: all-zero on {len(zero_tasks)} tasks")
            for t in zero_tasks[:6]:  # only the first six are listed
                print(f" - {t}")

    print()
    banner("COVERAGE: Models with non-complete coverage (logged != 120 or archived != 120)")
    for pretty, data in per_model.items():
        n_log = len(data["logged"])
        n_arch = len(data["archived"])
        if n_log < 120 or n_arch < 120:
            print(f" {pretty:<18} logged={n_log:<4} archived={n_arch:<4} missing={120 - max(n_log, n_arch)}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,207 +0,0 @@
|
||||
"""Comprehensive per-run audit across all models in drift_2026-04-19-full.
|
||||
|
||||
For each model, cross-references:
|
||||
1. Log file (docker_<label>_<tag>.log) — all [N/120] run attempts + their scores
|
||||
2. Archived per-run JSONs (run_cache_archive/<tag>/<cache_sub>/<task>/runN.json)
|
||||
3. Judge status per cached run (rejudged via direct API or not)
|
||||
|
||||
Outputs a fair-comparison table: coverage %, infra-failure %, clean mean,
|
||||
coverage-normalized score, judge coverage.
|
||||
|
||||
Usage:
|
||||
python3 scripts/audit_runs.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parent.parent
|
||||
DRIFT = ROOT / "data" / "drift_2026-04-19-full"
|
||||
ARCH = ROOT / "data" / "run_cache_archive" / "v2026-4-19-full"
|
||||
|
||||
# Model label (in log filenames) → (cache_sub, pretty name)
|
||||
MODEL_MAP = {
|
||||
"opus46": ("anthropic_claude-opus-4-6", "opus-4-6"),
|
||||
"opus47": ("anthropic_claude-opus-4-7", "opus-4-7"),
|
||||
"sonnet46": ("anthropic_claude-sonnet-4-6", "sonnet-4-6"),
|
||||
"gpt54": ("openai_gpt-5.4", "gpt-5.4"),
|
||||
"gemini": ("google_gemini-3.1-pro-preview", "gemini-3.1-pro"),
|
||||
"glm": ("openrouter_z-ai_glm-5.1", "glm-5.1"),
|
||||
"minimax": ("openrouter_minimax_minimax-m2.7", "minimax-m2.7"),
|
||||
"kimi": ("openrouter_moonshotai_kimi-k2.5", "kimi-k2.5"),
|
||||
"qwen": ("openrouter_qwen_qwen3.6-plus", "qwen-3.6-plus"),
|
||||
}
|
||||
|
||||
# Regex to parse "[N/120] task (tier/family) run R: + 0.93 C=1.00 T=0.90 ..."
|
||||
LOG_LINE = re.compile(
|
||||
r"^\[(\d+)/120\]\s+(\S+)\s+\([^)]+\)\s+run\s+(\d+):\s+([+\-~])\s+([\d.]+)"
|
||||
)
|
||||
JUDGE_INFRA_PHRASES = [
|
||||
"gateway is restarting",
|
||||
"judge execution failed",
|
||||
"judge failed to run",
|
||||
"judge call failed",
|
||||
"judge timed out",
|
||||
]
|
||||
|
||||
|
||||
def parse_log(path: Path) -> dict:
    """Collect the scored run lines of one sweep log.

    Returns ``{(task_id, run_idx): {"score", "outcome", "seq"}}`` where
    ``outcome`` is the ``+``/``-``/``~`` marker and ``seq`` is the leading
    ``[N/120]`` counter.  A missing file yields an empty dict.
    """
    if not path.exists():
        return {}
    found = {}
    for raw in path.read_text(errors="ignore").splitlines():
        hit = LOG_LINE.match(raw.strip())
        if hit is None:
            continue
        seq, task_id, run_no, outcome, score = hit.groups()
        # The log counts runs from 1 while the archive files are runN.json
        # from 0, so shift to 0-based to make the two key spaces line up.
        # A repeated key is overwritten: the latest attempt (retry) wins.
        found[(task_id, int(run_no) - 1)] = {
            "score": float(score),
            "outcome": outcome,
            "seq": int(seq),
        }
    return found
|
||||
|
||||
|
||||
def scan_archive(cache_dir: Path) -> dict:
    """Summarize every archived ``runN.json`` found under *cache_dir*.

    *cache_dir* is expected to hold one sub-directory per task.  The result
    maps ``(task_dir_name, N)`` to a dict with the keys ``run_score``,
    ``completion``, ``judge_score`` (``None`` when the judge was not
    enabled), ``judge_infra_failed``, ``rejudged``, ``delivery`` and
    ``failure_mode``.

    ``judge_infra_failed`` is True when the judge was enabled and its result
    looks like an infrastructure failure: a known failure phrase in the
    reason, an ``error`` field, or an empty reason with a zero score.  It
    does not look at ``rejudged``.

    Files that cannot be read or parsed, or whose top level is not a JSON
    object, are skipped.  A missing *cache_dir* yields an empty dict.
    """

    def section(record, key):
        # A section stored as JSON null (or any non-object) is treated as absent.
        value = record.get(key)
        return value if isinstance(value, dict) else {}

    out = {}
    if not cache_dir.exists():
        return out
    for tdir in cache_dir.iterdir():
        if not tdir.is_dir():
            continue
        for rf in tdir.glob("run*.json"):
            m_run = re.match(r"run(\d+)\.json", rf.name)
            if not m_run:
                continue
            try:
                # Context manager so the handle is closed even when parsing fails.
                with open(rf, encoding="utf-8") as fh:
                    d = json.load(fh)
            except (OSError, ValueError):
                # Unreadable, badly encoded or malformed JSON: best-effort skip.
                continue
            if not isinstance(d, dict):
                continue

            jr = section(d, "judge_result")
            reason = str(jr.get("reason") or "").lower()
            # Checking "enabled" first gives the same truth value as before and
            # skips the phrase scan for runs that never had a judge.
            judge_infra = jr.get("enabled") and (
                any(p in reason for p in JUDGE_INFRA_PHRASES)
                or jr.get("error")
                or (not reason.strip() and jr.get("score", 0) == 0)
            )
            out[(tdir.name, int(m_run.group(1)))] = {
                "run_score": d.get("run_score", 0),
                "completion": section(d, "completion_result").get("score", 0),
                "judge_score": jr.get("score", 0) if jr.get("enabled") else None,
                "judge_infra_failed": bool(judge_infra),
                "rejudged": "rejudged_at" in jr,
                "delivery": d.get("delivery_outcome"),
                "failure_mode": d.get("failure_mode"),
            }
    return out
|
||||
|
||||
|
||||
def audit_model(label: str, cache_sub: str, pretty: str) -> dict:
    """Cross-reference one model's sweep log with its archived runs.

    Args:
        label: Model label used in the log file name
            (``docker_<label>_v2026-4-19-full.log`` under ``DRIFT``).
        cache_sub: Name of the model's directory under ``ARCH``.
        pretty: Display name, passed through to the result.

    Returns:
        A dict of counts and means: ``label``, ``pretty``, ``n_log_entries``,
        ``n_archived``, ``n_missing_from_archive``, ``n_clean_runs``,
        ``n_archived_zero``, ``n_logged_infra_zero``, ``n_judge_infra_failed``,
        ``n_rejudged``, ``coverage_pct``, ``clean_mean``, ``all_mean`` and
        ``coverage_normalized``.
    """
    # Fixed denominator; matches the "/120" that the log-line regex expects.
    expected = 120

    logged = parse_log(DRIFT / f"docker_{label}_v2026-4-19-full.log")
    archived = scan_archive(ARCH / cache_sub)
    not_archived = [k for k in logged if k not in archived]

    # A run is "clean" when its score is at least 0.01.
    # NOTE(review): judge-infra failures are only counted, they are NOT
    # excluded from clean_runs -- confirm that is the intended metric.
    clean_runs = []       # (key, score) for every non-zero run
    infra_zero_runs = []  # logged at ~0 and never landed in the archive
    archived_zero = []    # archived with run_score ~0
    judge_infra = []      # archived, judge infra-failed, not rejudged
    rejudged = []         # archived with a rejudged_at marker

    for k, a in archived.items():
        if a["judge_infra_failed"] and not a["rejudged"]:
            judge_infra.append(k)
        if a["rejudged"]:
            rejudged.append(k)
        if a["run_score"] < 0.01:
            archived_zero.append(k)
        else:
            clean_runs.append((k, a["run_score"]))

    # Logged-only runs: the archive has no record, so fall back to the log score.
    for k in not_archived:
        if logged[k]["score"] < 0.01:
            infra_zero_runs.append(k)
        else:
            clean_runs.append((k, logged[k]["score"]))

    # Every attempt, archive first and then the logged-only remainder.
    all_scores = [a["run_score"] for a in archived.values()]
    all_scores.extend(logged[k]["score"] for k in not_archived)

    clean_scores = [s for _, s in clean_runs]
    clean_mean = sum(clean_scores) / len(clean_scores) if clean_scores else 0
    all_mean = sum(all_scores) / len(all_scores) if all_scores else 0
    # Missing or zeroed runs contribute 0, so this is simply the clean total
    # spread over the full expected run count.
    coverage_normalized = sum(clean_scores) / expected

    return {
        "label": label,
        "pretty": pretty,
        "n_log_entries": len(logged),
        "n_archived": len(archived),
        "n_missing_from_archive": len(not_archived),
        "n_clean_runs": len(clean_runs),
        "n_archived_zero": len(archived_zero),
        "n_logged_infra_zero": len(infra_zero_runs),
        "n_judge_infra_failed": len(judge_infra),
        "n_rejudged": len(rejudged),
        "coverage_pct": 100.0 * len(clean_runs) / expected,
        "clean_mean": clean_mean,
        "all_mean": all_mean,
        "coverage_normalized": coverage_normalized,
    }
|
||||
|
||||
|
||||
def main():
    """Print the per-model comparison table followed by a column legend.

    One row per entry of ``MODEL_MAP``, ordered by descending
    coverage-normalized score.
    """
    print(f"{'Model':<16} {'Logged':>7} {'Archv':>6} {'Clean':>6} {'Cov%':>5} {'all_mean':>8} {'clean':>7} {'cov_norm':>8} {'infra_0':>8} {'j_rejdg':>8} {'j_failed':>8}")
    print(f"{'-'*16} {'-'*7} {'-'*6} {'-'*6} {'-'*5} {'-'*8} {'-'*7} {'-'*8} {'-'*8} {'-'*8} {'-'*8}")

    audits = [
        audit_model(label, cache_sub, pretty)
        for label, (cache_sub, pretty) in MODEL_MAP.items()
    ]

    # Highest coverage-normalized score first.
    for r in sorted(audits, key=lambda row: -row["coverage_normalized"]):
        identity = f" {r['pretty']:<14} {r['n_log_entries']:>7} {r['n_archived']:>6} "
        coverage = f"{r['n_clean_runs']:>6} {r['coverage_pct']:>4.0f}% "
        means = f"{r['all_mean']:>8.4f} {r['clean_mean']:>7.4f} "
        normalized = f"{r['coverage_normalized']:>8.4f} "
        # The infra_0 column merges the logged-only and the archived zero-score runs.
        zeros = f"{r['n_logged_infra_zero']+r['n_archived_zero']:>8} "
        judge = f"{r['n_rejudged']:>8} {r['n_judge_infra_failed']:>8}"
        print(identity + coverage + means + normalized + zeros + judge)

    # Explain what each column means.
    print()
    print("Legend:")
    for entry in (
        " all_mean = mean of ALL attempts (log+archive merged; infra-zeros pull this DOWN)",
        " clean = mean excluding infra-failed runs (shows capability ceiling)",
        " cov_norm = clean*coverage + 0*missing; all models scored against 120-run denominator",
        " infra_0 = runs that scored 0 due to infrastructure (gateway/state/handshake failures)",
        " j_rejdg = judge scores that have been rejudged via direct Anthropic API",
        " j_failed = judge infra-failures that have NOT been rejudged",
    ):
        print(entry)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
460
scripts/container_adapter_eval.sh
Normal file
460
scripts/container_adapter_eval.sh
Normal file
@ -0,0 +1,460 @@
|
||||
#!/bin/bash
|
||||
# Fair adapter lane runner.
|
||||
#
|
||||
# Runs one adapter/model pair inside a container-owned workspace/state dir.
|
||||
# Use docker run with full container privileges when measuring harnesses:
|
||||
# docker run --rm --privileged --cap-add=ALL \
|
||||
# --security-opt seccomp=unconfined --security-opt apparmor=unconfined \
|
||||
# --user root --env-file .tmp/docker_eval.env \
|
||||
# -e SWEEP_ADAPTER=hermes -e SWEEP_MODEL=openai/gpt-5.4 \
|
||||
# -e SWEEP_LABEL=hermes-gpt54 -e SWEEP_OUT_TAG=fair-20260425 \
|
||||
# -v "$PWD/data/fair-container:/data" \
|
||||
# -v "$PWD/data/container-home-openclaw:/config/openclaw:ro" \
|
||||
# clawbench-fair:latest
|
||||
|
||||
set -u
|
||||
|
||||
: "${SWEEP_ADAPTER:?SWEEP_ADAPTER required (openclaw|hermes)}"
|
||||
: "${SWEEP_MODEL:?SWEEP_MODEL required (e.g. openai/gpt-5.4)}"
|
||||
: "${SWEEP_LABEL:?SWEEP_LABEL required}"
|
||||
: "${SWEEP_OUT_TAG:=fair-container}"
|
||||
: "${SWEEP_LOGDIR:=/data/fair_results}"
|
||||
: "${SWEEP_RUNS:=1}"
|
||||
: "${SWEEP_CONCURRENCY:=1}"
|
||||
: "${SWEEP_BROWSER_CONCURRENCY:=1}"
|
||||
: "${CLAWBENCH_PER_RUN_BUDGET_SECONDS:=300}"
|
||||
: "${CLAWBENCH_PER_TURN_TIMEOUT_SECONDS:=180}"
|
||||
: "${HERMES_MAX_ITERATIONS:=90}"
|
||||
: "${HERMES_STEP_TIMEOUT_SECONDS:=60}"
|
||||
: "${OPENCLAW_EXEC_HOST:=gateway}"
|
||||
|
||||
cd /home/node/app
|
||||
mkdir -p "$SWEEP_LOGDIR" /data/run_cache
|
||||
|
||||
export OPENCLAW_GATEWAY_TOKEN="${OPENCLAW_GATEWAY_TOKEN:-local-dev-token-for-testing}"
|
||||
export OPENCLAW_GATEWAY_URL="${OPENCLAW_GATEWAY_URL:-ws://127.0.0.1:18789}"
|
||||
export OPENCLAW_SKIP_GMAIL_WATCHER=1
|
||||
export OPENCLAW_SKIP_CANVAS_HOST=1
|
||||
export OPENCLAW_NO_RESPAWN=1
|
||||
export CLAWBENCH_DISABLE_GATEWAY_DEVICE_IDENTITY=1
|
||||
export NODE_OPTIONS="${NODE_OPTIONS:-"--max-old-space-size=4096"}"
|
||||
if command -v npm >/dev/null 2>&1; then
|
||||
export NODE_PATH="${NODE_PATH:-$(npm root -g 2>/dev/null || true)}"
|
||||
fi
|
||||
export CLAWBENCH_PER_RUN_BUDGET_SECONDS
|
||||
export CLAWBENCH_PER_TURN_TIMEOUT_SECONDS
|
||||
export HERMES_AGENT_REPO="${HERMES_AGENT_REPO:-/opt/hermes-agent}"
|
||||
export HERMES_DRIVER="${HERMES_DRIVER:-ai_agent}"
|
||||
export HERMES_TOOLSETS="${HERMES_TOOLSETS:-hermes-api-server}"
|
||||
export HERMES_MAX_ITERATIONS
|
||||
export HERMES_STEP_TIMEOUT_SECONDS
|
||||
export TERMINAL_ENV="${TERMINAL_ENV:-local}"
|
||||
|
||||
safe_model="${SWEEP_MODEL//\//_}"
|
||||
safe_model="${safe_model//:/_}"
|
||||
safe_label="${SWEEP_LABEL//\//_}"
|
||||
safe_label="${safe_label//:/_}"
|
||||
export CLAWBENCH_RUN_CACHE_DIR="/data/run_cache/$safe_label"
|
||||
mkdir -p "$CLAWBENCH_RUN_CACHE_DIR"
|
||||
cache_sub="${SWEEP_ADAPTER}-${safe_model}"
|
||||
cache_paths=("$CLAWBENCH_RUN_CACHE_DIR/$cache_sub")
|
||||
if [ "$SWEEP_ADAPTER" = "openclaw" ]; then
|
||||
cache_paths+=("$CLAWBENCH_RUN_CACHE_DIR/$safe_model")
|
||||
fi
|
||||
|
||||
SRC_STATE="${OPENCLAW_CONFIG_SOURCE:-/config/openclaw}"
|
||||
if [ ! -d "$SRC_STATE" ]; then
|
||||
SRC_STATE="/home/node/.openclaw"
|
||||
fi
|
||||
# Throwaway HOME/state for this run, wiped first so nothing leaks in from a
# previous run that happened to reuse the same label and PID.
# Built from the sanitized label: a raw SWEEP_LABEL containing "/" would nest
# the scratch home inside an extra parent directory that the later
# `rm -rf "$FRESH_HOME"` never removes.
FRESH_HOME="/tmp/openclaw-home-${safe_label}-$$"
FRESH_STATE="$FRESH_HOME/.openclaw"
rm -rf "$FRESH_HOME"
mkdir -p "$FRESH_STATE" "$FRESH_HOME/.config"
|
||||
if [ -f "$SRC_STATE/openclaw.json" ]; then
|
||||
cp "$SRC_STATE/openclaw.json" "$FRESH_STATE/openclaw.json"
|
||||
fi
|
||||
mkdir -p \
|
||||
"$FRESH_STATE/agents" \
|
||||
"$FRESH_STATE/workspace" \
|
||||
"$FRESH_STATE/logs" \
|
||||
"$FRESH_STATE/memory" \
|
||||
"$FRESH_STATE/cache" \
|
||||
"$FRESH_STATE/identity" \
|
||||
"$FRESH_STATE/devices" \
|
||||
"$FRESH_STATE/tasks" \
|
||||
"$FRESH_STATE/subagents" \
|
||||
"$FRESH_STATE/flows" \
|
||||
"$FRESH_STATE/cron"
|
||||
chmod -R 777 "$FRESH_STATE" 2>/dev/null || true
|
||||
export HOME="$FRESH_HOME"
|
||||
export OPENCLAW_HOME="$FRESH_HOME"
|
||||
export OPENCLAW_STATE_DIR="$FRESH_STATE"
|
||||
export OPENCLAW_CONFIG_PATH="$FRESH_STATE/openclaw.json"
|
||||
export OPENCLAW_REPO="${OPENCLAW_REPO:-/app}"
|
||||
export XDG_CONFIG_HOME="$FRESH_HOME/.config"
|
||||
export HERMES_HOME_BASE="${HERMES_HOME_BASE:-$FRESH_HOME/.hermes}"
|
||||
export HERMES_HOME="$HERMES_HOME_BASE"
|
||||
mkdir -p "$HERMES_HOME"
|
||||
|
||||
if [ "$SWEEP_ADAPTER" = "hermes" ]; then
|
||||
unset HERMES_PROVIDER
|
||||
case "$SWEEP_MODEL" in
|
||||
openai/*)
|
||||
if [ -z "${OPENAI_API_KEY:-}" ] && [ -n "${HERMES_API_KEY:-}" ]; then
|
||||
export OPENAI_API_KEY="$HERMES_API_KEY"
|
||||
fi
|
||||
export HERMES_BASE_URL="${HERMES_BASE_URL:-${OPENAI_BASE_URL:-https://api.openai.com/v1}}"
|
||||
export OPENAI_BASE_URL="$HERMES_BASE_URL"
|
||||
if [ -n "${OPENAI_API_KEY:-}" ]; then
|
||||
export HERMES_API_KEY="$OPENAI_API_KEY"
|
||||
fi
|
||||
unset ANTHROPIC_API_KEY ANTHROPIC_TOKEN CLAUDE_CODE_OAUTH_TOKEN OPENROUTER_API_KEY
|
||||
;;
|
||||
anthropic/*)
|
||||
unset OPENAI_API_KEY OPENAI_BASE_URL HERMES_API_KEY HERMES_BASE_URL OPENROUTER_API_KEY
|
||||
;;
|
||||
*)
|
||||
if [ -n "${HERMES_BASE_URL:-}" ]; then
|
||||
export OPENAI_BASE_URL="$HERMES_BASE_URL"
|
||||
elif [ -z "${OPENAI_BASE_URL:-}" ] && [ -n "${OPENAI_API_KEY:-}" ]; then
|
||||
export OPENAI_BASE_URL="https://api.openai.com/v1"
|
||||
fi
|
||||
if [ -n "${HERMES_API_KEY:-}" ] && [ -z "${OPENAI_API_KEY:-}" ]; then
|
||||
export OPENAI_API_KEY="$HERMES_API_KEY"
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
fi
|
||||
|
||||
python - <<'PY'
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
cfg_path = Path(os.environ["OPENCLAW_CONFIG_PATH"])
|
||||
if not cfg_path.exists():
|
||||
raise SystemExit(0)
|
||||
|
||||
data = json.loads(cfg_path.read_text(encoding="utf-8"))
|
||||
|
||||
agents = data.get("agents")
|
||||
if isinstance(agents, dict):
|
||||
# Keep static defaults, but never seed eval containers with old session-specific
|
||||
# agent records from the developer machine.
|
||||
agents["list"] = []
|
||||
|
||||
channels = data.get("channels")
|
||||
if isinstance(channels, dict):
|
||||
for channel in channels.values():
|
||||
if isinstance(channel, dict):
|
||||
channel["enabled"] = False
|
||||
exec_approvals = channel.get("execApprovals")
|
||||
if not isinstance(exec_approvals, dict):
|
||||
exec_approvals = {}
|
||||
channel["execApprovals"] = exec_approvals
|
||||
exec_approvals["enabled"] = False
|
||||
|
||||
plugins = data.get("plugins")
|
||||
if isinstance(plugins, dict):
|
||||
stale = {"marxbiotech-git-tools", "lab"}
|
||||
allow = plugins.get("allow")
|
||||
if isinstance(allow, list):
|
||||
plugins["allow"] = [item for item in allow if item not in stale]
|
||||
entries = plugins.get("entries")
|
||||
if isinstance(entries, dict):
|
||||
for item in stale:
|
||||
entries.pop(item, None)
|
||||
|
||||
|
||||
def set_nested(root, dotted, value):
    """Assign *value* at the dot-separated path *dotted* inside dict *root*.

    Intermediate keys that are missing, or that hold anything other than a
    dict, are replaced by a fresh dict; existing dicts are reused in place.
    """
    *parents, leaf = dotted.split(".")
    node = root
    for name in parents:
        nxt = node.get(name)
        if not isinstance(nxt, dict):
            nxt = node[name] = {}
        node = nxt
    node[leaf] = value
|
||||
|
||||
|
||||
set_nested(data, "browser.headless", True)
|
||||
set_nested(data, "browser.noSandbox", True)
|
||||
set_nested(data, "gateway.reload.mode", "off")
|
||||
set_nested(data, "agents.defaults.skipBootstrap", True)
|
||||
set_nested(data, "agents.defaults.sandbox.mode", "off")
|
||||
exec_host = os.environ.get("OPENCLAW_EXEC_HOST", "gateway").strip().lower()
|
||||
if exec_host not in {"auto", "gateway", "sandbox", "node"}:
|
||||
raise SystemExit(f"invalid OPENCLAW_EXEC_HOST={exec_host!r}")
|
||||
set_nested(data, "tools.exec.host", exec_host)
|
||||
set_nested(data, "tools.exec.security", "full")
|
||||
set_nested(data, "tools.exec.ask", "off")
|
||||
set_nested(data, "approvals.exec.enabled", False)
|
||||
model = os.environ.get("SWEEP_MODEL", "").strip()
|
||||
if model:
|
||||
set_nested(data, "agents.defaults.model.primary", model)
|
||||
set_nested(data, "agents.defaults.subagents.model.primary", model)
|
||||
|
||||
tmp_path = cfg_path.with_suffix(".json.tmp")
|
||||
tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
|
||||
tmp_path.replace(cfg_path)
|
||||
|
||||
approvals_path = cfg_path.with_name("exec-approvals.json")
|
||||
approvals = {
|
||||
"version": 1,
|
||||
"socket": {
|
||||
"path": str(approvals_path.with_suffix(".sock")),
|
||||
"token": "container-eval-token",
|
||||
},
|
||||
"defaults": {
|
||||
"security": "full",
|
||||
"ask": "off",
|
||||
"askFallback": "full",
|
||||
},
|
||||
"agents": {
|
||||
"*": {
|
||||
"security": "full",
|
||||
"ask": "off",
|
||||
"askFallback": "full",
|
||||
}
|
||||
},
|
||||
}
|
||||
approvals_path.write_text(json.dumps(approvals, indent=2), encoding="utf-8")
|
||||
PY
|
||||
|
||||
if [ "$SWEEP_ADAPTER" = "hermes" ]; then
|
||||
python - <<'PY'
|
||||
import os
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
|
||||
model = os.environ["SWEEP_MODEL"].strip()
|
||||
base_url = (os.environ.get("HERMES_BASE_URL") or os.environ.get("OPENAI_BASE_URL") or "").strip()
|
||||
|
||||
provider = "custom"
|
||||
effective_model = model
|
||||
aux_base_url = ""
|
||||
aux_api_mode = ""
|
||||
if model.startswith("anthropic/"):
|
||||
provider = "anthropic"
|
||||
elif urlparse(base_url).hostname == "api.openai.com" and model.startswith("openai/"):
|
||||
effective_model = model.split("/", 1)[1]
|
||||
aux_base_url = base_url
|
||||
if effective_model.lower().startswith("gpt-5"):
|
||||
aux_api_mode = "codex_responses"
|
||||
elif base_url:
|
||||
aux_base_url = base_url
|
||||
|
||||
tasks = [
|
||||
"vision",
|
||||
"web_extract",
|
||||
"compression",
|
||||
"session_search",
|
||||
"skills_hub",
|
||||
"approval",
|
||||
"mcp",
|
||||
"title_generation",
|
||||
]
|
||||
|
||||
lines = [
|
||||
"model:",
|
||||
f" provider: {provider}",
|
||||
f" default: {effective_model}",
|
||||
]
|
||||
if aux_base_url:
|
||||
lines.append(f" base_url: {aux_base_url}")
|
||||
if aux_api_mode:
|
||||
lines.append(f" api_mode: {aux_api_mode}")
|
||||
lines.append("auxiliary:")
|
||||
for task in tasks:
|
||||
timeout = 360 if task == "web_extract" else 120 if task in {"vision", "compression"} else 30
|
||||
lines.extend([
|
||||
f" {task}:",
|
||||
" provider: main",
|
||||
f" model: {effective_model}",
|
||||
f" timeout: {timeout}",
|
||||
])
|
||||
if aux_base_url:
|
||||
lines.append(f" base_url: {aux_base_url}")
|
||||
if aux_api_mode:
|
||||
lines.append(f" api_mode: {aux_api_mode}")
|
||||
if task == "session_search":
|
||||
lines.append(" max_concurrency: 1")
|
||||
|
||||
path = Path(os.environ["HERMES_HOME"]) / "config.yaml"
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text("\n".join(lines) + "\n", encoding="utf-8")
|
||||
PY
|
||||
fi
|
||||
|
||||
# Result and log files for this run, all directly inside $SWEEP_LOGDIR.
# File names use the sanitized label (like the model part already does): a raw
# SWEEP_LABEL containing "/" would point at a sub-directory that was never
# created, so every write to these paths would fail.
OUT="$SWEEP_LOGDIR/${safe_label}_${SWEEP_ADAPTER}_${safe_model}_${SWEEP_OUT_TAG}.json"
LOG="$SWEEP_LOGDIR/${safe_label}_${SWEEP_ADAPTER}_${safe_model}_${SWEEP_OUT_TAG}.log"
GWLOG="$SWEEP_LOGDIR/gateway_${safe_label}_${SWEEP_OUT_TAG}.log"
HERMES_AGENT_LOG="$SWEEP_LOGDIR/hermes_agent_${safe_label}_${SWEEP_OUT_TAG}.log"
HERMES_ERROR_LOG="$SWEEP_LOGDIR/hermes_errors_${safe_label}_${SWEEP_OUT_TAG}.log"
|
||||
|
||||
echo "===== CONTAINER ADAPTER EVAL START $(date '+%Y-%m-%d %H:%M:%S') ====="
|
||||
echo "uid: $(id -u) ($(id -un 2>/dev/null || true))"
|
||||
echo "adapter: $SWEEP_ADAPTER"
|
||||
echo "model: $SWEEP_MODEL"
|
||||
echo "runs: $SWEEP_RUNS"
|
||||
echo "execHost: $OPENCLAW_EXEC_HOST"
|
||||
echo "out: $OUT"
|
||||
echo "cache: ${cache_paths[*]}"
|
||||
echo "home: $HOME"
|
||||
echo "state: $OPENCLAW_STATE_DIR"
|
||||
echo "hermes: ${HERMES_HOME:-}"
|
||||
openclaw --version 2>/dev/null || true
|
||||
python - <<'PY' 2>/dev/null || true
|
||||
import os, subprocess
|
||||
repo = os.environ.get("HERMES_AGENT_REPO", "")
|
||||
if repo:
|
||||
try:
|
||||
sha = subprocess.check_output(["git", "-C", repo, "rev-parse", "HEAD"], text=True).strip()
|
||||
print(f"Hermes git: {sha}")
|
||||
except Exception:
|
||||
print(f"Hermes repo: {repo}")
|
||||
PY
|
||||
|
||||
rm -rf "${cache_paths[@]}"
|
||||
rm -f "$OUT" "$LOG"
|
||||
|
||||
GATEWAY_PID=""
|
||||
# Copy Hermes' agent.log and errors.log from $HERMES_HOME/logs to
# $HERMES_AGENT_LOG and $HERMES_ERROR_LOG. Best-effort: a missing file is
# skipped and a failed copy never fails the script.
preserve_hermes_logs() {
  # With HERMES_HOME unset or empty the paths below would collapse to
  # /logs/agent.log and /logs/errors.log at the filesystem root, which are
  # not ours to copy.
  if [ -z "${HERMES_HOME:-}" ]; then
    return 0
  fi
  if [ -f "$HERMES_HOME/logs/agent.log" ]; then
    cp "$HERMES_HOME/logs/agent.log" "$HERMES_AGENT_LOG" 2>/dev/null || true
  fi
  if [ -f "$HERMES_HOME/logs/errors.log" ]; then
    cp "$HERMES_HOME/logs/errors.log" "$HERMES_ERROR_LOG" 2>/dev/null || true
  fi
}
|
||||
|
||||
# Exit handler: save the Hermes logs, stop the gateway if one was started
# (GATEWAY_PID non-empty), then delete $FRESH_HOME. Every step is
# best-effort so the handler itself cannot fail.
cleanup() {
  preserve_hermes_logs
  local gateway_pid="${GATEWAY_PID:-}"
  if [ -n "$gateway_pid" ]; then
    kill "$gateway_pid" 2>/dev/null || true
    # Reap the background job so it is gone before the directory is removed.
    wait "$gateway_pid" 2>/dev/null || true
  fi
  rm -rf "${FRESH_HOME:-}" 2>/dev/null || true
}
|
||||
trap cleanup EXIT
|
||||
|
||||
if [ "$SWEEP_ADAPTER" = "openclaw" ]; then
|
||||
echo "Starting OpenClaw gateway on :18789 ..."
|
||||
HOME="$FRESH_HOME" \
|
||||
OPENCLAW_HOME="$FRESH_HOME" \
|
||||
OPENCLAW_STATE_DIR="$FRESH_STATE" \
|
||||
OPENCLAW_CONFIG_PATH="$FRESH_STATE/openclaw.json" \
|
||||
XDG_CONFIG_HOME="$FRESH_HOME/.config" \
|
||||
openclaw gateway run \
|
||||
--allow-unconfigured \
|
||||
--dev \
|
||||
--bind loopback \
|
||||
--port 18789 \
|
||||
--auth token \
|
||||
--token "$OPENCLAW_GATEWAY_TOKEN" \
|
||||
--compact \
|
||||
> "$GWLOG" 2>&1 &
|
||||
GATEWAY_PID=$!
|
||||
ready=0
|
||||
for i in $(seq 1 180); do
|
||||
if curl -sf -H "Authorization: Bearer $OPENCLAW_GATEWAY_TOKEN" http://127.0.0.1:18789/healthz > /dev/null 2>&1; then
|
||||
echo "Gateway healthy after ${i}s"
|
||||
ready=1
|
||||
break
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
if [ "$ready" -ne 1 ]; then
|
||||
echo "ERROR: gateway failed to become healthy"
|
||||
tail -80 "$GWLOG" 2>/dev/null || true
|
||||
exit 1
|
||||
fi
|
||||
if [ -r "/proc/$GATEWAY_PID/environ" ]; then
|
||||
actual_home="$(tr '\0' '\n' < "/proc/$GATEWAY_PID/environ" | awk -F= '$1 == "HOME" { print $2; exit }')"
|
||||
if [ "$actual_home" != "$FRESH_HOME" ]; then
|
||||
echo "ERROR: gateway HOME escaped container eval home: ${actual_home:-<unset>} != $FRESH_HOME"
|
||||
tail -120 "$GWLOG" 2>/dev/null || true
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
if [ ! -f "$FRESH_STATE/exec-approvals.json" ] || grep -q '/home/node/.openclaw' "$FRESH_STATE/exec-approvals.json"; then
|
||||
echo "ERROR: exec approvals are not isolated in $FRESH_STATE"
|
||||
exit 1
|
||||
fi
|
||||
echo "Waiting for OpenClaw session control plane ..."
|
||||
python - <<'PY'
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
from clawbench.client import GatewayClient, GatewayConfig
|
||||
|
||||
|
||||
async def probe_once(attempt: int) -> None:
    """Create one gateway session and delete it again.

    Connects with the URL and token from OPENCLAW_GATEWAY_URL and
    OPENCLAW_GATEWAY_TOKEN (30 s connect and request timeouts), creates a
    session for SWEEP_MODEL labelled with this process id and ``attempt``,
    then deletes that session. Any exception is left to the caller.
    """
    env = os.environ
    probe_label = f"clawbench-readiness-probe-{os.getpid()}-{attempt}"
    gateway = GatewayConfig(
        url=env["OPENCLAW_GATEWAY_URL"],
        token=env["OPENCLAW_GATEWAY_TOKEN"],
        connect_timeout=30.0,
        request_timeout=30.0,
    )
    async with GatewayClient(gateway) as client:
        session_key = await client.create_session(
            model=env["SWEEP_MODEL"],
            label=probe_label,
        )
        await client.delete_session(session_key)
|
||||
|
||||
|
||||
async def main() -> int:
    """Retry probe_once() until it succeeds or 240 seconds have elapsed.

    Returns:
        0 as soon as one probe completes without raising; 1 when the
        deadline passes, after printing the last error to stderr.
    """
    deadline = time.monotonic() + 240
    attempt = 0
    last_error = ""
    while time.monotonic() < deadline:
        attempt += 1
        try:
            await probe_once(attempt)
        except Exception as exc:
            # Any failure only means "not ready yet"; remember it for the
            # final error message and try again.
            last_error = f"{type(exc).__name__}: {exc}"
            print(f"Gateway control probe {attempt} not ready: {last_error}")
        else:
            print(f"Gateway session control plane ready after {attempt} attempt(s)")
            return 0
        # Never sleep past the deadline: wait at most 5 s, and stop
        # immediately when no time is left.
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        await asyncio.sleep(min(5, remaining))
    print(f"ERROR: gateway session control plane did not become ready: {last_error}", file=sys.stderr)
    return 1
|
||||
|
||||
|
||||
raise SystemExit(asyncio.run(main()))
|
||||
PY
|
||||
if [ "$?" -ne 0 ]; then
|
||||
tail -120 "$GWLOG" 2>/dev/null || true
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
TASK_ARGS=()
|
||||
if [ -n "${CHERRY_TASKS:-}" ]; then
|
||||
IFS=',' read -ra TASK_ARR <<< "$CHERRY_TASKS"
|
||||
for task_id in "${TASK_ARR[@]}"; do
|
||||
TASK_ARGS+=("--task" "$task_id")
|
||||
done
|
||||
fi
|
||||
|
||||
clawbench run \
|
||||
--adapter "$SWEEP_ADAPTER" \
|
||||
--model "$SWEEP_MODEL" \
|
||||
--runs "$SWEEP_RUNS" \
|
||||
--concurrency "$SWEEP_CONCURRENCY" \
|
||||
--browser-concurrency "$SWEEP_BROWSER_CONCURRENCY" \
|
||||
--no-randomize \
|
||||
"${TASK_ARGS[@]}" \
|
||||
--output "$OUT" \
|
||||
> "$LOG" 2>&1
|
||||
status=$?
|
||||
preserve_hermes_logs
|
||||
|
||||
echo "===== clawbench exit=$status $(date '+%Y-%m-%d %H:%M:%S') ====="
|
||||
tail -80 "$LOG" 2>/dev/null || true
|
||||
|
||||
exit "$status"
|
||||
220
scripts/container_lane_eval.sh
Executable file
220
scripts/container_lane_eval.sh
Executable file
@ -0,0 +1,220 @@
|
||||
#!/bin/bash
|
||||
# Run one OpenClaw model/profile through the HF-style isolated lane worker.
|
||||
set -Eeuo pipefail
|
||||
|
||||
: "${SWEEP_MODEL:?SWEEP_MODEL required}"
|
||||
: "${SWEEP_LABEL:?SWEEP_LABEL required}"
|
||||
: "${SWEEP_OUT_TAG:=lane-container}"
|
||||
: "${SWEEP_LANES:=3}"
|
||||
: "${SWEEP_RUNS:=1}"
|
||||
: "${SWEEP_LOGDIR:=/data/results}"
|
||||
: "${CLAWBENCH_PER_RUN_BUDGET_SECONDS:=900}"
|
||||
: "${CLAWBENCH_PER_TURN_TIMEOUT_SECONDS:=300}"
|
||||
: "${OPENCLAW_EXEC_HOST:=gateway}"
|
||||
|
||||
cd /home/node/app
|
||||
export CLAWBENCH_LOCAL_QUEUE_DIR="${CLAWBENCH_LOCAL_QUEUE_DIR:-/data/queue/$SWEEP_LABEL}"
|
||||
mkdir -p "$SWEEP_LOGDIR" /data/results "$CLAWBENCH_LOCAL_QUEUE_DIR" /data/run_cache /data/lane_runtime
|
||||
|
||||
export HF_TOKEN=""
|
||||
export OPENCLAW_GATEWAY_TOKEN="${OPENCLAW_GATEWAY_TOKEN:-local-dev-token-for-testing}"
|
||||
export OPENCLAW_SKIP_GMAIL_WATCHER=1
|
||||
export OPENCLAW_SKIP_CANVAS_HOST=1
|
||||
export OPENCLAW_NO_RESPAWN=1
|
||||
export CLAWBENCH_DISABLE_GATEWAY_DEVICE_IDENTITY=1
|
||||
export CLAWBENCH_PER_RUN_BUDGET_SECONDS
|
||||
export CLAWBENCH_PER_TURN_TIMEOUT_SECONDS
|
||||
export CLAWBENCH_CONNECT_TIMEOUT="${CLAWBENCH_CONNECT_TIMEOUT:-180}"
|
||||
export CLAWBENCH_REQUEST_TIMEOUT="${CLAWBENCH_REQUEST_TIMEOUT:-300}"
|
||||
export CLAWBENCH_GATEWAY_HEALTH_TIMEOUT_SECONDS="${CLAWBENCH_GATEWAY_HEALTH_TIMEOUT_SECONDS:-240}"
|
||||
export CLAWBENCH_LANE_STARTUP_STAGGER_SECONDS="${CLAWBENCH_LANE_STARTUP_STAGGER_SECONDS:-90}"
|
||||
export CLAWBENCH_GATEWAY_READY_MARKER_GRACE_SECONDS="${CLAWBENCH_GATEWAY_READY_MARKER_GRACE_SECONDS:-90}"
|
||||
export CLAWBENCH_KEEP_PARALLEL_LANE_ROOT="${CLAWBENCH_KEEP_PARALLEL_LANE_ROOT:-0}"
|
||||
export CLAWBENCH_PARALLEL_LANE_ROOT="/data/lane_runtime/$SWEEP_LABEL"
|
||||
export CLAWBENCH_TOOL_PROFILE_NAME="${CLAWBENCH_TOOL_PROFILE_NAME:-$SWEEP_LABEL}"
|
||||
export NODE_OPTIONS="${NODE_OPTIONS:-"--max-old-space-size=4096"}"
|
||||
if command -v npm >/dev/null 2>&1; then
|
||||
export NODE_PATH="${NODE_PATH:-$(npm root -g 2>/dev/null || true)}"
|
||||
fi
|
||||
|
||||
SRC_STATE="${OPENCLAW_CONFIG_SOURCE:-/config/openclaw}"
|
||||
if [ ! -d "$SRC_STATE" ]; then
|
||||
SRC_STATE="/home/node/.openclaw"
|
||||
fi
|
||||
|
||||
safe_model="${SWEEP_MODEL//\//_}"
|
||||
safe_model="${safe_model//:/_}"
|
||||
OUT="$SWEEP_LOGDIR/${SWEEP_LABEL}_openclaw_${safe_model}_${SWEEP_OUT_TAG}.json"
|
||||
LOG="$SWEEP_LOGDIR/${SWEEP_LABEL}_openclaw_${safe_model}_${SWEEP_OUT_TAG}.log"
|
||||
export SWEEP_OUTPUT_PATH="$OUT"
|
||||
|
||||
FRESH_HOME="/tmp/openclaw-home-${SWEEP_LABEL}-$$"
|
||||
FRESH_STATE="$FRESH_HOME/.openclaw"
|
||||
rm -rf "$FRESH_HOME" "$CLAWBENCH_PARALLEL_LANE_ROOT"
|
||||
mkdir -p "$FRESH_STATE" "$FRESH_HOME/.config"
|
||||
if [ -f "$SRC_STATE/openclaw.json" ]; then
|
||||
cp "$SRC_STATE/openclaw.json" "$FRESH_STATE/openclaw.json"
|
||||
fi
|
||||
if [ -d "$SRC_STATE/plugins" ]; then
|
||||
mkdir -p "$FRESH_STATE/plugins"
|
||||
cp -R "$SRC_STATE/plugins/." "$FRESH_STATE/plugins/" 2>/dev/null || true
|
||||
fi
|
||||
mkdir -p \
|
||||
"$FRESH_STATE/agents" \
|
||||
"$FRESH_STATE/workspace" \
|
||||
"$FRESH_STATE/logs" \
|
||||
"$FRESH_STATE/memory" \
|
||||
"$FRESH_STATE/cache" \
|
||||
"$FRESH_STATE/identity" \
|
||||
"$FRESH_STATE/devices" \
|
||||
"$FRESH_STATE/tasks" \
|
||||
"$FRESH_STATE/subagents" \
|
||||
"$FRESH_STATE/flows" \
|
||||
"$FRESH_STATE/cron"
|
||||
|
||||
export HOME="$FRESH_HOME"
|
||||
export OPENCLAW_HOME="$FRESH_HOME"
|
||||
export OPENCLAW_STATE_DIR="$FRESH_STATE"
|
||||
export OPENCLAW_CONFIG_PATH="$FRESH_STATE/openclaw.json"
|
||||
export XDG_CONFIG_HOME="$FRESH_HOME/.config"
|
||||
|
||||
python - <<'PY'
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
cfg_path = Path(os.environ["OPENCLAW_CONFIG_PATH"])
|
||||
if not cfg_path.exists():
|
||||
raise SystemExit("missing openclaw.json")
|
||||
data = json.loads(cfg_path.read_text(encoding="utf-8"))
|
||||
|
||||
def set_nested(root, dotted, value):
    """Store ``value`` in ``root`` at the dot-separated key path ``dotted``.

    Every key before the last is descended into; a key that is missing or
    holds something other than a dict is replaced by a new empty dict.
    """
    *parents, leaf = dotted.split(".")
    node = root
    for key in parents:
        nxt = node.get(key)
        if not isinstance(nxt, dict):
            nxt = node[key] = {}
        node = nxt
    node[leaf] = value
|
||||
|
||||
agents = data.setdefault("agents", {})
|
||||
if isinstance(agents, dict):
|
||||
agents["list"] = []
|
||||
|
||||
channels = data.get("channels")
|
||||
if isinstance(channels, dict):
|
||||
for channel in channels.values():
|
||||
if isinstance(channel, dict):
|
||||
channel["enabled"] = False
|
||||
exec_approvals = channel.get("execApprovals")
|
||||
if not isinstance(exec_approvals, dict):
|
||||
exec_approvals = {}
|
||||
channel["execApprovals"] = exec_approvals
|
||||
exec_approvals["enabled"] = False
|
||||
|
||||
plugins = data.setdefault("plugins", {})
|
||||
stale = {"marxbiotech-git-tools", "lab"}
|
||||
allow = plugins.get("allow")
|
||||
if isinstance(allow, list):
|
||||
plugins["allow"] = [item for item in allow if item not in stale]
|
||||
entries = plugins.get("entries")
|
||||
if isinstance(entries, dict):
|
||||
for item in stale:
|
||||
entries.pop(item, None)
|
||||
|
||||
set_nested(data, "browser.headless", True)
|
||||
set_nested(data, "browser.noSandbox", True)
|
||||
set_nested(data, "gateway.reload.mode", "off")
|
||||
set_nested(data, "agents.defaults.skipBootstrap", True)
|
||||
set_nested(data, "agents.defaults.sandbox.mode", "off")
|
||||
set_nested(data, "agents.defaults.model.primary", os.environ["SWEEP_MODEL"])
|
||||
set_nested(data, "agents.defaults.subagents.model.primary", os.environ["SWEEP_MODEL"])
|
||||
set_nested(data, "tools.exec.host", os.environ.get("OPENCLAW_EXEC_HOST", "gateway"))
|
||||
set_nested(data, "tools.exec.security", "full")
|
||||
set_nested(data, "tools.exec.ask", "off")
|
||||
set_nested(data, "approvals.exec.enabled", False)
|
||||
|
||||
cfg_path.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")
|
||||
|
||||
approvals_path = cfg_path.with_name("exec-approvals.json")
|
||||
approvals = {
|
||||
"version": 1,
|
||||
"socket": {
|
||||
"path": str(approvals_path.with_suffix(".sock")),
|
||||
"token": "container-lane-eval-token",
|
||||
},
|
||||
"defaults": {"security": "full", "ask": "off", "askFallback": "full"},
|
||||
"agents": {"*": {"security": "full", "ask": "off", "askFallback": "full"}},
|
||||
}
|
||||
approvals_path.write_text(json.dumps(approvals, indent=2) + "\n", encoding="utf-8")
|
||||
PY
|
||||
|
||||
if [ "${CLAWBENCH_ENABLE_GBRAIN:-0}" = "1" ]; then
|
||||
export CLAWBENCH_LANE_PREPARE_CMD="${CLAWBENCH_LANE_PREPARE_CMD:-/home/node/app/scripts/setup_gbrain_runtime.sh}"
|
||||
"$CLAWBENCH_LANE_PREPARE_CMD"
|
||||
fi
|
||||
|
||||
echo "===== CONTAINER LANE EVAL START $(date '+%Y-%m-%d %H:%M:%S') ====="
|
||||
echo "label: $SWEEP_LABEL"
|
||||
echo "model: $SWEEP_MODEL"
|
||||
echo "runs: $SWEEP_RUNS"
|
||||
echo "lanes: $SWEEP_LANES"
|
||||
echo "tasks: ${SWEEP_TASKS:-${CHERRY_TASKS:-all}}"
|
||||
echo "out: $OUT"
|
||||
echo "log: $LOG"
|
||||
echo "home: $HOME"
|
||||
echo "state: $OPENCLAW_STATE_DIR"
|
||||
openclaw --version 2>/dev/null || true
|
||||
|
||||
set +e
|
||||
python - <<'PY' > "$LOG" 2>&1
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
from clawbench.queue import JobQueue, JobStatus, SubmissionRequest
|
||||
from clawbench.worker import EvalWorker, RESULTS_DIR
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
|
||||
|
||||
async def main() -> int:
    """Submit one job built from the SWEEP_* environment and process it here.

    The local queue is emptied first, then a SubmissionRequest is submitted
    and handed straight to EvalWorker. The final job status is printed as
    JSON.

    Returns:
        0 after the result file has been copied to SWEEP_OUTPUT_PATH;
        1 when the job is missing, not FINISHED, or has no result id.
    """
    env = os.environ

    queue = JobQueue()
    queue._jobs.clear()
    queue._save_local()

    # SWEEP_TASKS wins over CHERRY_TASKS; both are comma-separated id lists.
    raw_ids = env.get("SWEEP_TASKS") or env.get("CHERRY_TASKS") or ""
    task_ids = []
    for piece in raw_ids.split(","):
        if piece.strip():
            task_ids.append(piece.strip())

    request = SubmissionRequest(
        model=env["SWEEP_MODEL"],
        runs_per_task=int(env["SWEEP_RUNS"]),
        max_parallel_lanes=int(env["SWEEP_LANES"]),
        task_ids=task_ids,
        prompt_variant=env.get("SWEEP_PROMPT_VARIANT", "clear"),
        judge_model=env.get("CLAWBENCH_JUDGE_MODEL", ""),
        notes=env.get("SWEEP_LABEL", ""),
    )
    job = await queue.submit(request)
    await EvalWorker(queue)._process_job(job)

    final = await queue.get_status(job.job_id)
    if final:
        summary = final.model_dump()
    else:
        summary = {}
    print(json.dumps(summary, indent=2), flush=True)

    if final is None:
        return 1
    if final.status != JobStatus.FINISHED or not final.result_id:
        return 1

    destination = Path(env["SWEEP_OUTPUT_PATH"])
    destination.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(RESULTS_DIR / f"{final.result_id}.json", destination)
    return 0
|
||||
|
||||
raise SystemExit(asyncio.run(main()))
|
||||
PY
|
||||
status=$?
|
||||
set -e
|
||||
|
||||
echo "===== lane eval exit=$status $(date '+%Y-%m-%d %H:%M:%S') ====="
|
||||
tail -120 "$LOG" 2>/dev/null || true
|
||||
exit "$status"
|
||||
@ -1,98 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Minimal single-model sweep — 1 run per task (not 3) for fast validation.
|
||||
# Used to quickly test if an openrouter-stream fix actually works without
|
||||
# committing to a full 60-minute 3-run sweep.
|
||||
#
|
||||
# Invocation (from host):
|
||||
# docker run -d --name clawbench-<LABEL> \
|
||||
# -e SWEEP_LABEL=<label> -e SWEEP_MODEL=<routed-model> \
|
||||
# -e SWEEP_PROFILE=<abs-profile-path> \
|
||||
# -e SWEEP_LOGDIR=<output-dir-in-container> \
|
||||
# -e SWEEP_OUT_TAG=<tag> \
|
||||
# -v .../scripts:/home/node/app/scripts:ro \
|
||||
# -v .../data:/data \
|
||||
# -v .../data/container-home-openclaw:/home/node/.openclaw \
|
||||
# -v .../profiles:/home/node/app/profiles:ro \
|
||||
# --memory 8g \
|
||||
# <image> \
|
||||
# bash /home/node/app/scripts/container_sweep_minimal.sh
|
||||
|
||||
set -u
|
||||
|
||||
: "${SWEEP_LABEL:?SWEEP_LABEL required}"
|
||||
: "${SWEEP_MODEL:?SWEEP_MODEL required}"
|
||||
: "${SWEEP_PROFILE:?SWEEP_PROFILE required}"
|
||||
: "${SWEEP_LOGDIR:?SWEEP_LOGDIR required}"
|
||||
: "${SWEEP_OUT_TAG:?SWEEP_OUT_TAG required}"
|
||||
|
||||
cd /data
|
||||
mkdir -p "$SWEEP_LOGDIR"
|
||||
|
||||
export OPENCLAW_GATEWAY_TOKEN="local-dev-token-for-testing"
|
||||
export CLAWBENCH_RUN_CACHE_DIR="/data/run_cache"
|
||||
mkdir -p "$CLAWBENCH_RUN_CACHE_DIR"
|
||||
export NODE_OPTIONS="--max-old-space-size=4096"
|
||||
|
||||
# Clear cache for target model.
# The cache subdirectory is derived from the routed model id by replacing
# every "/" with "_" (openrouter/z-ai/glm-5.1 -> openrouter_z-ai_glm-5.1),
# which is the rule every entry of the former hard-coded table followed.
# Deriving it means a model that was never added to a table still gets its
# stale cache cleared instead of silently replaying cached runs.
# NOTE(review): assumes clawbench names cache dirs this way for all models — confirm.
CACHE_SUB="${SWEEP_MODEL//\//_}"
case "$CACHE_SUB" in
  .|..) CACHE_SUB="" ;;  # never let rm -rf resolve to the cache root or its parent
esac
if [ -n "$CACHE_SUB" ] && [ -d "$CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB" ]; then
  echo "clearing cache: $CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB"
  rm -rf "$CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB"
fi
|
||||
|
||||
OUT="$SWEEP_LOGDIR/docker_${SWEEP_LABEL}_${SWEEP_OUT_TAG}.json"
|
||||
LOG="$SWEEP_LOGDIR/docker_${SWEEP_LABEL}_${SWEEP_OUT_TAG}.log"
|
||||
GWLOG="$SWEEP_LOGDIR/gateway_${SWEEP_LABEL}.log"
|
||||
|
||||
rm -f "$OUT"
|
||||
|
||||
echo "===== MINIMAL SWEEP START $(date '+%Y-%m-%d %H:%M:%S') ====="
|
||||
echo "label: $SWEEP_LABEL"
|
||||
echo "model: $SWEEP_MODEL"
|
||||
echo "profile: $SWEEP_PROFILE"
|
||||
echo "out: $OUT"
|
||||
echo "runs: 1 per task (MINIMAL)"
|
||||
|
||||
echo "Starting gateway on :18789 (heap=4GB) ..."
openclaw gateway --port 18789 > "$GWLOG" 2>&1 &
GATEWAY_PID=$!

# Poll the authenticated health endpoint once per second, up to 120 tries.
ready=0
for i in $(seq 1 120); do
  if curl -sf -H "Authorization: Bearer $OPENCLAW_GATEWAY_TOKEN" http://127.0.0.1:18789/health > /dev/null 2>&1; then
    echo "Gateway healthy after ${i}s"
    ready=1
    break
  fi
  sleep 1
done
if [ "$ready" -ne 1 ]; then
  echo "ERROR: gateway failed to come up"
  # Show why it failed and do not leave the background gateway running.
  tail -30 "$GWLOG" 2>/dev/null || true
  kill "$GATEWAY_PID" 2>/dev/null
  wait "$GATEWAY_PID" 2>/dev/null
  exit 1
fi
|
||||
|
||||
echo "===== $(date '+%H:%M:%S') starting $SWEEP_LABEL ($SWEEP_MODEL) ====="
|
||||
clawbench run \
|
||||
--model "$SWEEP_MODEL" \
|
||||
--runs 1 \
|
||||
--concurrency 4 \
|
||||
--profile "$SWEEP_PROFILE" \
|
||||
--judge-model "anthropic/claude-sonnet-4-6" \
|
||||
-o "$OUT" \
|
||||
> "$LOG" 2>&1
|
||||
status=$?
|
||||
|
||||
echo "===== $(date '+%H:%M:%S') done $SWEEP_LABEL (exit $status) ====="
|
||||
|
||||
# Archive the cache for future audits.
# Sourcing and running are kept as separate steps: with `a && b || c` a
# failing archive_run_cache was reported as "helper missing".
# shellcheck disable=SC1091
if source "$(dirname "$0")/_archive_cache.sh" 2>/dev/null; then
  archive_run_cache || echo "[archive] archive_run_cache failed (exit $?)"
else
  echo "[archive] helper missing, skipping"
fi

kill "$GATEWAY_PID" 2>/dev/null
wait "$GATEWAY_PID" 2>/dev/null
# Propagate the clawbench exit code, not the archive/shutdown result.
exit "$status"
|
||||
@ -1,175 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Single-model sweep with fresh gateway + bumped Node heap to prevent OOM.
|
||||
#
|
||||
# Invocation (from host):
|
||||
# docker run -d --name clawbench-sweep-<LABEL> \
|
||||
# -e SWEEP_LABEL=<label> -e SWEEP_MODEL=<routed-model> -e SWEEP_PROFILE=<abs-profile-path> \
|
||||
# -v .../scripts:/home/node/app/scripts:ro \
|
||||
# -v .../data:/data \
|
||||
# -v .../data/container-home-openclaw:/home/node/.openclaw \
|
||||
# -v .../profiles:/home/node/app/profiles:ro \
|
||||
# --memory 8g \
|
||||
# clawbench-clawbench:latest \
|
||||
# bash /home/node/app/scripts/container_sweep_single.sh
|
||||
#
|
||||
# Differences vs container_sweep.sh:
|
||||
# - Bumps gateway Node.js heap via NODE_OPTIONS=--max-old-space-size=4096 (prevents 2GB OOM we saw at ~4h)
|
||||
# - One model per container (no shared-gateway drift between models)
|
||||
# - Force-clears run_cache for THIS model before running (prevents cache-replay masking)
|
||||
# - Writes to the same $LOGDIR/docker_${label}_${SWEEP_OUT_TAG}.json as the original sweep
|
||||
# so generate_drift_report.py picks it up without changes
|
||||
|
||||
set -u
|
||||
|
||||
: "${SWEEP_LABEL:?SWEEP_LABEL required (e.g. glm, minimax, kimi)}"
|
||||
: "${SWEEP_MODEL:?SWEEP_MODEL required (e.g. openrouter/z-ai/glm-5.1)}"
|
||||
: "${SWEEP_PROFILE:?SWEEP_PROFILE required (absolute path in container)}"
|
||||
|
||||
# Optional overrides (defaults target the v4.14 drift sweep):
|
||||
# SWEEP_LOGDIR — where JSONs and logs go (default /data/drift_2026-04-14)
|
||||
# SWEEP_OUT_TAG — tag embedded in output filename (default v2026-4-14)
|
||||
: "${SWEEP_LOGDIR:=/data/drift_2026-04-14}"
|
||||
: "${SWEEP_OUT_TAG:=v2026-4-14}"
|
||||
|
||||
cd /data
|
||||
|
||||
LOGDIR="$SWEEP_LOGDIR"
|
||||
mkdir -p "$LOGDIR"
|
||||
|
||||
export OPENCLAW_GATEWAY_TOKEN="local-dev-token-for-testing"
|
||||
export CLAWBENCH_RUN_CACHE_DIR="/data/run_cache"
|
||||
mkdir -p "$CLAWBENCH_RUN_CACHE_DIR"
|
||||
|
||||
# OOM fix: give the gateway Node process a 4GB old-space ceiling instead of the default ~2GB.
|
||||
# Scoped via env so we don't stomp on other Node processes (clawbench itself is python).
|
||||
export NODE_OPTIONS="--max-old-space-size=4096"
|
||||
|
||||
# State-dir isolation: the shared /home/node/.openclaw mount accumulates cruft
|
||||
# across sweeps (agents/, workspace/, logs/, memory/, stale openclaw.json.*.tmp)
|
||||
# which triggers gateway hot-reload churn and cascading `RPC agents.create timed
|
||||
# out after 60s` failures. Give each sweep a pristine state dir that carries
|
||||
# over only the config (openclaw.json, identity/, devices/, exec-approvals.json,
|
||||
# tasks/, subagents/, flows/, cron/) and leaves runtime state empty.
|
||||
SRC_STATE="/home/node/.openclaw"
|
||||
FRESH_STATE="/tmp/openclaw-state-${SWEEP_LABEL}-$$"
|
||||
echo "[state-isolate] cloning config from $SRC_STATE to $FRESH_STATE"
|
||||
mkdir -p "$FRESH_STATE"
|
||||
# Copy the main config (skip the .tmp/.bak/.clobbered/.pre-* cruft that can
|
||||
# confuse the loader — only the canonical openclaw.json is needed).
|
||||
if [ -f "$SRC_STATE/openclaw.json" ]; then
|
||||
cp "$SRC_STATE/openclaw.json" "$FRESH_STATE/openclaw.json"
|
||||
fi
|
||||
if [ -f "$SRC_STATE/exec-approvals.json" ]; then
|
||||
cp "$SRC_STATE/exec-approvals.json" "$FRESH_STATE/exec-approvals.json"
|
||||
fi
|
||||
# Carry over static config dirs — these are read-mostly and don't accumulate
|
||||
# per-run cruft. SKIP: agents/ workspace*/ logs/ memory/ cache/ browser/ canvas/
|
||||
# which all grow unboundedly across sweeps.
|
||||
for d in identity devices tasks subagents flows cron; do
|
||||
if [ -d "$SRC_STATE/$d" ]; then
|
||||
cp -r "$SRC_STATE/$d" "$FRESH_STATE/$d"
|
||||
fi
|
||||
done
|
||||
# Ensure runtime dirs exist but are empty
|
||||
mkdir -p "$FRESH_STATE/agents" "$FRESH_STATE/workspace" "$FRESH_STATE/logs" "$FRESH_STATE/memory" "$FRESH_STATE/cache"
|
||||
export OPENCLAW_STATE_DIR="$FRESH_STATE"
|
||||
echo "[state-isolate] OPENCLAW_STATE_DIR=$OPENCLAW_STATE_DIR"
|
||||
du -sh "$FRESH_STATE" 2>/dev/null | sed 's/^/[state-isolate] size: /'
|
||||
|
||||
# Map model -> cache subdir.
# Every entry of the former hard-coded table was the routed model id with
# each "/" replaced by "_" (anthropic/claude-opus-4-7 ->
# anthropic_claude-opus-4-7), so derive it. A model missing from the table
# used to get CACHE_SUB="" and its stale cache was never cleared.
# (kimi-k2.6 was left off the old table because the openclaw version under
# test did not support it yet.)
# NOTE(review): assumes clawbench names cache dirs this way for all models — confirm.
CACHE_SUB="${SWEEP_MODEL//\//_}"
case "$CACHE_SUB" in
  .|..) CACHE_SUB="" ;;  # never let a later rm -rf resolve to the cache root or its parent
esac
|
||||
|
||||
OUT="$LOGDIR/docker_${SWEEP_LABEL}_${SWEEP_OUT_TAG}.json"
|
||||
LOG="$LOGDIR/docker_${SWEEP_LABEL}_${SWEEP_OUT_TAG}.log"
|
||||
GWLOG="$LOGDIR/gateway_${SWEEP_LABEL}.log"
|
||||
|
||||
echo "===== SINGLE-MODEL SWEEP START $(date '+%Y-%m-%d %H:%M:%S') ====="
|
||||
echo "label: $SWEEP_LABEL"
|
||||
echo "model: $SWEEP_MODEL"
|
||||
echo "profile: $SWEEP_PROFILE"
|
||||
echo "out: $OUT"
|
||||
echo "gwlog: $GWLOG"
|
||||
echo "NODE_OPTIONS: $NODE_OPTIONS"
|
||||
|
||||
# Force-clear this model's run_cache so we actually re-run (no replays)
|
||||
if [ -n "$CACHE_SUB" ] && [ -d "$CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB" ]; then
|
||||
echo "clearing cache: $CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB"
|
||||
rm -rf "$CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB"
|
||||
fi
|
||||
|
||||
# Also remove any stale result JSON so we don't skip-on-idempotence
|
||||
if [ -f "$OUT" ]; then
|
||||
echo "removing stale result: $OUT"
|
||||
rm -f "$OUT"
|
||||
fi
|
||||
|
||||
# Start gateway with bumped heap
echo "Starting gateway on :18789 (heap=4GB) ..."
openclaw gateway --port 18789 > "$GWLOG" 2>&1 &
GATEWAY_PID=$!
echo "gateway pid=$GATEWAY_PID"

# Poll the authenticated health endpoint once per second, up to 120 tries.
ready=0
for i in $(seq 1 120); do
  if curl -sf -H "Authorization: Bearer $OPENCLAW_GATEWAY_TOKEN" http://127.0.0.1:18789/health > /dev/null 2>&1; then
    echo "Gateway healthy after ${i}s"
    ready=1
    break
  fi
  sleep 1
done
if [ "$ready" -ne 1 ]; then
  echo "ERROR: gateway failed to come up within 120s"
  tail -30 "$GWLOG"
  # This early exit skips the normal shutdown at the end of the script, so
  # stop the background gateway and drop the isolated state dir here.
  kill "$GATEWAY_PID" 2>/dev/null
  wait "$GATEWAY_PID" 2>/dev/null
  if [ -n "${FRESH_STATE:-}" ] && [ -d "$FRESH_STATE" ]; then
    rm -rf "$FRESH_STATE"
  fi
  exit 1
fi
|
||||
|
||||
echo "===== $(date '+%H:%M:%S') starting $SWEEP_LABEL ($SWEEP_MODEL) ====="
|
||||
clawbench run \
|
||||
--model "$SWEEP_MODEL" \
|
||||
--runs 3 \
|
||||
--concurrency 4 \
|
||||
--profile "$SWEEP_PROFILE" \
|
||||
--judge-model "anthropic/claude-sonnet-4-6" \
|
||||
-o "$OUT" \
|
||||
> "$LOG" 2>&1
|
||||
status=$?
|
||||
|
||||
if [ $status -eq 0 ]; then
|
||||
echo "===== $(date '+%H:%M:%S') done $SWEEP_LABEL (exit 0) ====="
|
||||
else
|
||||
echo "===== $(date '+%H:%M:%S') FAILED $SWEEP_LABEL (exit $status) ====="
|
||||
tail -20 "$LOG"
|
||||
fi
|
||||
|
||||
# Archive the cache for future audits (preserves transcripts per sweep tag).
# Sourcing and running are kept as separate steps: with `a && b || c` a
# failing archive_run_cache was reported as "helper missing".
# shellcheck disable=SC1091
if source "$(dirname "$0")/_archive_cache.sh" 2>/dev/null; then
  archive_run_cache || echo "[archive] archive_run_cache failed (exit $?)"
else
  echo "[archive] helper missing, skipping"
fi
|
||||
|
||||
echo ""
|
||||
echo "===== SINGLE-MODEL SWEEP END $(date '+%Y-%m-%d %H:%M:%S') ====="
|
||||
kill $GATEWAY_PID 2>/dev/null
|
||||
wait $GATEWAY_PID 2>/dev/null
|
||||
echo "gateway stopped"
|
||||
|
||||
# Clean up the isolated state dir (don't accumulate /tmp cruft across sweeps).
|
||||
if [ -n "${FRESH_STATE:-}" ] && [ -d "$FRESH_STATE" ]; then
|
||||
echo "[state-isolate] removing $FRESH_STATE"
|
||||
rm -rf "$FRESH_STATE"
|
||||
fi
|
||||
|
||||
exit $status
|
||||
@ -1,254 +0,0 @@
|
||||
"""Fair 9-model comparison report generator for the v2026-4-19 full sweep.
|
||||
|
||||
Reads the per-run archive at data/run_cache_archive/<tag>/<cache_sub>/<task>/runN.json
|
||||
and computes, per model:
|
||||
- Coverage % (archived runs / 120)
|
||||
- Overall mean, clean mean (excl. infra-zeros), coverage-normalized score
|
||||
- Per-tier mean (tier1-5)
|
||||
- Judge-infra failures remaining (should be 0 after rejudge pass)
|
||||
|
||||
Writes markdown to reports/EVAL_REPORT_9MODEL_FAIR_<tag>.md.
|
||||
|
||||
Usage:
|
||||
python3 scripts/generate_fair_report.py \\
|
||||
--tag v2026-4-19-full \\
|
||||
[--out reports/EVAL_REPORT_9MODEL_FAIR_v2026-4-19-full.md]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from statistics import mean
|
||||
|
||||
ROOT = Path(__file__).resolve().parent.parent
|
||||
|
||||
MODEL_MAP = {
|
||||
"opus47": ("anthropic_claude-opus-4-7", "Claude Opus 4.7"),
|
||||
"opus46": ("anthropic_claude-opus-4-6", "Claude Opus 4.6"),
|
||||
"sonnet46": ("anthropic_claude-sonnet-4-6", "Claude Sonnet 4.6"),
|
||||
"gpt54": ("openai_gpt-5.4", "GPT 5.4"),
|
||||
"gemini": ("google_gemini-3.1-pro-preview", "Gemini 3.1 Pro"),
|
||||
"glm": ("openrouter_z-ai_glm-5.1", "GLM 5.1"),
|
||||
"minimax": ("openrouter_minimax_minimax-m2.7", "MiniMax M2.7"),
|
||||
"kimi25": ("openrouter_moonshotai_kimi-k2.5", "Kimi K2.5"),
|
||||
"qwen": ("openrouter_qwen_qwen3.6-plus", "Qwen 3.6 Plus"),
|
||||
}
|
||||
|
||||
JUDGE_INFRA_PHRASES = [
|
||||
"gateway is restarting", "judge execution failed", "judge failed to run",
|
||||
"judge call failed", "judge timed out",
|
||||
]
|
||||
|
||||
|
||||
def tier_of(task_id: str) -> str:
    """Return ``"tierN"`` for ids that start with ``tN-`` (one digit), else ``"other"``."""
    found = re.match(r"t(\d)-", task_id)
    if found is None:
        return "other"
    return "tier" + found.group(1)
|
||||
|
||||
|
||||
def scan_archive(cache_dir: Path) -> list[dict]:
    """Collect one summary row per ``run*.json`` file under ``cache_dir``.

    ``cache_dir`` is expected to hold one sub-directory per task; directory
    and file names are visited in sorted order. Files that cannot be read,
    are not valid JSON, or whose top level is not an object are skipped.

    Args:
        cache_dir: Archive directory for one model. A missing directory
            yields an empty list.

    Returns:
        A list of dicts with the keys ``task``, ``tier``, ``run_score``,
        ``c``, ``t``, ``b`` (completion / trajectory / behavior scores),
        ``j`` (judge score, or None when the judge was not enabled),
        ``judge_infra``, ``rejudged`` and ``is_infra_zero``.
    """

    def as_dict(value) -> dict:
        # A result section may be absent or explicitly null in the JSON.
        return value if isinstance(value, dict) else {}

    def score_of(section) -> float:
        # Missing section, missing score and null score all count as 0.
        score = as_dict(section).get("score")
        return 0 if score is None else score

    rows: list[dict] = []
    if not cache_dir.exists():
        return rows
    for tdir in sorted(cache_dir.iterdir()):
        if not tdir.is_dir():
            continue
        for rf in sorted(tdir.glob("run*.json")):
            try:
                # Read as UTF-8 explicitly instead of the locale default.
                d = json.loads(rf.read_text(encoding="utf-8"))
            except (OSError, ValueError):
                # Unreadable or malformed file (JSON and Unicode decode
                # errors are ValueError subclasses): leave it out.
                continue
            if not isinstance(d, dict):
                continue
            jr = as_dict(d.get("judge_result"))
            reason = (jr.get("reason") or "").lower()
            # A judge failure still pending rejudge: judge enabled, not yet
            # rejudged, and the reason names an infra problem, an error is
            # recorded, or there is no reason together with a zero score.
            judge_infra = (
                jr.get("enabled")
                and "rejudged_at" not in jr
                and (
                    any(p in reason for p in JUDGE_INFRA_PHRASES)
                    or jr.get("error")
                    or (not reason.strip() and jr.get("score", 0) == 0)
                )
            )
            run_score = d.get("run_score")
            if run_score is None:
                run_score = 0
            rows.append({
                "task": tdir.name,
                "tier": tier_of(tdir.name),
                "run_score": run_score,
                "c": score_of(d.get("completion_result")),
                "t": score_of(d.get("trajectory_result")),
                "b": score_of(d.get("behavior_result")),
                "j": jr.get("score", 0) if jr.get("enabled") else None,
                "judge_infra": bool(judge_infra),
                "rejudged": "rejudged_at" in jr,
                "is_infra_zero": run_score < 0.01,
            })
    return rows
|
||||
|
||||
|
||||
def summarize(label: str, cache_sub: str, pretty: str, tag: str, *, expected_runs: int = 120) -> dict:
    """Aggregate the archived runs of one model into a summary dict.

    Rows come from ``scan_archive`` on
    ``ROOT/data/run_cache_archive/<tag>/<cache_sub>``.

    Args:
        label: Short model key, copied into the result.
        cache_sub: Archive sub-directory name for the model.
        pretty: Display name, copied into the result.
        tag: Sweep tag selecting the archive.
        expected_runs: Number of runs a complete sweep contains; the
            denominator for coverage, ``cov_norm`` and ``missing``.

    Returns:
        When no rows were found, only ``label``, ``pretty``, ``n`` (0) and
        ``missing``. Otherwise additionally ``n_clean``, ``coverage_pct``
        (share of runs with ``is_infra_zero`` false, out of
        ``expected_runs``), ``overall`` (mean over all rows), ``clean``
        (mean over clean rows), ``cov_norm`` (sum of clean scores divided
        by ``expected_runs``), ``tier_means``, ``judge_mean``,
        ``c_pass_count``, ``judge_infra_remaining`` and ``rejudged``.

    Raises:
        ValueError: If ``expected_runs`` is not positive.
    """
    if expected_runs <= 0:
        raise ValueError(f"expected_runs must be positive, got {expected_runs}")

    cache_dir = ROOT / "data" / "run_cache_archive" / tag / cache_sub
    rows = scan_archive(cache_dir)
    n = len(rows)
    if n == 0:
        return {"label": label, "pretty": pretty, "n": 0, "missing": expected_runs}

    all_scores = [r["run_score"] for r in rows]
    clean_rows = [r for r in rows if not r["is_infra_zero"]]
    clean_scores = [r["run_score"] for r in clean_rows]
    overall = mean(all_scores) if all_scores else 0
    clean = mean(clean_scores) if clean_scores else 0
    cov_norm = sum(clean_scores) / expected_runs
    coverage_pct = 100.0 * len(clean_rows) / expected_runs

    per_tier = defaultdict(list)
    for r in rows:
        per_tier[r["tier"]].append(r["run_score"])
    tier_means = {t: mean(v) for t, v in per_tier.items() if v}

    # Judge-only score; rows where the judge was not enabled carry None.
    judge_scores = [r["j"] for r in rows if r["j"] is not None]
    judge_mean = mean(judge_scores) if judge_scores else None

    # C=1.0 pass count; a null completion score counts as a non-pass
    # instead of raising on the comparison.
    c_pass_count = sum(1 for r in rows if (r["c"] or 0) >= 0.9999)

    return {
        "label": label,
        "pretty": pretty,
        "n": n,
        "missing": max(0, expected_runs - n),
        "n_clean": len(clean_rows),
        "coverage_pct": coverage_pct,
        "overall": overall,
        "clean": clean,
        "cov_norm": cov_norm,
        "tier_means": tier_means,
        "judge_mean": judge_mean,
        "c_pass_count": c_pass_count,
        "judge_infra_remaining": sum(1 for r in rows if r["judge_infra"]),
        "rejudged": sum(1 for r in rows if r["rejudged"]),
    }
|
||||
|
||||
|
||||
def build_markdown(summaries: list[dict], tag: str) -> str:
    """Render the cross-model comparison report as Markdown text.

    Entries with ``n == 0`` are dropped and the rest are ordered by
    descending ``clean`` score (the caller's list is not reordered). The
    headline wording, fairness bullets, glossary and caveats are fixed
    literal text; only the three tables are derived from ``summaries``.

    Args:
        summaries: Summary dicts. Each kept entry must provide ``pretty``,
            ``n``, ``clean``, ``missing`` and ``rejudged``; ``judge_mean``,
            ``c_pass_count`` and ``tier_means`` are optional.
        tag: Sweep tag interpolated into the report title.

    Returns:
        The complete document, terminated by a newline.
    """
    ranked = [entry for entry in summaries if entry["n"] > 0]
    ranked.sort(key=lambda entry: -entry.get("clean", 0))

    out = [
        f"# ClawBench Fair 9-Model Comparison — {tag}",
        "",
        "All 9 models at 120/120 coverage after gap-fill. Rankings use",
        "**clean mean run_score** — mean across all 120 archived runs per model.",
        "",
        "## Ranking (clean mean run_score, 0–1 scale)",
        "",
        "| Rank | Model | Clean | Judge-only | C=1.0 tasks | Coverage |",
        "|---:|---|---:|---:|---:|---:|",
    ]
    for position, entry in enumerate(ranked, 1):
        if entry.get("judge_mean") is None:
            judge_cell = "—"
        else:
            judge_cell = f"{entry['judge_mean']:.3f}"
        passes = entry.get("c_pass_count", 0)
        out.append(
            f"| {position} | **{entry['pretty']}** | **{entry['clean']:.4f}** | "
            f"{judge_cell} | {passes}/{entry['n']} | {entry['n']}/120 |"
        )
    out.append("")

    out.extend([
        "## Fairness audit — passed",
        "",
        "All 9 models subjected to **identical** evaluation conditions:",
        "",
        "- **Same 40 tasks × 3 runs = 120 expected runs per model** (all from v4-19-full sweep)",
        "- **Same completion/trajectory/behavior verifiers** for every model",
        "- **Same Docker image** (openclaw 2026-04-16 baseline)",
        "- **Same judge model** (Claude Sonnet 4.6)",
        "- **Judge infra failures all rejudged** via direct Anthropic API (0 left)",
        "- **Coverage parity**: 97-99% across all models (within ~3 runs)",
        "",
    ])

    # Coverage table
    out.extend([
        "### Coverage detail",
        "",
        "| Model | Archived | Missing | Rejudged via API |",
        "|---|---:|---:|---:|",
    ])
    out.extend(
        f"| {entry['pretty']} | {entry['n']}/120 | {entry['missing']} | {entry['rejudged']} |"
        for entry in ranked
    )
    out.append("")

    # Per-tier table: tiers without data are shown as an em dash.
    out.extend([
        "## Per-tier mean run_score",
        "",
        "| Model | Tier 1 | Tier 2 | Tier 3 | Tier 4 | Tier 5 |",
        "|---|---:|---:|---:|---:|---:|",
    ])
    tier_keys = ("tier1", "tier2", "tier3", "tier4", "tier5")
    for entry in ranked:
        means = entry.get("tier_means", {})
        cells = [entry["pretty"]]
        cells.extend(f"{means[key]:.3f}" if key in means else "—" for key in tier_keys)
        out.append("| " + " | ".join(cells) + " |")
    out.append("")

    # Legend
    out.extend([
        "## Glossary",
        "",
        "- **Cov-norm**: `clean_sum / 120`. Missing runs count as 0.",
        " This is the single fair comparison number — it penalizes both",
        " low scores AND infra-related missing runs.",
        "- **Clean**: Mean run_score across archived runs (excludes infra-zeros).",
        " Shows capability ceiling ignoring infra flakiness.",
        "- **Judge-only**: Mean LLM-judge score (0-1 from Claude Sonnet 4.6).",
        " Independent second opinion on quality, used when deterministic",
        " verifiers can't capture nuance.",
        "- **Cov%**: Fraction of 120 runs that produced a non-infra outcome.",
        "- **run_score**: Weighted combination — when deterministic verifiers",
        " pass (C≥0.9999): `0.4·C + 0.3·T + 0.2·B + 0.1·J`. Else, judge excluded,",
        " renormalized over C/T/B.",
        "",
    ])

    # Caveats
    out.extend([
        "## Caveats",
        "",
        "- **Missing runs** (1-3 per model) were infra failures that never",
        " wrote to cache. Treated as 0 in cov-norm (penalizes the model).",
        "- **Some tasks have strict verifiers** that require specific file",
        " artifacts. All models face the same verifier, so the comparison",
        " is internally fair even where individual verifier scores feel low.",
        "- **Judge scores come from a single judge model** (Sonnet 4.6). Judge",
        " bias toward its own family is possible but small at 10% weight.",
        "- **Ranking gaps of <0.02 cov-norm are within run-to-run noise**.",
        " Treat models within the top cluster as roughly equivalent.",
        "",
    ])

    return "\n".join(out) + "\n"
|
||||
|
||||
|
||||
def main():
    """CLI entry point: write the Markdown report and print a console ranking.

    Options:
        --tag      Archive tag to summarize (required).
        --out      Output path; defaults to
                   ``ROOT / "reports" / "EVAL_REPORT_9MODEL_FAIR_<tag>.md"``.
        --exclude  Comma-separated model labels to leave out.

    The report file is ordered by ``build_markdown`` (clean score), while the
    console table printed here is ordered by ``cov_norm``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--tag", required=True)
    ap.add_argument("--out", type=Path, default=None)
    ap.add_argument("--exclude", default="", help="comma-separated model labels to exclude")
    args = ap.parse_args()

    excluded = {x.strip() for x in args.exclude.split(",") if x.strip()}
    summaries = [
        summarize(label, sub, pretty, args.tag)
        for label, (sub, pretty) in MODEL_MAP.items()
        if label not in excluded
    ]

    out_path = args.out or (ROOT / "reports" / f"EVAL_REPORT_9MODEL_FAIR_{args.tag}.md")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # The report contains non-ASCII characters (—, ×, ≥, ·); pin the encoding
    # so the write does not depend on the platform's default locale.
    out_path.write_text(build_markdown(summaries, args.tag), encoding="utf-8")
    print(f"Wrote: {out_path}")

    # Console table: only models with at least one archived run.
    present = [s for s in summaries if s["n"] > 0]
    present.sort(key=lambda s: -s.get("cov_norm", 0))
    print()
    print(f"{'Rank':>4} {'Model':<20} {'Runs':>7} {'Cov%':>5} {'CovNorm':>8} {'Clean':>7} {'Judge':>6}")
    print("-" * 66)
    for i, s in enumerate(present, 1):
        jm = f"{s['judge_mean']:.3f}" if s.get("judge_mean") is not None else "—"
        print(
            f"{i:>4} {s['pretty']:<20} {s['n']}/120 {s['coverage_pct']:>4.0f}% "
            f"{s['cov_norm']:>8.4f} {s['clean']:>7.4f} {jm:>6}"
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
22
scripts/infra_log_gate.sh
Executable file
22
scripts/infra_log_gate.sh
Executable file
@ -0,0 +1,22 @@
|
||||
#!/bin/bash
# Fail if a ClawBench/OpenClaw run directory contains infra-level failures.
#
# Usage: infra_log_gate.sh <log-dir>
# Exit status:
#   0  no infra signature found
#   1  at least one infra signature found (first 80 matches go to stderr)
#   2  the directory is missing or the scan itself could not be completed

set -u

dir="${1:?usage: infra_log_gate.sh <log-dir>}"

if [ ! -d "$dir" ]; then
  echo "[infra-gate] missing log directory: $dir" >&2
  exit 2
fi

pattern="no longer exists|env_unavailable|environment_unavailable|REJECTED|Traceback|model_not_allowed|model not allowed|not allowed|WebSocket closed|API key|billing|Insufficient|sessions.create.*✗|Gateway .*timed out|control-plane.*timed out|connect.*timed out|RPC .*timed out|agents.create timed out|sessions.create.*timed out"

# Prefer ripgrep, fall back to grep. Keep the scanner's exit status: both
# tools return 0 on a match, 1 on no match and 2 on error. Discarding it
# (as "|| true" did) made a missing or failing scanner look like a clean run.
if command -v rg >/dev/null 2>&1; then
  matches="$(rg -n -e "$pattern" -- "$dir" 2>/dev/null)"
  status=$?
elif command -v grep >/dev/null 2>&1; then
  matches="$(grep -rnE -e "$pattern" -- "$dir" 2>/dev/null)"
  status=$?
else
  echo "[infra-gate] neither rg nor grep is available; cannot scan $dir" >&2
  exit 2
fi

if [ -n "$matches" ]; then
  echo "[infra-gate] infra-level signatures found in $dir" >&2
  printf '%s\n' "$matches" | head -80 >&2
  exit 1
fi

# No matches, but the scanner reported an error: fail closed.
if [ "$status" -gt 1 ]; then
  echo "[infra-gate] log scan failed (exit $status) for $dir" >&2
  exit 2
fi

echo "[infra-gate] clean: $dir"
|
||||
@ -1,289 +0,0 @@
|
||||
"""Re-judge ALL judge-infra-failure runs across all models in a drift sweep dir.
|
||||
|
||||
Fixes: 'Gateway is restarting', 'Judge execution failed', empty-reason 0-score
|
||||
judge results by re-running the judge via direct Anthropic API calls (bypassing
|
||||
the gateway that was failing in the first place).
|
||||
|
||||
Updates:
|
||||
- data/run_cache_archive/<sweep_tag>/<model>/<task>/runN.json (in place)
|
||||
- data/drift_*/docker_<label>_<tag>.json (aggregates)
|
||||
|
||||
Usage:
|
||||
python3 scripts/rejudge_all.py \
|
||||
--drift-dir data/drift_2026-04-19-full \
|
||||
--archive-dir data/run_cache_archive/v2026-4-19-full \
|
||||
[--dry-run]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import anthropic
|
||||
import yaml
|
||||
|
||||
|
||||
ROOT = Path(__file__).resolve().parent.parent
|
||||
TASK_DIRS = [ROOT / "tasks" / f"tier{i}" for i in range(1, 6)]
|
||||
|
||||
FAILURE_PHRASES = [
|
||||
"gateway is restarting",
|
||||
"judge execution failed",
|
||||
"judge failed to run",
|
||||
"judge call failed",
|
||||
"judge timed out",
|
||||
]
|
||||
|
||||
# Weights copied from clawbench/scorer.py
|
||||
WEIGHTS_DETERMINISTIC = {"completion": 0.40, "trajectory": 0.30, "behavior": 0.20}
|
||||
WEIGHTS_WITH_JUDGE = {"completion": 0.40, "trajectory": 0.30, "behavior": 0.20, "judge": 0.10}
|
||||
WEIGHTS_SEMANTIC_ONLY = {"completion": 0.20, "trajectory": 0.20, "behavior": 0.10, "judge": 0.50}
|
||||
DETERMINISTIC_FLOOR = 0.9999
|
||||
|
||||
# Cache-sub → model label (for result JSON lookup)
|
||||
CACHE_TO_LABEL = {
|
||||
"openrouter_z-ai_glm-5.1": "glm",
|
||||
"openrouter_minimax_minimax-m2.7": "minimax",
|
||||
"openrouter_moonshotai_kimi-k2.5": "kimi",
|
||||
"openrouter_qwen_qwen3.6-plus": "qwen",
|
||||
"anthropic_claude-opus-4-6": "opus46",
|
||||
"anthropic_claude-opus-4-7": "opus47",
|
||||
"anthropic_claude-sonnet-4-6": "sonnet46",
|
||||
"openai_gpt-5.4": "gpt54",
|
||||
"openai_gpt-5.2": "gpt52",
|
||||
"google_gemini-3.1-pro-preview": "gemini",
|
||||
}
|
||||
|
||||
|
||||
def get_api_key() -> str:
    """Return the Anthropic API key for direct judge calls.

    Lookup order: the ``ANTHROPIC_API_KEY`` environment variable, then the
    ``env.ANTHROPIC_API_KEY`` entry of ``~/.openclaw/openclaw.json``.

    Raises:
        RuntimeError: If neither source yields a non-empty value.
    """
    from_env = os.environ.get("ANTHROPIC_API_KEY")
    if from_env:
        return from_env

    config_file = Path.home() / ".openclaw" / "openclaw.json"
    if config_file.exists():
        try:
            settings = json.loads(config_file.read_text())
            from_config = settings.get("env", {}).get("ANTHROPIC_API_KEY")
        except Exception:
            # Best-effort: an unreadable or malformed config counts as "no key".
            from_config = None
        if from_config:
            return from_config

    raise RuntimeError("No ANTHROPIC_API_KEY found (set env var or openclaw.json)")
|
||||
|
||||
|
||||
def load_tasks() -> dict[str, dict]:
    """Load every task definition found under ``TASK_DIRS``.

    Missing tier directories are skipped. Each ``*.yaml`` file is parsed with
    ``yaml.safe_load``; documents that are empty or lack an ``id`` key are
    ignored. Later files overwrite earlier ones that share an ``id``.

    Returns:
        Mapping of task id to the parsed task document.
    """
    by_id = {}
    existing_dirs = (tier_dir for tier_dir in TASK_DIRS if tier_dir.exists())
    for tier_dir in existing_dirs:
        for yaml_path in sorted(tier_dir.glob("*.yaml")):
            spec = yaml.safe_load(yaml_path.read_text())
            if not spec or "id" not in spec:
                continue
            by_id[spec["id"]] = spec
    return by_id
|
||||
|
||||
|
||||
def is_judge_infra_fail(jr: dict) -> bool:
    """Tell whether a stored judge result looks like an infra failure.

    A result that is empty or not ``enabled`` is never a failure. Otherwise
    it is one when any of the following holds:

    * its lower-cased ``reason`` contains one of ``FAILURE_PHRASES``;
    * it carries a truthy ``error``;
    * its ``reason`` is blank and its ``score`` is 0 (likely an unreported
      failure).
    """
    if not (jr and jr.get("enabled")):
        return False

    reason_text = (jr.get("reason") or "").lower()
    mentions_failure = any(phrase in reason_text for phrase in FAILURE_PHRASES)
    if mentions_failure or jr.get("error"):
        return True

    # Empty reason + score 0 is likely an unreported failure
    return not reason_text.strip() and jr.get("score", 0) == 0
|
||||
|
||||
|
||||
def render_transcript_excerpt(transcript: dict, max_chars: int = 4000) -> str:
    """Flatten a transcript into a short, line-oriented text excerpt.

    For each entry of ``transcript["messages"]`` this emits, in order:
    the stripped ``text`` (first 500 chars) if non-empty, one line per
    ``tool_calls`` item (name plus the first 120 chars of the JSON-encoded
    arguments), and a ``[tool_result]`` line (first 300 chars of
    ``tool_result_content``) when ``tool_result_for`` is truthy.

    Args:
        transcript: Transcript mapping; a falsy value yields an empty excerpt.
        max_chars: Longest excerpt returned before a truncation marker is
            appended.

    Returns:
        The joined lines, cut to ``max_chars`` plus ``"\\n... (truncated)"``
        when longer.
    """

    def describe(message):
        speaker = message.get("role", "?")
        body = (message.get("text") or "").strip()
        if body:
            yield f"[{speaker}] {body[:500]}"
        for call in message.get("tool_calls") or []:
            rendered_args = json.dumps(call.get("arguments", {}))[:120]
            yield f"[{speaker}/tool] {call.get('name', '?')}({rendered_args})"
        if message.get("tool_result_for"):
            payload = message.get("tool_result_content") or ""
            yield f"[tool_result] {payload[:300]}"

    messages = transcript.get("messages", []) if transcript else []
    joined = "\n".join(line for message in messages for line in describe(message))
    if len(joined) <= max_chars:
        return joined
    return joined[:max_chars] + "\n... (truncated)"
|
||||
|
||||
|
||||
def build_judge_prompt(task: dict, run: dict) -> str:
    """Assemble the text prompt sent to the judge model for one run.

    The prompt is the task's ``judge.rubric`` followed by a summary of the
    run's ``completion_result`` (score, passed/total assertions, up to five
    failed assertions) and an excerpt of the run transcript.

    Args:
        task: Task definition; ``task["judge"]["rubric"]`` is used if present.
        run: Archived run record with ``transcript`` and ``completion_result``.
    """
    rubric = task.get("judge", {}).get("rubric", "").strip()
    excerpt = render_transcript_excerpt(run.get("transcript", {}))

    completion = run.get("completion_result", {})
    score_part = f"score={completion.get('score',0):.3f} "
    passed_part = (
        f"passed={completion.get('passed_assertions',0)}"
        f"/{completion.get('total_assertions',0)}"
    )
    summary_line = score_part + passed_part

    failed = completion.get("failed_assertions", [])
    if failed:
        # Only the first five failures are shown to keep the prompt short.
        feedback = "\n".join(f"- {item}" for item in failed[:5])
    else:
        feedback = "(none)"

    sections = [
        f"{rubric}\n\n",
        f"=== Completion verifier summary ===\n{summary_line}\n",
        f"Failed assertions:\n{feedback}\n\n",
        f"=== Transcript excerpt ===\n{excerpt}\n",
    ]
    return "".join(sections)
|
||||
|
||||
|
||||
JSON_RE = re.compile(r"\{.*\}", re.DOTALL)
|
||||
|
||||
|
||||
def parse_judge_response(raw: str, threshold: float) -> dict:
    """Parse the judge model's reply into a normalized judge-result dict.

    The reply may wrap its JSON verdict in prose. Every ``{`` is tried as the
    start of a JSON object (``raw_decode`` tolerates trailing text); the first
    object holding a ``"score"`` key wins. If none has one, the first object
    that decoded is used, so a verdict without a score still defaults to 0.

    Args:
        raw: Raw text returned by the judge.
        threshold: Passing threshold recorded in the result and compared
            against the clamped score.

    Returns:
        A dict with ``enabled``, ``score`` and ``confidence`` (both clamped
        to [0, 1] and rounded to 4 places), ``reason``, ``rubric_hits``,
        ``rubric_misses``, ``passing_threshold``, ``passed`` and ``error``.
        On any parsing problem, including a non-finite score or confidence,
        ``score`` is 0.0, ``passed`` is False and ``error`` holds the message.
    """
    import math  # function-scope import: only this block needs it

    try:
        start = raw.find("{")
        if start < 0:
            raise ValueError("no JSON in response")

        decoder = json.JSONDecoder()
        obj = None
        fallback = None
        first_error = None
        while start >= 0:
            try:
                candidate, end = decoder.raw_decode(raw[start:])
            except ValueError as exc:  # JSONDecodeError subclasses ValueError
                if first_error is None:
                    first_error = exc
                # Braces in prose (e.g. "{placeholder}") are skipped one at a time.
                start = raw.find("{", start + 1)
                continue
            if "score" in candidate:
                obj = candidate
                break
            if fallback is None:
                fallback = candidate
            # Resume after the decoded object so nested objects are not
            # mistaken for a separate top-level verdict.
            start = raw.find("{", start + end)

        if obj is None:
            obj = fallback
        if obj is None:
            raise first_error

        score = float(obj.get("score", 0))
        confidence = float(obj.get("confidence", 0.5))
        # json accepts NaN/Infinity; min/max clamping would turn NaN into 1.0.
        if not (math.isfinite(score) and math.isfinite(confidence)):
            raise ValueError("non-finite score or confidence in judge response")
        score = max(0.0, min(1.0, score))
        confidence = max(0.0, min(1.0, confidence))
        reason = str(obj.get("reason", ""))
        return {
            "enabled": True,
            "score": round(score, 4),
            "confidence": round(confidence, 4),
            "reason": reason,
            "rubric_hits": obj.get("rubric_hits") or [],
            "rubric_misses": obj.get("rubric_misses") or [],
            "passing_threshold": threshold,
            # Compare the clamped score so ``passed`` agrees with ``score``.
            "passed": score >= threshold,
            "error": None,
        }
    except Exception as exc:
        return {
            "enabled": True, "score": 0.0, "confidence": 0.0,
            "reason": f"parse failed: {exc}", "rubric_hits": [], "rubric_misses": [],
            "passing_threshold": threshold, "passed": False, "error": str(exc),
        }
|
||||
|
||||
|
||||
def combine_run_score(c: float, t: float, b: float, j: Optional[float], has_det: bool) -> float:
    """Combine component scores into a single run score in [0, 1].

    Args:
        c: Completion score.
        t: Trajectory score.
        b: Behavior score.
        j: Judge score, or None when no usable judge score exists.
        has_det: Whether the run has deterministic assertions.

    Returns:
        The weighted score, clamped to [0, 1] and rounded to 4 places:

        * no judge score, or deterministic assertions with
          ``c < DETERMINISTIC_FLOOR``: ``WEIGHTS_DETERMINISTIC``,
          renormalized by the sum of those weights;
        * deterministic assertions that passed the floor:
          ``WEIGHTS_WITH_JUDGE``;
        * no deterministic assertions: ``WEIGHTS_SEMANTIC_ONLY``.
    """
    judge_excluded = j is None or (has_det and c < DETERMINISTIC_FLOOR)
    if judge_excluded:
        weights = WEIGHTS_DETERMINISTIC
        blended = (
            weights["completion"]*c + weights["trajectory"]*t + weights["behavior"]*b
        )
        blended = blended/sum(weights.values())
    else:
        weights = WEIGHTS_WITH_JUDGE if has_det else WEIGHTS_SEMANTIC_ONLY
        blended = (
            weights["completion"]*c + weights["trajectory"]*t + weights["behavior"]*b
            + weights["judge"]*j
        )
    return round(min(1.0, max(0.0, blended)), 4)
|
||||
|
||||
|
||||
def main() -> None:
    """Re-judge archived runs whose judge result looks like an infra failure.

    Walks ``--archive-dir/<model>/<task>/run*.json`` for model directories
    listed in ``CACHE_TO_LABEL``, collects runs flagged by
    ``is_judge_infra_fail``, and, unless ``--dry-run`` is given, re-runs the
    judge through the Anthropic API, recomputes ``run_score`` and rewrites
    each run file in place (temp file + replace).

    ``--drift-dir`` is parsed as a required option but is not read here.
    Exits with status 1 when the archive directory does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--drift-dir", required=True, type=Path)
    parser.add_argument("--archive-dir", required=True, type=Path)
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    archive_root = args.archive_dir
    if not archive_root.exists():
        print(f"Archive dir missing: {archive_root}")
        sys.exit(1)

    tasks = load_tasks()
    print(f"Loaded {len(tasks)} task definitions")

    # Gather all affected runs: (cache_sub, task_id, run_path, run_data)
    affected: list = []
    known_model_dirs = (
        entry for entry in sorted(archive_root.iterdir())
        if entry.is_dir() and entry.name in CACHE_TO_LABEL
    )
    for model_dir in known_model_dirs:
        for task_dir in model_dir.iterdir():
            if not task_dir.is_dir():
                continue
            for run_file in sorted(task_dir.glob("run*.json")):
                try:
                    run = json.loads(run_file.read_text())
                except Exception:
                    # Unreadable or malformed run files are skipped.
                    continue
                if is_judge_infra_fail(run.get("judge_result", {})):
                    affected.append((model_dir.name, task_dir.name, run_file, run))

    total = len(affected)
    print(f"Found {total} runs with judge infra failures")
    if args.dry_run:
        from collections import Counter
        per_model = Counter(entry[0] for entry in affected)
        for model_name, count in per_model.most_common():
            print(f" {model_name}: {count}")
        return
    if not affected:
        return

    client = anthropic.Anthropic(api_key=get_api_key())

    # Re-judge each
    succeeded = 0
    failed = 0
    for position, (cache_sub, task_id, run_path, run) in enumerate(affected, start=1):
        task = tasks.get(task_id)
        if not (task and task.get("judge")):
            # No task definition or no judge section: nothing to re-run.
            continue
        prompt = build_judge_prompt(task, run)
        threshold = task["judge"].get("passing_threshold", 0.7)
        print(f"[{position}/{total}] {cache_sub}/{task_id}/{run_path.name} ... ", end="", flush=True)
        try:
            started = time.monotonic()
            response = client.messages.create(
                model="claude-sonnet-4-6", max_tokens=1024,
                messages=[{"role": "user", "content": prompt}],
            )
            raw_text = response.content[0].text
            elapsed_ms = int((time.monotonic() - started) * 1000)

            verdict = parse_judge_response(raw_text, threshold)
            verdict.update({
                "model": "anthropic/claude-sonnet-4-6",
                "duration_ms": elapsed_ms,
                "token_usage": {
                    "input_tokens": response.usage.input_tokens,
                    "output_tokens": response.usage.output_tokens,
                },
                "rejudged_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            })
            run["judge_result"] = verdict

            # Recompute run_score
            completion = run.get("completion_result", {})
            trajectory = run.get("trajectory_result", {})
            behavior = run.get("behavior_result", {})
            has_det = completion.get("total_assertions", 0) > 0
            # A verdict that failed to parse contributes no judge score.
            usable = verdict["enabled"] and not verdict.get("error")
            judge_score = verdict["score"] if usable else None
            previous = run.get("run_score", 0)
            updated = combine_run_score(
                completion.get("score", 0),
                trajectory.get("score", 0),
                behavior.get("score", 0),
                judge_score,
                has_det,
            )
            run["run_score"] = updated

            # Write to a sibling temp file, then swap it over the original.
            scratch = run_path.with_suffix(".json.tmp")
            scratch.write_text(json.dumps(run, indent=2))
            scratch.replace(run_path)
            print(f"J={verdict['score']:.2f} ΔRS={updated - previous:+.3f}")
            succeeded += 1
        except Exception as exc:
            print(f"ERROR: {exc}")
            failed += 1

    print(f"\nRe-judging complete: {succeeded} succeeded, {failed} failed")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
136
scripts/setup_gbrain_runtime.sh
Executable file
136
scripts/setup_gbrain_runtime.sh
Executable file
@ -0,0 +1,136 @@
|
||||
#!/usr/bin/env bash
# Prepare a lane-local GBrain install for OpenClaw benchmark runs.
#
# The image supplies /opt/gbrain and this script keeps secrets runtime-only:
# keys are read from the lane's openclaw.json env block or existing process env,
# never baked into Docker layers.
#
# Steps: export API keys, enable the gbrain plugin in the OpenClaw config,
# initialize or migrate ~/.gbrain, seed and import the brain repo, run doctor.
# A no-op unless CLAWBENCH_ENABLE_GBRAIN=1.
set -Eeuo pipefail

if [ "${CLAWBENCH_ENABLE_GBRAIN:-0}" != "1" ]; then
  exit 0
fi

: "${HOME:?HOME is required}"

GBRAIN_ROOT="${GBRAIN_ROOT:-/opt/gbrain}"
if [ ! -d "$GBRAIN_ROOT" ]; then
  echo "[gbrain] missing $GBRAIN_ROOT" >&2
  exit 1
fi

export PATH="$GBRAIN_ROOT/bin:/usr/local/bun/bin:$PATH"
export GBRAIN_ALLOW_SHELL_JOBS="${GBRAIN_ALLOW_SHELL_JOBS:-1}"

STATE_DIR="${OPENCLAW_STATE_DIR:-$HOME/.openclaw}"
CONFIG_PATH="${OPENCLAW_CONFIG_PATH:-$STATE_DIR/openclaw.json}"
LOG_DIR="${CLAWBENCH_GBRAIN_LOG_DIR:-$STATE_DIR/logs}"
mkdir -p "$HOME/.gbrain" "$LOG_DIR"
LOG_PATH="$LOG_DIR/gbrain-runtime.log"

# Logged below; only claims the plugin was enabled when the config was edited.
PLUGIN_NOTE="[gbrain] no config at $CONFIG_PATH; plugin path not enabled"

if [ -f "$CONFIG_PATH" ]; then
  # Export API keys: process env wins, the config's "env" block is the fallback.
  # Values are shell-quoted by shlex before being eval'd.
  eval "$(python3 - "$CONFIG_PATH" <<'PY'
import json
import os
import shlex
import sys

config_path = sys.argv[1]
try:
    with open(config_path, encoding="utf-8") as handle:
        data = json.load(handle)
except Exception:
    data = {}
env = data.get("env") if isinstance(data, dict) else {}
if not isinstance(env, dict):
    env = {}
for key in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY"):
    value = os.environ.get(key) or env.get(key)
    if value:
        print(f"export {key}={shlex.quote(str(value))}")
PY
  )"

  # Allow, enable and register the gbrain plugin in the config.
  # Wrongly-typed intermediate values are replaced with fresh containers.
  python3 - "$CONFIG_PATH" "$GBRAIN_ROOT" <<'PY'
import json
import sys

config_path = sys.argv[1]
gbrain_root = sys.argv[2]
try:
    with open(config_path, encoding="utf-8") as handle:
        data = json.load(handle)
except Exception:
    # NOTE(review): an unparseable config is replaced below by a file that
    # contains only the plugins section -- confirm this is intended.
    data = {}
if not isinstance(data, dict):
    data = {}

plugins = data.setdefault("plugins", {})
if not isinstance(plugins, dict):
    plugins = {}
    data["plugins"] = plugins

allow = plugins.get("allow")
if not isinstance(allow, list):
    allow = []
    plugins["allow"] = allow
if "gbrain" not in allow:
    allow.append("gbrain")

entries = plugins.get("entries")
if not isinstance(entries, dict):
    entries = {}
    plugins["entries"] = entries
entry = entries.get("gbrain")
if not isinstance(entry, dict):
    entry = {}
    entries["gbrain"] = entry
entry["enabled"] = True

load = plugins.get("load")
if not isinstance(load, dict):
    load = {}
    plugins["load"] = load
paths = load.get("paths")
if not isinstance(paths, list):
    paths = []
    load["paths"] = paths
if gbrain_root not in paths:
    paths.append(gbrain_root)

with open(config_path, "w", encoding="utf-8") as handle:
    json.dump(data, handle, indent=2)
    handle.write("\n")
PY
  PLUGIN_NOTE="[gbrain] plugin path enabled in $CONFIG_PATH"
fi

# The log is truncated on every run.
echo "[gbrain] preparing HOME=$HOME" > "$LOG_PATH"
echo "[gbrain] version: $(gbrain --version 2>/dev/null || true)" >> "$LOG_PATH"
echo "$PLUGIN_NOTE" >> "$LOG_PATH"

# First run initializes; later runs apply migrations on a best-effort basis.
if [ ! -f "$HOME/.gbrain/config.json" ]; then
  gbrain init >> "$LOG_PATH" 2>&1
else
  gbrain apply-migrations --yes --non-interactive >> "$LOG_PATH" 2>&1 || true
fi

BRAIN_REPO="${GBRAIN_BRAIN_REPO:-$HOME/brain}"
mkdir -p "$BRAIN_REPO"
# Seed a generic page only when the repo has no Markdown files yet.
if [ "${CLAWBENCH_GBRAIN_SEED_SMOKE:-1}" = "1" ] && ! find "$BRAIN_REPO" -type f -name '*.md' -print -quit | grep -q .; then
  cat > "$BRAIN_REPO/gbrain-smoke.md" <<'EOF'
# GBrain smoke page

This page verifies that the benchmark image can initialize, import, and query a
lane-local GBrain database. It is intentionally generic and not task-specific.
EOF
fi

if find "$BRAIN_REPO" -type f -name '*.md' -print -quit | grep -q .; then
  gbrain import "$BRAIN_REPO" --no-embed >> "$LOG_PATH" 2>&1 || true
  if [ -n "${OPENAI_API_KEY:-}" ]; then
    gbrain embed --stale >> "$LOG_PATH" 2>&1 || true
  else
    echo "[gbrain] OPENAI_API_KEY not available; semantic embeddings skipped" >> "$LOG_PATH"
  fi
fi

gbrain doctor --json >> "$LOG_PATH" 2>&1 || true
echo "[gbrain] ready" >> "$LOG_PATH"
|
||||
@ -57,12 +57,10 @@ tasks-public/
|
||||
docker build -t clawbench .
|
||||
```
|
||||
|
||||
The repo `Dockerfile` pins an OpenClaw image digest so public Space
|
||||
builds do not silently drift. Override `OPENCLAW_IMAGE` only when you
|
||||
intend to measure a different platform build. Note that platform
|
||||
upgrades can shift scores (we observed +0.13 to +0.29 per model going
|
||||
from 4.9 → 4.15-beta.1) — when comparing two model runs, build them
|
||||
against the same OpenClaw release.
|
||||
The repo `Dockerfile` layers ClawBench on the configured OpenClaw base
|
||||
image. Platform upgrades can shift scores, so record the OpenClaw
|
||||
version for every published comparison and build both sides of a
|
||||
comparison against the same OpenClaw release.
|
||||
|
||||
## How to run Core v1
|
||||
|
||||
@ -107,10 +105,8 @@ your ClawBench config. See MANIFEST.yaml for a programmatic list.
|
||||
- **OpenClaw platform version matters.** Upgrading from 4.9 → 4.15-beta.1
|
||||
shifted scores by +0.13 to +0.29 across models. Build both sides of
|
||||
any comparison from the same OpenClaw release.
|
||||
- **Judge scores** come from Claude Sonnet 4.6 via direct Anthropic
|
||||
API (with a fallback from the gateway judge). Scores assume the
|
||||
judge is working correctly; re-judging broken runs may be required
|
||||
(see `scripts/rejudge_all.py` in the main repo).
|
||||
- **Judge scores** are advisory and depend on the configured judge model.
|
||||
They are reported separately and cannot replace deterministic checks.
|
||||
|
||||
## What's NOT in Core v1
|
||||
|
||||
@ -120,9 +116,9 @@ your ClawBench config. See MANIFEST.yaml for a programmatic list.
|
||||
- **9 noise tasks** (cross-model SNR < 0.5) — either broken verifiers
|
||||
or genuinely ambiguous prompts. Scheduled for redesign.
|
||||
- **3 ranking-breaker tasks** — tasks where the cross-model ordering
|
||||
conflicts with the reference ranking (e.g. `t2-node-search-patch`,
|
||||
`t5-contradictory-requirements`). Not broken per se; just
|
||||
inconsistent with the headline.
|
||||
conflicts with the reference ranking. Not broken per se; just
|
||||
inconsistent with the headline. Their task IDs and contents remain
|
||||
private with the rest of the holdout.
|
||||
|
||||
Also missing entirely from Core v1:
|
||||
- **Tier 6 long-horizon (100+ turn) tasks** — planned for v2.
|
||||
|
||||
122
tests/test_ablation.py
Normal file
122
tests/test_ablation.py
Normal file
@ -0,0 +1,122 @@
|
||||
from clawbench.ablation import (
|
||||
common_compatible_task_set,
|
||||
compare_results,
|
||||
default_tool_profile,
|
||||
)
|
||||
from clawbench.adapters.hermes import HermesAdapterConfig
|
||||
from clawbench.schemas import (
|
||||
BenchmarkResult,
|
||||
CompletionSpec,
|
||||
FileState,
|
||||
SimulatedUser,
|
||||
TaskDefinition,
|
||||
TaskFamily,
|
||||
TaskStats,
|
||||
Tier,
|
||||
UserTurn,
|
||||
)
|
||||
|
||||
|
||||
def _task(task_id: str) -> TaskDefinition:
    """Build a minimal tier-1 coding task that expects ``out.txt`` to exist."""
    single_turn_user = SimulatedUser(turns=[UserTurn(message="write out.txt")])
    expects_out_file = CompletionSpec(files=[FileState(path="out.txt")])
    return TaskDefinition(
        id=task_id,
        name=task_id,
        tier=Tier.TIER1,
        family=TaskFamily.CODING,
        surface="coding",
        user=single_turn_user,
        completion=expects_out_file,
    )
|
||||
|
||||
|
||||
def test_tool_profile_fingerprint_is_stable() -> None:
    """Two profiles built from identical inputs share a fingerprint."""
    toolsets = ["hermes-api-server"]
    config = HermesAdapterConfig(driver_mode="ai_agent", enabled_toolsets=["hermes-api-server"])
    first = default_tool_profile(adapter="hermes", config=config, enabled_toolsets=list(toolsets))
    second = default_tool_profile(adapter="hermes", config=config, enabled_toolsets=list(toolsets))

    assert first.fingerprint == second.fingerprint
    for interface in ("browser", "multi_turn"):
        assert interface in first.interfaces
|
||||
|
||||
|
||||
def test_common_compatible_task_set_uses_effective_adapter_config() -> None:
    """Both tasks stay in the plan when each lane's adapter config supports them."""
    lanes = {
        "openclaw": ("openclaw", None),
        "hermes": ("hermes", HermesAdapterConfig(driver_mode="ai_agent")),
    }
    candidate_tasks = [_task("a"), _task("b")]

    plan = common_compatible_task_set(candidate_tasks, lanes)

    assert plan.task_ids == ["a", "b"]
    assert plan.skipped == {}
|
||||
|
||||
|
||||
def _result(label: str, model: str, task_ids: list[str], score: float) -> BenchmarkResult:
    """Build a synthetic ``BenchmarkResult`` with one ``TaskStats`` per task id.

    Every task gets perfect component scores and ``score`` as its run/task
    score; the overall score and both CI bounds are also set to ``score``.
    """

    def stats_for(task_id: str) -> TaskStats:
        return TaskStats(
            task_id=task_id,
            tier="tier1",
            family="coding",
            runs=1,
            mean_completion_score=1.0,
            mean_trajectory_score=1.0,
            mean_behavior_score=1.0,
            mean_run_score=score,
            reliability_score=1.0,
            variance_score=1.0,
            mean_task_score=score,
            stddev=0.0,
            min_score=score,
            max_score=score,
            pass_at_1=True,
            pass_rate=1.0,
            pass_hat_k=True,
        )

    return BenchmarkResult(
        submission_id=label,
        model=model,
        provider="test",
        timestamp="2026-04-25T00:00:00Z",
        overall_score=score,
        overall_completion=1.0,
        overall_trajectory=1.0,
        overall_behavior=1.0,
        overall_reliability=1.0,
        overall_ci_lower=score,
        overall_ci_upper=score,
        overall_pass_hat_k=1.0,
        task_results=[stats_for(task_id) for task_id in task_ids],
    )
|
||||
|
||||
|
||||
def test_compare_results_rejects_different_task_sets() -> None:
    """Same model but different task sets must not be reported as fair."""
    results = {
        "a": _result("a", "m", ["t1", "t2"], 0.8),
        "b": _result("b", "m", ["t1"], 0.9),
    }

    comparison = compare_results(results)

    expected_flags = {
        "fair": False,
        "task_verifier_fair": False,
        "controlled_ablation": False,
        "same_model": True,
        "same_task_set": False,
    }
    for flag, expected in expected_flags.items():
        assert comparison[flag] is expected
|
||||
|
||||
|
||||
def test_compare_results_allows_cross_model_same_task_leaderboard() -> None:
    """Different models on the same task snapshot are fair but not an ablation."""
    results = {
        "a": _result("a", "model-a", ["t1", "t2"], 0.8),
        "b": _result("b", "model-b", ["t1", "t2"], 0.9),
    }
    results["a"].task_snapshot_fingerprint = "snapshot-1"
    results["b"].task_snapshot_fingerprint = "snapshot-1"

    comparison = compare_results(results)

    expected_flags = {
        "fair": True,
        "task_verifier_fair": True,
        "controlled_ablation": False,
        "same_model": False,
    }
    for flag, expected in expected_flags.items():
        assert comparison[flag] is expected
|
||||
222
tests/test_adapter_base.py
Normal file
222
tests/test_adapter_base.py
Normal file
@ -0,0 +1,222 @@
|
||||
"""Tests for `clawbench.adapters.base` + registry.
|
||||
|
||||
Keeps the adapter ABC and registration helpers honest before any
|
||||
concrete adapter lands. A parametrized contract test in
|
||||
`test_adapter_contract.py` will exercise the ABC against every shipped
|
||||
adapter later.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from clawbench.adapters import (
|
||||
ADAPTERS,
|
||||
AdapterContext,
|
||||
AgentAdapter,
|
||||
PhaseResult,
|
||||
StateQueryResult,
|
||||
get_adapter,
|
||||
register_adapter,
|
||||
)
|
||||
from clawbench.canonical import (
|
||||
AdapterCapability,
|
||||
CanonicalPhase,
|
||||
CanonicalTask,
|
||||
StateQuery,
|
||||
)
|
||||
from clawbench.canonical.convert import from_task_definition
|
||||
from clawbench.schemas import (
|
||||
CompletionSpec,
|
||||
ExecutionCheck,
|
||||
FileState,
|
||||
SimulatedUser,
|
||||
TaskDefinition,
|
||||
TaskFamily,
|
||||
TaskSetup,
|
||||
Tier,
|
||||
Transcript,
|
||||
UserTurn,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Minimal adapter for contract verification.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class _EchoAdapter(AgentAdapter):
    """Minimal adapter: answers state queries purely from its declared capabilities."""

    name = "echo-test-adapter"
    capabilities = {AdapterCapability.FILES, AdapterCapability.EXECUTION}

    async def setup(self, ctx: AdapterContext) -> None:  # pragma: no cover - trivial
        return None

    async def run_phase(
        self, phase: CanonicalPhase, ctx: AdapterContext
    ) -> PhaseResult:
        # No messages are produced; the phase name is echoed in the metadata.
        metadata = {"phase": phase.name}
        return PhaseResult(messages=[], adapter_metadata=metadata)

    async def verify_state_query(
        self, query: StateQuery, ctx: AdapterContext
    ) -> StateQueryResult:
        needed = query.required_capability
        if needed not in self.capabilities:
            return StateQueryResult(
                ok=False,
                detail=f"echo adapter does not provide {needed.value}",
                capability_missing=True,
            )
        return StateQueryResult(ok=True, detail="echo-adapter-always-ok")

    async def teardown(self, ctx: AdapterContext) -> None:  # pragma: no cover - trivial
        return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Registry
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_register_adapter_adds_to_registry_and_get_adapter_resolves() -> None:
    """Registering an adapter makes it reachable by name via both lookups."""
    snapshot = dict(ADAPTERS)
    try:
        register_adapter(_EchoAdapter)
        by_index = ADAPTERS["echo-test-adapter"]
        assert by_index is _EchoAdapter
        by_lookup = get_adapter("echo-test-adapter")
        assert by_lookup is _EchoAdapter
    finally:
        # Restore the global registry so other tests see it unchanged.
        ADAPTERS.clear()
        ADAPTERS.update(snapshot)
|
||||
|
||||
|
||||
def test_register_adapter_rejects_duplicate_name() -> None:
    """A second adapter class reusing a registered name raises ValueError."""

    class _OtherEcho(AgentAdapter):
        name = "echo-test-adapter"
        capabilities = {AdapterCapability.FILES}

        async def setup(self, ctx: AdapterContext) -> None:  # pragma: no cover
            return None

        async def run_phase(self, phase, ctx) -> PhaseResult:  # pragma: no cover
            return PhaseResult()

        async def verify_state_query(self, query, ctx) -> StateQueryResult:  # pragma: no cover
            return StateQueryResult(ok=False, capability_missing=True)

        async def teardown(self, ctx: AdapterContext) -> None:  # pragma: no cover
            return None

    snapshot = dict(ADAPTERS)
    try:
        register_adapter(_EchoAdapter)
        with pytest.raises(ValueError):
            register_adapter(_OtherEcho)
    finally:
        # Restore the global registry so other tests see it unchanged.
        ADAPTERS.clear()
        ADAPTERS.update(snapshot)
|
||||
|
||||
|
||||
def test_register_adapter_requires_name() -> None:
    """An adapter class that declares no ``name`` cannot be registered."""

    class _Nameless(AgentAdapter):
        # No ``name`` attribute on purpose: that omission is what is under test.
        capabilities = {AdapterCapability.FILES}

        async def setup(self, ctx: AdapterContext) -> None:  # pragma: no cover
            return None

        async def run_phase(self, phase, ctx) -> PhaseResult:  # pragma: no cover
            return PhaseResult()

        async def verify_state_query(self, query, ctx) -> StateQueryResult:  # pragma: no cover
            return StateQueryResult(ok=False, capability_missing=True)

        async def teardown(self, ctx: AdapterContext) -> None:  # pragma: no cover
            return None

    with pytest.raises(ValueError):
        register_adapter(_Nameless)
|
||||
|
||||
|
||||
def test_get_adapter_raises_for_unknown_name() -> None:
    """Looking up a name that was never registered raises KeyError."""
    unknown_name = "no-such-adapter-exists"
    with pytest.raises(KeyError):
        get_adapter(unknown_name)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Capability gating helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _file_task() -> CanonicalTask:
    """Build a canonical tier-1 coding task with one file check and one execution check."""
    simulated_user = SimulatedUser(
        max_turns=1,
        turns=[UserTurn(message="Do a thing.")],
    )
    completion = CompletionSpec(
        files=[FileState(path="out.txt", exists=True)],
        execution_checks=[ExecutionCheck(name="ok", command="true")],
    )
    definition = TaskDefinition(
        id="capability-test",
        name="capability test",
        tier=Tier.TIER1,
        family=TaskFamily.CODING,
        surface="coding",
        setup=TaskSetup(),
        user=simulated_user,
        completion=completion,
    )
    return from_task_definition(definition)
|
||||
|
||||
|
||||
def test_supports_is_true_when_capabilities_cover_task() -> None:
    """The echo adapter supports the file task and reports nothing missing."""
    canonical_task = _file_task()
    assert _EchoAdapter.supports(canonical_task)
    missing = _EchoAdapter.missing_capabilities_for(canonical_task)
    assert missing == set()
|
||||
|
||||
|
||||
def test_supports_is_false_when_task_needs_more() -> None:
    """Adding MEMORY to the task's requirements makes the echo adapter unsupported."""
    base_task = _file_task()
    widened_capabilities = base_task.required_adapter_capabilities | {AdapterCapability.MEMORY}
    task = base_task.model_copy(
        update={"required_adapter_capabilities": widened_capabilities}
    )
    assert not _EchoAdapter.supports(task)
    # MEMORY is the only capability the adapter does not declare.
    assert _EchoAdapter.missing_capabilities_for(task) == {AdapterCapability.MEMORY}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Context roundtrip (sanity: adapter methods can build and return
|
||||
# PhaseResult / StateQueryResult without tripping dataclass defaults)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_adapter_phase_result_round_trip(tmp_path: Path) -> None:
    """Drive setup, run_phase, verify_state_query and teardown on the echo adapter."""
    import asyncio

    canonical_task = _file_task()
    echo = _EchoAdapter()
    context = AdapterContext(
        task=canonical_task,
        workspace=tmp_path,
        runtime_values={},
        run_index=0,
        model="test-model",
        transcript=Transcript(),
    )

    async def _exercise() -> None:
        await echo.setup(context)

        first_phase = canonical_task.phases[0]
        phase_result = await echo.run_phase(first_phase, context)
        assert isinstance(phase_result, PhaseResult)
        assert phase_result.adapter_metadata == {"phase": canonical_task.phases[0].name}

        # MEMORY is not among the echo adapter's capabilities, so the query
        # must come back flagged as capability-missing.
        memory_query = StateQuery(
            kind="memory",
            required_capability=AdapterCapability.MEMORY,
            selector={"key_pattern": "x"},
        )
        query_result = await echo.verify_state_query(memory_query, context)
        assert query_result.capability_missing is True

        await echo.teardown(context)

    asyncio.run(_exercise())
|
||||
268
tests/test_canonical_convert.py
Normal file
268
tests/test_canonical_convert.py
Normal file
@ -0,0 +1,268 @@
|
||||
"""Tests for `clawbench.canonical.convert.from_task_definition`.
|
||||
|
||||
Covers the three representative task shapes:
|
||||
|
||||
1. A files + execution-only task (tier-1 bugfix) — must produce
|
||||
`required_adapter_capabilities == {FILES, EXECUTION}`.
|
||||
2. A memory-using, multi-phase task (tier-2 memory roundtrip) — must
   include `MEMORY`; `MULTI_TURN_INJECTION` must NOT be set, since each
   phase's user has exactly one static turn.
|
||||
3. A synthetic task exercising gateway_assertions, session, cron,
   browser, and a dynamic follow-up turn — must surface each capability
   (including `MULTI_TURN_INJECTION`).
|
||||
|
||||
The tests also round-trip the real task corpus through the converter
|
||||
to make sure every live YAML file produces a valid `CanonicalTask`
|
||||
(no missing-field or validation errors), since the converter is how
|
||||
every downstream adapter will see tasks.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from clawbench.canonical import (
|
||||
AdapterCapability,
|
||||
CanonicalTask,
|
||||
from_task_definition,
|
||||
)
|
||||
from clawbench.schemas import (
|
||||
BackgroundService,
|
||||
CompletionSpec,
|
||||
CronState,
|
||||
ExecutionCheck,
|
||||
FileState,
|
||||
GatewayAssertion,
|
||||
MemoryState,
|
||||
SessionState,
|
||||
SimulatedUser,
|
||||
TaskDefinition,
|
||||
TaskFamily,
|
||||
TaskSetup,
|
||||
Tier,
|
||||
UserTurn,
|
||||
)
|
||||
from clawbench.tasks import load_all_tasks
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixture builders
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _files_only_task() -> TaskDefinition:
    """Build a tier-1 coding task whose completion uses only file and execution checks."""
    simulated_user = SimulatedUser(
        max_turns=2,
        turns=[UserTurn(message="Fix the bug and run the tests.")],
    )
    completion = CompletionSpec(
        files=[FileState(path="src/main.py", exists=True)],
        execution_checks=[ExecutionCheck(name="tests", command="pytest -q")],
    )
    return TaskDefinition(
        id="test-files-only",
        name="Files-only task",
        tier=Tier.TIER1,
        family=TaskFamily.CODING,
        surface="coding",
        setup=TaskSetup(asset_packs=["pack_a"]),
        user=simulated_user,
        completion=completion,
    )
|
||||
|
||||
|
||||
def _memory_task() -> TaskDefinition:
    """Build a two-phase (store, then recall) tier-2 task with a memory seed and memory check."""
    store_phase = {
        "name": "store",
        "user": SimulatedUser(
            max_turns=1,
            turns=[UserTurn(message="Remember: stack = React, Node, Postgres.")],
        ),
    }
    recall_phase = {
        "name": "recall",
        "user": SimulatedUser(
            max_turns=1,
            turns=[UserTurn(message="What's my stack?")],
        ),
    }
    seeded_setup = TaskSetup(
        memory_seed=[{"key": "existing_key", "value": "existing_value"}],
    )
    completion = CompletionSpec(
        memory=[MemoryState(key_pattern="stack", exists=True, value_contains=["React"])],
    )
    return TaskDefinition(
        id="test-memory-roundtrip",
        name="Memory roundtrip",
        tier=Tier.TIER2,
        family=TaskFamily.MULTI_TOOL,
        surface="tools",
        setup=seeded_setup,
        phases=[store_phase, recall_phase],
        completion=completion,
    )
|
||||
|
||||
|
||||
def _full_surface_task() -> TaskDefinition:
    """Build a synthetic tier-3 browser task touching many verifier surfaces.

    The task combines a gateway pre-check, a background service, a
    conditional follow-up turn, and session / cron / gateway-assertion
    completion checks.
    """
    pre_check = GatewayAssertion(
        method="agents.list",
        assert_path="$.count",
        assert_equals=0,
    )
    echo_service = BackgroundService(
        name="echo-service",
        command="python3 -m http.server",
        port=0,
        ready_path="/",
    )
    # The second turn is conditional on a failed browser tool call, which is
    # what makes this user "dynamic".
    retry_turn = UserTurn(
        message="Try again.",
        when_tool_family="browser",
        when_last_tool_failed=True,
    )
    simulated_user = SimulatedUser(
        max_turns=4,
        turns=[UserTurn(message="Start the task."), retry_turn],
    )
    post_check = GatewayAssertion(
        method="memory.list",
        assert_path="$.count",
        assert_equals=1,
    )
    completion = CompletionSpec(
        session=SessionState(should_exist=True, model_should_be="claude-opus-4"),
        cron=[CronState(exists=True, description_contains="daily")],
        gateway_assertions=[post_check],
    )
    return TaskDefinition(
        id="test-full-surface",
        name="Full surface",
        tier=Tier.TIER3,
        family=TaskFamily.BROWSER,
        surface="browser",
        setup=TaskSetup(
            pre_check_gateway=[pre_check],
            background_services=[echo_service],
        ),
        user=simulated_user,
        completion=completion,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Capability inference
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_files_only_task_requires_only_files_and_execution() -> None:
    """A files + execution task converts with exactly {FILES, EXECUTION} and keeps its metadata."""
    definition = _files_only_task()
    # Populate the flat leaderboard dimensions so their pass-through can be checked.
    definition.category = "software_engineering"
    definition.domain = "devtools"
    definition.functionality = ["bugfix", "test_verification"]
    definition.trace_distribution = ["read_heavy", "edit_heavy", "execute_heavy"]
    definition.tool_surface = ["filesystem", "shell"]
    definition.risk_tags = ["code_regression"]

    converted = from_task_definition(definition)
    assert isinstance(converted, CanonicalTask)
    expected_capabilities = {AdapterCapability.FILES, AdapterCapability.EXECUTION}
    assert converted.required_adapter_capabilities == expected_capabilities

    # Dimension fields are copied across unchanged.
    assert converted.category == "software_engineering"
    assert converted.domain == "devtools"
    assert converted.functionality == ["bugfix", "test_verification"]
    assert converted.trace_distribution == ["read_heavy", "edit_heavy", "execute_heavy"]
    assert converted.tool_surface == ["filesystem", "shell"]
    assert converted.risk_tags == ["code_regression"]

    # The single asset pack becomes a single file-kind seed entry.
    seeds = converted.assets.seed_state
    assert len(seeds) == 1
    assert seeds[0].kind == "file"
    assert seeds[0].asset_pack == "pack_a"

    # File and execution checks carry over; nothing needs a state query.
    verifier = converted.verifier
    assert len(verifier.file_states) == 1
    assert len(verifier.execution_checks) == 1
    assert verifier.state_queries == []

    # One non-dynamic phase means no dynamic user triggers.
    assert converted.interaction.uses_dynamic_user_triggers is False
|
||||
|
||||
|
||||
def test_memory_task_requires_memory_capability() -> None:
    """The memory roundtrip task needs MEMORY but not MULTI_TURN_INJECTION."""
    converted = from_task_definition(_memory_task())
    required = converted.required_adapter_capabilities
    assert AdapterCapability.MEMORY in required
    # Each of the two phases has a single static turn, so no turn injection
    # is needed even though the task is multi-phase.
    assert AdapterCapability.MULTI_TURN_INJECTION not in required
    assert converted.interaction.allow_multi_phase is True
    assert len(converted.phases) == 2

    # The memory seed is lifted into a memory-kind seed entry.
    memory_seeds = [seed for seed in converted.assets.seed_state if seed.kind == "memory"]
    assert len(memory_seeds) == 1
    assert memory_seeds[0].key == "existing_key"

    # The memory completion check becomes one MEMORY-gated state query.
    memory_queries = [
        query for query in converted.verifier.state_queries if query.kind == "memory"
    ]
    assert len(memory_queries) == 1
    query = memory_queries[0]
    assert query.required_capability is AdapterCapability.MEMORY
    assert query.selector == {"key_pattern": "stack"}
    assert query.expected == {"value_contains": ["React"]}
|
||||
|
||||
|
||||
def test_full_surface_task_surfaces_every_capability() -> None:
    """The synthetic full-surface task requires each capability it touches."""
    converted = from_task_definition(_full_surface_task())
    required = converted.required_adapter_capabilities
    assert AdapterCapability.FILES in required
    assert AdapterCapability.EXECUTION in required
    assert AdapterCapability.SESSION in required
    assert AdapterCapability.CRON in required
    assert AdapterCapability.GATEWAY_RPC in required
    assert AdapterCapability.BROWSER in required
    # The conditional turn (when_tool_family + when_last_tool_failed) flags MTI.
    assert AdapterCapability.MULTI_TURN_INJECTION in required

    verifier = converted.verifier

    # pre_check_gateway is preserved as a pre-run query.
    assert len(verifier.pre_run_queries) == 1
    assert verifier.pre_run_queries[0].required_capability is AdapterCapability.GATEWAY_RPC

    # gateway_assertions arrive as "custom" state queries.
    custom_queries = [query for query in verifier.state_queries if query.kind == "custom"]
    assert len(custom_queries) == 1
    assert custom_queries[0].selector["method"] == "memory.list"

    # The session model constraint shows up in the query's expectation.
    session_queries = [query for query in verifier.state_queries if query.kind == "session"]
    assert len(session_queries) == 1
    assert session_queries[0].expected == {"model": "claude-opus-4"}
|
||||
|
||||
|
||||
def test_background_services_pass_through_unchanged() -> None:
    """The background service's name and command are preserved by conversion."""
    converted = from_task_definition(_full_surface_task())
    services = converted.assets.background_services
    assert len(services) == 1
    echo_service = services[0]
    assert echo_service.name == "echo-service"
    assert echo_service.command == "python3 -m http.server"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Whole-corpus smoke
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_every_task_in_corpus_converts() -> None:
    """Every shipped task YAML must produce a valid CanonicalTask.

    Acts as a regression gate: a new TaskDefinition field the converter
    does not know about will likely still work (ignored fields do not
    break the canonical form), but a task using a new completion shape
    the converter cannot translate will raise here.

    Every per-task assertion names the offending task id, so a failure
    points straight at the YAML file that regressed.
    """
    tasks = load_all_tasks()
    assert tasks, "expected at least one task in the corpus"
    for task in tasks:
        canonical = from_task_definition(task)
        required = canonical.required_adapter_capabilities
        # Every canonical task must declare FILES + EXECUTION capability.
        assert AdapterCapability.FILES in required, (
            f"{task.id}: FILES capability missing"
        )
        assert AdapterCapability.EXECUTION in required, (
            f"{task.id}: EXECUTION capability missing"
        )
        # Phases always have at least one entry (normalized_phases fills
        # one from `user` when `phases` is absent).
        assert canonical.phases, f"{task.id}: canonical phases empty"
        # Budgets honour the source timeout.
        assert canonical.budgets.timeout_seconds == task.timeout_seconds, (
            f"{task.id}: canonical timeout {canonical.budgets.timeout_seconds!r} "
            f"!= source timeout {task.timeout_seconds!r}"
        )
|
||||
@ -37,6 +37,42 @@ def test_gateway_config_invalid_env_falls_back_to_default(monkeypatch, caplog, r
|
||||
assert any("CLAWBENCH_CONNECT_TIMEOUT" in r.getMessage() for r in caplog.records)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_gateway_client_disables_websocket_keepalive_for_long_rpc(
    monkeypatch: pytest.MonkeyPatch,
):
    """``connect()`` passes ``ping_interval=None`` and ``ping_timeout=None`` to websockets."""
    captured_kwargs: dict[str, object] = {}

    class FakeWebSocket:
        async def close(self) -> None:
            return None

    async def fake_connect(*args, **kwargs):
        # Record the keyword arguments the client passes to websockets.connect.
        captured_kwargs.update(kwargs)
        return FakeWebSocket()

    async def fake_wait_event(self, event_name: str, *, timeout: float):
        return {"payload": {"nonce": ""}}

    async def fake_rpc(self, method: str, params=None, **kwargs):
        return {"payload": {"type": "hello-ok", "protocol": 3}}

    async def fake_listener(self):
        await asyncio.sleep(60)

    monkeypatch.setattr("clawbench.client.websockets.connect", fake_connect)
    client_stubs = (
        ("_wait_event", fake_wait_event),
        ("_rpc", fake_rpc),
        ("_listener", fake_listener),
    )
    for attribute, stub in client_stubs:
        monkeypatch.setattr(GatewayClient, attribute, stub)

    gateway = GatewayClient(GatewayConfig(connect_timeout=2))
    await gateway.connect()
    await gateway.close()

    # Index directly (no .get) so a missing key fails instead of passing as None.
    assert captured_kwargs["ping_interval"] is None
    assert captured_kwargs["ping_timeout"] is None
|
||||
|
||||
|
||||
def test_tool_results_are_correlated_back_to_tool_calls():
|
||||
tool_message = _parse_single_message(
|
||||
{
|
||||
@ -106,7 +142,7 @@ async def test_gateway_client_retries_transient_drain_errors(monkeypatch: pytest
|
||||
async def fake_wait_event(self, event_name: str, *, timeout: float):
|
||||
return {"payload": {"nonce": ""}}
|
||||
|
||||
async def fake_rpc(self, method: str, params=None):
|
||||
async def fake_rpc(self, method: str, params=None, **kwargs):
|
||||
return {"payload": {"type": "hello-ok", "protocol": 3}}
|
||||
|
||||
async def fake_listener(self):
|
||||
@ -143,7 +179,7 @@ async def test_gateway_client_retries_half_closed_handshake_errors(
|
||||
async def fake_wait_event(self, event_name: str, *, timeout: float):
|
||||
return {"payload": {"nonce": ""}}
|
||||
|
||||
async def fake_rpc(self, method: str, params=None):
|
||||
async def fake_rpc(self, method: str, params=None, **kwargs):
|
||||
return {"payload": {"type": "hello-ok", "protocol": 3}}
|
||||
|
||||
async def fake_listener(self):
|
||||
@ -192,3 +228,71 @@ async def test_send_and_wait_collects_messages_that_arrive_after_final_state():
|
||||
transcript = await client.send_and_wait(session_key, "hello", timeout=1.0)
|
||||
|
||||
assert [message.text for message in transcript.assistant_messages] == ["Late but valid."]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_send_and_wait_passes_gateway_timeout_and_waits_for_run():
    """``send_and_wait`` forwards the timeout in ms and waits on the returned run id."""
    client = GatewayClient(GatewayConfig(request_timeout=1))
    session_key = "session-1"
    recorded: list[tuple[str, dict | None, dict]] = []

    async def fake_rpc(method: str, params=None, **kwargs):
        recorded.append((method, params, kwargs))
        payload: dict = {}
        if method == "sessions.send":
            payload = {"runId": "run-1"}
        elif method == "agent.wait":
            payload = {"runId": "run-1", "status": "completed"}
        elif method == "sessions.get":
            payload = {
                "messages": [
                    {
                        "role": "assistant",
                        "content": [{"type": "text", "text": "Done."}],
                    }
                ]
            }
        return {"ok": True, "payload": payload}

    client._rpc = fake_rpc  # type: ignore[method-assign]

    transcript = await client.send_and_wait(session_key, "hello", timeout=1.5)

    _, send_params, _ = next(entry for entry in recorded if entry[0] == "sessions.send")
    # The idempotency key is generated by the client, so compare it to itself.
    assert send_params == {
        "key": session_key,
        "message": "hello",
        "idempotencyKey": send_params["idempotencyKey"],
        "timeoutMs": 1500,
    }
    _, wait_params, wait_kwargs = next(
        entry for entry in recorded if entry[0] == "agent.wait"
    )
    assert wait_params == {"runId": "run-1", "timeoutMs": 1500}
    assert wait_kwargs["timeout"] == 11.5
    replies = [message.text for message in transcript.assistant_messages]
    assert replies == ["Done."]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_send_and_wait_aborts_run_when_no_terminal_state_arrives():
    """If ``agent.wait`` never finishes in time, the client issues ``sessions.abort``."""
    client = GatewayClient(GatewayConfig(request_timeout=1))
    session_key = "session-1"
    recorded: list[tuple[str, dict | None, dict]] = []

    async def fake_rpc(method: str, params=None, **kwargs):
        recorded.append((method, params, kwargs))
        if method == "agent.wait":
            # Sleep far longer than the 0.01s timeout used by the test call.
            await asyncio.sleep(60)
        canned_payloads = {
            "sessions.send": {"runId": "run-timeout"},
            "sessions.abort": {"status": "aborted"},
            "sessions.get": {"messages": []},
        }
        return {"ok": True, "payload": canned_payloads.get(method, {})}

    client._rpc = fake_rpc  # type: ignore[method-assign]

    await client.send_and_wait(session_key, "hello", timeout=0.01)

    expected_abort = (
        "sessions.abort",
        {"key": session_key, "runId": "run-timeout"},
        {"timeout": 1},
    )
    assert expected_abort in recorded
|
||||
|
||||
@ -3,8 +3,24 @@ from pathlib import Path
|
||||
import pytest
|
||||
|
||||
from clawbench.client import GatewayConfig
|
||||
from clawbench.adapters.base import AdapterContext, AgentAdapter, PhaseResult, StateQueryResult
|
||||
from clawbench.canonical import AdapterCapability, CanonicalPhase, StateQuery
|
||||
from clawbench.harness import BenchmarkHarness
|
||||
from clawbench.schemas import CompletionResult, JudgeResult, TaskRunResult
|
||||
from clawbench.schemas import (
|
||||
CompletionResult,
|
||||
CompletionSpec,
|
||||
FileState,
|
||||
JudgeExpectations,
|
||||
JudgeResult,
|
||||
SimulatedUser,
|
||||
TaskDefinition,
|
||||
TaskFamily,
|
||||
TaskRunResult,
|
||||
Tier,
|
||||
Transcript,
|
||||
TranscriptMessage,
|
||||
UserTurn,
|
||||
)
|
||||
from clawbench.tasks import load_all_tasks
|
||||
|
||||
|
||||
@ -118,7 +134,13 @@ def test_aggregate_reports_advisory_judge_metrics():
|
||||
|
||||
|
||||
def test_compose_result_from_task_stats_supports_parallel_environment_metadata():
|
||||
task = next(task for task in load_all_tasks() if task.id == "t1-bugfix-discount")
|
||||
task = next(task for task in load_all_tasks() if task.id == "t1-bugfix-discount").model_copy(deep=True)
|
||||
task.category = "software_engineering"
|
||||
task.domain = "devtools"
|
||||
task.functionality = ["bugfix", "regression_repair", "test_verification"]
|
||||
task.trace_distribution = ["read_heavy", "edit_heavy", "execute_heavy", "recovery_heavy"]
|
||||
task.tool_surface = ["filesystem", "shell"]
|
||||
task.risk_tags = ["code_change"]
|
||||
harness = BenchmarkHarness(
|
||||
gateway_config=GatewayConfig(),
|
||||
model="test-model",
|
||||
@ -163,6 +185,29 @@ def test_compose_result_from_task_stats_supports_parallel_environment_metadata()
|
||||
assert merged_result.environment["parallel_lanes"] == 2
|
||||
assert merged_result.environment["requested_parallel_lanes"] == 3
|
||||
assert merged_result.environment["browser_tasks_serialized"] is False
|
||||
assert merged_result.environment["dimension_coverage"] == {
|
||||
"category": 1,
|
||||
"domain": 1,
|
||||
"functionality": 3,
|
||||
"trace_distribution": 4,
|
||||
"tool_surface": 2,
|
||||
"risk_tag": 1,
|
||||
}
|
||||
assert merged_result.task_results[0].category == "software_engineering"
|
||||
assert merged_result.task_results[0].domain == "devtools"
|
||||
|
||||
category = {item.value: item for item in merged_result.category_results}
|
||||
assert category["software_engineering"].task_ids == [task.id]
|
||||
assert category["software_engineering"].weighted_score == pytest.approx(
|
||||
base_result.overall_weighted_query_score
|
||||
)
|
||||
|
||||
functionality_values = {item.value for item in merged_result.functionality_results}
|
||||
assert {"bugfix", "regression_repair", "test_verification"}.issubset(functionality_values)
|
||||
trace_values = {item.value for item in merged_result.trace_distribution_results}
|
||||
assert {"read_heavy", "edit_heavy", "execute_heavy", "recovery_heavy"}.issubset(trace_values)
|
||||
assert "category" in merged_result.dimension_results
|
||||
assert merged_result.dimension_results["category"] == merged_result.category_results
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@ -206,7 +251,7 @@ async def test_run_rejects_registered_but_unwired_adapter(monkeypatch):
|
||||
harness = BenchmarkHarness(
|
||||
gateway_config=GatewayConfig(),
|
||||
model="test-model",
|
||||
adapter="hermes",
|
||||
adapter="codex",
|
||||
runs_per_task=1,
|
||||
randomize_order=False,
|
||||
print_report=False,
|
||||
@ -215,3 +260,182 @@ async def test_run_rejects_registered_but_unwired_adapter(monkeypatch):
|
||||
|
||||
with pytest.raises(ValueError, match="not yet wired"):
|
||||
await harness.run()
|
||||
|
||||
|
||||
def _files_only_definition(judge: JudgeExpectations | None = None) -> TaskDefinition:
    """Build a tier-1 coding task that passes once ``answer.txt`` exists and contains "done".

    Args:
        judge: Optional judge expectations to attach to the task.
    """
    simulated_user = SimulatedUser(
        max_turns=1,
        turns=[UserTurn(message="Create answer.txt")],
    )
    answer_check = FileState(path="answer.txt", exists=True, content_contains=["done"])
    return TaskDefinition(
        id="adapter-files-only",
        name="Adapter files only",
        tier=Tier.TIER1,
        family=TaskFamily.CODING,
        surface="coding",
        user=simulated_user,
        completion=CompletionSpec(files=[answer_check]),
        judge=judge,
    )
|
||||
|
||||
|
||||
class FakeAgentAdapter(AgentAdapter):
    """In-process adapter stand-in that writes ``answer.txt`` and reports success.

    It carries the name "hermes" and declares only FILES and EXECUTION.
    """

    name = "hermes"
    capabilities = {AdapterCapability.FILES, AdapterCapability.EXECUTION}

    async def setup(self, ctx: AdapterContext) -> None:
        """Do nothing; no preparation is needed."""
        return None

    async def run_phase(self, phase: CanonicalPhase, ctx: AdapterContext) -> PhaseResult:
        """Write the expected artifact and log one assistant message to the transcript."""
        answer_path = ctx.workspace / "answer.txt"
        answer_path.write_text("done\n", encoding="utf-8")
        reply = TranscriptMessage(role="assistant", text="Created answer.txt and verified it.")
        ctx.transcript.messages.append(reply)
        return PhaseResult(messages=[reply], completed_normally=True)

    async def verify_state_query(self, query: StateQuery, ctx: AdapterContext) -> StateQueryResult:
        """Report every state query as an unsupported capability."""
        return StateQueryResult(ok=False, capability_missing=True)

    async def teardown(self, ctx: AdapterContext) -> None:
        """Do nothing; there is nothing to clean up."""
        return None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_hermes_adapter_runs_through_scoring_harness(monkeypatch, tmp_path: Path):
    """A "hermes" run with the fake adapter scores the files-only task as a pass."""
    task = _files_only_definition()
    # Swap in the single synthetic task and the fake adapter; point state at tmp_path.
    monkeypatch.setattr("clawbench.harness.load_all_tasks", lambda **_: [task])
    monkeypatch.setattr("clawbench.harness.get_adapter", lambda name: FakeAgentAdapter)
    monkeypatch.setenv("OPENCLAW_STATE_DIR", str(tmp_path))
    monkeypatch.setenv("CLAWBENCH_RUN_CACHE_DIR", "")

    harness = BenchmarkHarness(
        gateway_config=GatewayConfig(),
        model="openai/gpt-5.5",
        adapter="hermes",
        runs_per_task=1,
        randomize_order=False,
        print_report=False,
        quiet=True,
    )

    benchmark_result = await harness.run()
    first_run = harness.last_task_runs[task.id][0]

    environment = benchmark_result.environment
    assert environment["adapter"] == "hermes"
    assert environment["executable_adapters"] == ["hermes", "openclaw"]
    assert first_run.error is None
    assert first_run.completion_result.score == 1.0
    assert first_run.delivery_outcome.value == "pass"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_openclaw_uses_shared_adapter_scoring_path(monkeypatch, tmp_path: Path):
    """An "openclaw" run resolved to the fake adapter also scores the task as a pass."""
    task = _files_only_definition()
    # get_adapter is stubbed to return the fake adapter for whatever name is requested.
    monkeypatch.setattr("clawbench.harness.load_all_tasks", lambda **_: [task])
    monkeypatch.setattr("clawbench.harness.get_adapter", lambda name: FakeAgentAdapter)
    monkeypatch.setenv("OPENCLAW_STATE_DIR", str(tmp_path))
    monkeypatch.setenv("CLAWBENCH_RUN_CACHE_DIR", "")

    harness = BenchmarkHarness(
        gateway_config=GatewayConfig(),
        model="openai/gpt-5.5",
        adapter="openclaw",
        runs_per_task=1,
        randomize_order=False,
        print_report=False,
        quiet=True,
    )

    benchmark_result = await harness.run()
    first_run = harness.last_task_runs[task.id][0]

    assert benchmark_result.environment["adapter"] == "openclaw"
    assert first_run.error is None
    assert first_run.completion_result.score == 1.0
    assert first_run.delivery_outcome.value == "pass"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_adapter_scoring_uses_advisory_judge(monkeypatch, tmp_path: Path):
    """With a judge model set, the adapter run records the judge's score.

    The stubbed judge gateway replies with a JSON verdict scoring 0.5; the
    test checks that score on the run and in the overall result, and that
    the run score comes out at 0.95.
    """
    judge_expectations = JudgeExpectations(
        rubric="Reward the answer when it is concise.",
        artifact_paths=["answer.txt"],
        passing_threshold=0.4,
    )
    task = _files_only_definition(judge_expectations)
    monkeypatch.setattr("clawbench.harness.load_all_tasks", lambda **_: [task])
    monkeypatch.setattr("clawbench.harness.get_adapter", lambda name: FakeAgentAdapter)
    monkeypatch.setenv("OPENCLAW_STATE_DIR", str(tmp_path))
    monkeypatch.setenv("CLAWBENCH_RUN_CACHE_DIR", "")

    class FakeJudgeGateway:
        # Async-context-manager stand-in for the gateway client used by the judge.

        async def __aenter__(self):
            return self

        async def __aexit__(self, *exc):
            return None

        async def create_session(self, *, model: str, label: str) -> str:
            assert model == "judge-model"
            assert label.startswith("clawbench-judge-")
            return "judge-session"

        async def subscribe(self, session_key: str) -> None:
            assert session_key == "judge-session"

        async def send_and_wait(self, session_key: str, message: str):
            assert session_key == "judge-session"
            # The judge prompt must include the artifact text written by the adapter.
            assert "done" in message
            verdict = TranscriptMessage(
                role="assistant",
                text='{"score": 0.5, "confidence": 0.8, "reason": "OK", "rubric_hits": [], "rubric_misses": []}',
            )
            return Transcript(messages=[verdict])

        async def delete_session(self, session_key: str) -> None:
            assert session_key == "judge-session"

    monkeypatch.setattr("clawbench.harness.GatewayClient", lambda config: FakeJudgeGateway())

    harness = BenchmarkHarness(
        gateway_config=GatewayConfig(),
        model="openai/gpt-5.5",
        adapter="hermes",
        judge_model="judge-model",
        runs_per_task=1,
        randomize_order=False,
        print_report=False,
        quiet=True,
    )

    benchmark_result = await harness.run()
    first_run = harness.last_task_runs[task.id][0]

    assert first_run.judge_result.enabled is True
    assert first_run.judge_result.score == pytest.approx(0.5)
    assert first_run.run_score == pytest.approx(0.95)
    assert benchmark_result.overall_judge_score == pytest.approx(0.5)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_hermes_adapter_filters_incompatible_tasks(monkeypatch):
    """Running only ``t4-memory-recall-continuation`` on the fake adapter raises ValueError."""
    wanted_id = "t4-memory-recall-continuation"
    # Resolve the real corpus task before load_all_tasks is patched below.
    task = next(candidate for candidate in load_all_tasks() if candidate.id == wanted_id)
    monkeypatch.setattr("clawbench.harness.load_all_tasks", lambda **_: [task])
    monkeypatch.setattr("clawbench.harness.get_adapter", lambda name: FakeAgentAdapter)

    harness = BenchmarkHarness(
        gateway_config=GatewayConfig(),
        model="openai/gpt-5.5",
        adapter="hermes",
        runs_per_task=1,
        randomize_order=False,
        print_report=False,
        quiet=True,
    )

    with pytest.raises(ValueError, match="No selected tasks are compatible"):
        await harness.run()
|
||||
|
||||
463
tests/test_hermes_adapter.py
Normal file
463
tests/test_hermes_adapter.py
Normal file
@ -0,0 +1,463 @@
|
||||
"""Tests for `HermesAdapter` against a stub `MiniSWERunner`.
|
||||
|
||||
We don't pull in the real `hermes-agent` package — the adapter is
|
||||
driven through its `runner_factory` hook, which lets tests plug in a
|
||||
fixed conversation without any network / subprocess activity.
|
||||
|
||||
What's covered:
|
||||
- The adapter registers under the `"hermes"` name.
|
||||
- `capabilities` is the minimal `{FILES, EXECUTION}` set.
|
||||
- `setup` realises memory seed entries as workspace files.
|
||||
- `run_phase` renders the user turn, calls the stub runner, and
|
||||
appends the parsed conversation into the shared transcript.
|
||||
- `verify_state_query` falls back to workspace memory scanning for
|
||||
memory queries, and returns `capability_missing=True` for other
|
||||
kinds.
|
||||
- Task gating: a task that requires MEMORY / SESSION / CRON is NOT
|
||||
supported by HermesAdapter; a files-only task is.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
|
||||
from clawbench.adapters import get_adapter
|
||||
from clawbench.adapters.base import AdapterContext, StateQueryResult
|
||||
from clawbench.adapters.hermes import HermesAdapter, HermesAdapterConfig
|
||||
from clawbench.canonical import (
|
||||
AdapterCapability,
|
||||
CanonicalTask,
|
||||
StateQuery,
|
||||
)
|
||||
from clawbench.canonical.convert import from_task_definition
|
||||
from clawbench.schemas import (
|
||||
CompletionSpec,
|
||||
ExecutionCheck,
|
||||
FileState,
|
||||
MemoryState,
|
||||
SimulatedUser,
|
||||
TaskDefinition,
|
||||
TaskFamily,
|
||||
TaskSetup,
|
||||
Tier,
|
||||
Transcript,
|
||||
UserTurn,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Stub MiniSWERunner
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class _StubRunner:
    """Pretends to be `MiniSWERunner`; returns a canned conversation.

    The stub records how it was driven so tests can assert on it:
    `model` / `cwd` are the constructor arguments, `last_prompt` is the
    most recent prompt handed to `run_task`, and `calls` counts the
    `run_task` invocations.  `conversation` is the canned template that
    every `run_task` result is derived from.
    """

    def __init__(self, *, model: str, cwd: str, **_: object) -> None:
        """Store `model` and `cwd`; any other keyword arguments are ignored."""
        self.model = model
        self.cwd = cwd
        self.last_prompt: str | None = None
        self.calls = 0
        self.conversation = {
            "conversations": [
                {"from": "user", "value": "placeholder — filled per-test"},
                {
                    "from": "assistant",
                    "value": (
                        "Running `ls`.\n"
                        '<tool_call>{"name":"bash","arguments":{"cmd":"ls"}}</tool_call>'
                    ),
                },
                {
                    "from": "tool",
                    "value": '<tool_response>{"stdout":"main.py"}</tool_response>',
                },
            ],
            "completed": True,
            "api_calls": 3,
            "metadata": {"model": "stub", "env_type": "local"},
        }

    def run_task(self, prompt: str) -> dict:
        """Return the canned conversation with user turns replaced by `prompt`.

        The result never shares mutable entries with `self.conversation`,
        so a consumer that edits the returned conversation in place cannot
        alter what later calls (or later assertions on the template) see.
        """
        self.last_prompt = prompt
        self.calls += 1
        # Swap the placeholder user value with the real prompt so the
        # conversation reflects what the adapter actually sent.  Non-user
        # entries are copied rather than aliased to keep the template intact.
        turns = [
            {"from": "user", "value": prompt}
            if entry.get("from") == "user"
            else dict(entry)
            for entry in self.conversation["conversations"]
        ]
        convo = {**self.conversation, "conversations": turns}
        metadata = convo.get("metadata")
        if isinstance(metadata, dict):
            convo["metadata"] = dict(metadata)
        return convo
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _files_only_task(memory_seed: bool = False) -> CanonicalTask:
    """Build a single-turn coding task checked only by file and execution state.

    When `memory_seed` is true the task setup carries one seed entry with
    key `"stack"`; otherwise the setup is left at its defaults.
    """
    if memory_seed:
        setup = TaskSetup(memory_seed=[{"key": "stack", "value": "React, Node"}])
    else:
        setup = TaskSetup()

    user = SimulatedUser(
        max_turns=1,
        turns=[UserTurn(message="List the workspace files.")],
    )
    completion = CompletionSpec(
        files=[FileState(path="main.py", exists=True)],
        execution_checks=[ExecutionCheck(name="noop", command="true")],
    )
    definition = TaskDefinition(
        id="hermes-files-only",
        name="Hermes files-only",
        tier=Tier.TIER1,
        family=TaskFamily.CODING,
        surface="coding",
        setup=setup,
        user=user,
        completion=completion,
    )
    return from_task_definition(definition)
|
||||
|
||||
|
||||
def _memory_task() -> CanonicalTask:
    """Build a single-turn task whose completion spec includes a memory check."""
    stack_memory = MemoryState(
        key_pattern="stack",
        exists=True,
        value_contains=["React"],
    )
    single_turn_user = SimulatedUser(
        max_turns=1,
        turns=[UserTurn(message="remember stack=X")],
    )
    definition = TaskDefinition(
        id="hermes-memory",
        name="Hermes memory",
        tier=Tier.TIER2,
        family=TaskFamily.MULTI_TOOL,
        surface="tools",
        setup=TaskSetup(),
        user=single_turn_user,
        completion=CompletionSpec(memory=[stack_memory]),
    )
    return from_task_definition(definition)
|
||||
|
||||
|
||||
def _make_adapter() -> tuple[HermesAdapter, list[_StubRunner]]:
    """Return a `HermesAdapter` wired to stub runners, plus the list of runners built.

    Every runner the adapter requests through `runner_factory` is appended
    to the returned list so tests can inspect it afterwards.
    """
    created: list[_StubRunner] = []

    def _factory(**kwargs):
        created.append(_StubRunner(**kwargs))
        return created[-1]

    config = HermesAdapterConfig(model="stub-model", runner_factory=_factory)
    return HermesAdapter(config), created
|
||||
|
||||
|
||||
def _make_ctx(task: CanonicalTask, workspace: Path) -> AdapterContext:
    """Create an `AdapterContext` for `task` rooted at `workspace`.

    The context starts with no runtime values, run index 0, the
    `"stub-model"` model name and a fresh empty `Transcript`.
    """
    context_fields = {
        "task": task,
        "workspace": workspace,
        "runtime_values": {},
        "run_index": 0,
        "model": "stub-model",
        "transcript": Transcript(),
    }
    return AdapterContext(**context_fields)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Registration + capability shape
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_hermes_adapter_is_registered() -> None:
    """The adapter registry resolves the name `"hermes"` to `HermesAdapter`."""
    assert get_adapter("hermes") is HermesAdapter
|
||||
|
||||
|
||||
def test_hermes_capabilities_are_files_and_execution_only() -> None:
    """The class-level capability set is exactly FILES plus EXECUTION."""
    expected = {AdapterCapability.FILES, AdapterCapability.EXECUTION}
    assert HermesAdapter.capabilities == expected
|
||||
|
||||
|
||||
def test_hermes_supports_files_only_task() -> None:
    """A task with only file and execution checks passes `HermesAdapter.supports`."""
    assert HermesAdapter.supports(_files_only_task())
|
||||
|
||||
|
||||
def test_hermes_does_not_support_memory_task() -> None:
    """Without a config, a memory task is unsupported and MEMORY is reported missing."""
    memory_task = _memory_task()
    is_supported = HermesAdapter.supports(memory_task)
    assert not is_supported
    assert AdapterCapability.MEMORY in HermesAdapter.missing_capabilities_for(memory_task)
|
||||
|
||||
|
||||
def test_hermes_full_agent_capabilities_cover_memory_and_dynamic_tasks() -> None:
    """With `driver_mode="ai_agent"` the memory task is supported and the wider capabilities appear."""
    memory_task = _memory_task()
    agent_config = HermesAdapterConfig(model="stub-model", driver_mode="ai_agent")
    assert HermesAdapter.supports(memory_task, agent_config)

    caps = HermesAdapter.supported_capabilities(agent_config)
    required = (
        AdapterCapability.MEMORY,
        AdapterCapability.CRON,
        AdapterCapability.BROWSER,
        AdapterCapability.MULTI_TURN_INJECTION,
    )
    for capability in required:
        assert capability in caps
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Lifecycle
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_setup_realizes_memory_seed_as_workspace_files(tmp_path: Path) -> None:
    """After `setup`, the `"stack"` seed entry exists as `memory/stack.md` in the workspace."""
    seeded_task = _files_only_task(memory_seed=True)
    adapter, _unused_runners = _make_adapter()

    async def _setup_only() -> None:
        async with adapter:
            await adapter.setup(_make_ctx(seeded_task, tmp_path))

    asyncio.run(_setup_only())

    seed_file = tmp_path.joinpath("memory", "stack.md")
    assert seed_file.is_file()
    contents = seed_file.read_text(encoding="utf-8")
    assert "React" in contents
|
||||
|
||||
|
||||
def test_run_phase_sends_rendered_prompt_and_parses_conversation(tmp_path: Path) -> None:
    """Run the first phase against the stub runner and check prompt, transcript and result."""
    task = _files_only_task()
    adapter, runners = _make_adapter()

    async def _setup_and_run():
        async with adapter:
            context = _make_ctx(task, tmp_path)
            await adapter.setup(context)
            phase_result = await adapter.run_phase(task.phases[0], context)
            return context, phase_result

    ctx, result = asyncio.run(_setup_and_run())

    # The stub runner saw the rendered user message.
    assert runners
    first_runner = runners[0]
    assert first_runner.last_prompt == "List the workspace files."

    # Conversation parsed into the shared transcript.
    assert result.error is None
    parsed_calls = ctx.transcript.tool_call_sequence
    assert parsed_calls, "expected tool calls parsed out of Hermes conversation"
    first_call = parsed_calls[0]
    assert first_call.name == "bash"
    assert first_call.input == {"cmd": "ls"}
    assert "main.py" in first_call.output
    assert result.adapter_metadata.get("api_calls") == 3
    assert result.completed_normally is True
|
||||
|
||||
|
||||
def test_runner_factory_uses_explicit_provider_instead_of_api_key(tmp_path: Path) -> None:
    """With `provider` set, the runner factory receives `base_url=None` and `api_key=None`.

    The config still carries a base URL and an API key; the assertions pin
    that neither reaches the factory.
    """
    task = _files_only_task()
    seen_kwargs: list[dict] = []

    def _recording_factory(**kwargs):
        seen_kwargs.append(kwargs)
        return _StubRunner(model=kwargs["model"], cwd=kwargs["cwd"])

    config = HermesAdapterConfig(
        model="stub-model",
        provider="openai-codex",
        base_url="https://example.invalid/v1",
        api_key="secret",
        runner_factory=_recording_factory,
    )
    adapter = HermesAdapter(config)

    async def _setup_only() -> None:
        async with adapter:
            await adapter.setup(_make_ctx(task, tmp_path))

    asyncio.run(_setup_only())

    assert seen_kwargs
    first_call = seen_kwargs[0]
    assert first_call["base_url"] is None
    assert first_call["api_key"] is None
|
||||
|
||||
|
||||
def test_direct_openai_endpoint_strips_provider_prefix_for_hermes(tmp_path: Path) -> None:
    """Model `"openai/gpt-5.4"` against the OpenAI base URL reaches the runner as `"gpt-5.4"`."""
    task = _files_only_task()
    seen_kwargs: list[dict] = []

    def _recording_factory(**kwargs):
        seen_kwargs.append(kwargs)
        return _StubRunner(model=kwargs["model"], cwd=kwargs["cwd"])

    config = HermesAdapterConfig(
        model="openai/gpt-5.4",
        base_url="https://api.openai.com/v1",
        api_key="secret",
        runner_factory=_recording_factory,
    )
    adapter = HermesAdapter(config)

    async def _setup_and_check() -> None:
        async with adapter:
            context = AdapterContext(
                task=task,
                workspace=tmp_path,
                runtime_values={},
                run_index=0,
                model="openai/gpt-5.4",
                transcript=Transcript(),
            )
            await adapter.setup(context)
            # The stripped name is also recorded on the context by `setup`.
            assert context.adapter_state["effective_model"] == "gpt-5.4"

    asyncio.run(_setup_and_check())

    assert seen_kwargs
    assert seen_kwargs[0]["model"] == "gpt-5.4"
|
||||
|
||||
|
||||
def test_ai_agent_direct_endpoint_reports_custom_provider(tmp_path: Path) -> None:
    """In `ai_agent` mode the agent factory gets the endpoint, the key and provider `"custom"`."""
    task = _files_only_task()
    seen_kwargs: list[dict] = []

    class _StubAgent:
        pass

    def _recording_factory(**kwargs):
        seen_kwargs.append(kwargs)
        return _StubAgent()

    config = HermesAdapterConfig(
        model="openai/gpt-5.4",
        base_url="https://api.openai.com/v1",
        api_key="secret",
        driver_mode="ai_agent",
        agent_factory=_recording_factory,
    )
    adapter = HermesAdapter(config)

    async def _setup_and_check() -> None:
        async with adapter:
            context = AdapterContext(
                task=task,
                workspace=tmp_path,
                runtime_values={},
                run_index=0,
                model="openai/gpt-5.4",
                transcript=Transcript(),
            )
            await adapter.setup(context)
            assert context.adapter_state["effective_model"] == "gpt-5.4"

    asyncio.run(_setup_and_check())

    assert seen_kwargs
    first_call = seen_kwargs[0]
    expected_fields = {
        "model": "gpt-5.4",
        "base_url": "https://api.openai.com/v1",
        "api_key": "secret",
        "provider": "custom",
    }
    for field, value in expected_fields.items():
        assert first_call[field] == value
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# State queries
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_memory_query_uses_workspace_fallback(tmp_path: Path) -> None:
    """A memory query succeeds when the workspace's `MEMORY.md` contains the expected value."""
    task = _memory_task()
    adapter, _unused_runners = _make_adapter()

    # Simulate a prior run that wrote a MEMORY.md into the workspace.
    memory_file = tmp_path / "MEMORY.md"
    memory_file.write_text("stack: React, Node, Postgres", encoding="utf-8")

    memory_query = StateQuery(
        kind="memory",
        predicate="exists",
        selector={"key_pattern": "stack"},
        expected={"value_contains": ["React"]},
        required_capability=AdapterCapability.MEMORY,
    )

    async def _setup_and_verify() -> StateQueryResult:
        async with adapter:
            context = _make_ctx(task, tmp_path)
            await adapter.setup(context)
            return await adapter.verify_state_query(memory_query, context)

    outcome = asyncio.run(_setup_and_verify())
    assert outcome.ok is True
    assert outcome.capability_missing is False
|
||||
|
||||
|
||||
def test_session_query_is_reported_as_capability_missing(tmp_path: Path) -> None:
    """A session query comes back with `capability_missing=True` and `ok=False`."""
    task = _memory_task()
    adapter, _unused_runners = _make_adapter()

    session_query = StateQuery(
        kind="session",
        predicate="exists",
        selector={},
        expected={},
        required_capability=AdapterCapability.SESSION,
    )

    async def _setup_and_verify() -> StateQueryResult:
        async with adapter:
            context = _make_ctx(task, tmp_path)
            await adapter.setup(context)
            return await adapter.verify_state_query(session_query, context)

    outcome = asyncio.run(_setup_and_verify())
    assert outcome.capability_missing is True
    assert outcome.ok is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Timeouts
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_run_phase_surfaces_runner_timeout(tmp_path: Path) -> None:
    """A runner that sleeps past the phase timeout yields an error result.

    The runner blocks for 5 seconds while the phase timeout is forced to
    1 second; the result must carry an error mentioning "exceeded" and
    must not be flagged as completed normally.
    """
    import time

    base_task = _files_only_task()

    class _SlowRunner:
        def __init__(self, **_: object) -> None:
            pass

        def run_task(self, prompt: str) -> dict:
            time.sleep(5)  # will exceed the test's configured timeout
            return {"conversations": [], "completed": False, "api_calls": 0}

    def _slow_factory(**kw):
        return _SlowRunner(**kw)

    adapter = HermesAdapter(
        HermesAdapterConfig(model="stub-model", runner_factory=_slow_factory)
    )

    # Force a short phase timeout so the test stays fast.
    short_phase = base_task.phases[0].model_copy(update={"timeout_seconds": 1})
    short_task = base_task.model_copy(update={"phases": [short_phase]})

    async def _run_first_phase():
        async with adapter:
            context = _make_ctx(short_task, tmp_path)
            await adapter.setup(context)
            return await adapter.run_phase(short_task.phases[0], context)

    outcome = asyncio.run(_run_first_phase())
    assert outcome.error is not None
    assert "exceeded" in outcome.error
    assert outcome.completed_normally is False
|
||||
193
tests/test_hermes_xml.py
Normal file
193
tests/test_hermes_xml.py
Normal file
@ -0,0 +1,193 @@
|
||||
"""Tests for the `clawbench.adapters.hermes_xml` parsers (`parse_conversation` and friends).
|
||||
|
||||
Covers the Hermes conversation shapes we expect from the wild:
|
||||
|
||||
- Plain assistant turn with a single tool call + a following tool_response.
|
||||
- Multiple tool calls in one assistant turn.
|
||||
- Assistant turn with free-form text + a tool call.
|
||||
- A malformed tool_call payload — parser must recover gracefully
|
||||
(no raise; surface a best-effort call).
|
||||
- Name-variant keys (`function`, `parameters`) Hermes-variant models emit.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from clawbench.adapters.hermes_xml import (
|
||||
iter_tool_calls_from_conversations,
|
||||
parse_chat_messages,
|
||||
parse_conversation,
|
||||
)
|
||||
from clawbench.trajectory import annotate_transcript_tool_calls
|
||||
|
||||
|
||||
def _conv(*entries: dict[str, str]) -> dict:
    """Wrap `entries` in the conversation envelope used throughout these tests.

    The envelope always reports `completed=True` and `api_calls=1`.
    """
    envelope: dict = {"conversations": [*entries]}
    envelope["completed"] = True
    envelope["api_calls"] = 1
    return envelope
|
||||
|
||||
|
||||
def test_single_tool_call_with_response() -> None:
    """One tool call plus its response parses into a single successful, paired call."""
    assistant_turn = {
        "from": "assistant",
        "value": "I'll run `ls`.\n"
        '<tool_call>{"name":"bash","arguments":{"cmd":"ls"}}</tool_call>',
    }
    tool_turn = {
        "from": "tool",
        "value": '<tool_response>{"stdout":"main.py\\nREADME"}</tool_response>',
    }
    transcript = parse_conversation(
        _conv(
            {"from": "system", "value": "You are a helpful coding agent."},
            {"from": "user", "value": "List files."},
            assistant_turn,
            tool_turn,
        )
    )

    recorded = transcript.tool_call_sequence
    assert len(recorded) == 1
    only_call = recorded[0]
    assert only_call.name == "bash"
    assert only_call.input == {"cmd": "ls"}
    assert "main.py" in only_call.output
    assert only_call.success is True

    # Assistant text preserved, tool-call body stripped out.
    assistant_messages = (m for m in transcript.messages if m.role == "assistant")
    first_assistant = next(assistant_messages)
    assert "I'll run `ls`." in first_assistant.text
    assert "<tool_call>" not in first_assistant.text
|
||||
|
||||
|
||||
def test_multiple_tool_calls_in_one_turn() -> None:
    """Two calls in one assistant turn pair, in order, with the two tool responses that follow."""
    assistant_turn = {
        "from": "assistant",
        "value": (
            '<tool_call>{"name":"bash","arguments":{"cmd":"ls"}}</tool_call>'
            '<tool_call>{"name":"bash","arguments":{"cmd":"pwd"}}</tool_call>'
        ),
    }
    first_response = {
        "from": "tool",
        "value": '<tool_response>{"stdout":"a"}</tool_response>',
    }
    second_response = {
        "from": "tool",
        "value": '<tool_response>{"stdout":"/tmp"}</tool_response>',
    }
    convo = _conv(assistant_turn, first_response, second_response)

    calls = iter_tool_calls_from_conversations(convo["conversations"])
    assert len(calls) == 2
    ls_call, pwd_call = calls[0], calls[1]
    assert ls_call.input == {"cmd": "ls"}
    assert pwd_call.input == {"cmd": "pwd"}
    assert ls_call.output == "a"
    assert pwd_call.output == "/tmp"
|
||||
|
||||
|
||||
def test_malformed_json_falls_back_to_best_effort() -> None:
    """A malformed first payload must not raise, and the clean second call is still captured."""
    broken_then_clean = (
        '<tool_call>{"name":"bash","arguments":{"cmd":"ls"} <-- stray text }</tool_call>'
        '<tool_call>{"name":"bash","arguments":{"cmd":"pwd"}}</tool_call>'
    )
    convo = _conv({"from": "assistant", "value": broken_then_clean})

    calls = iter_tool_calls_from_conversations(convo["conversations"])
    # First is malformed; parser recovers one or two calls without
    # raising, and the clean second call is always captured.
    assert len(calls) >= 1
    recovered_inputs = [call.input for call in calls]
    assert any(payload == {"cmd": "pwd"} for payload in recovered_inputs)
|
||||
|
||||
|
||||
def test_name_variants_are_accepted() -> None:
    """The `function` / `parameters` key spelling maps to a call's name and input."""
    variant_payload = '<tool_call>{"function":"bash","parameters":{"cmd":"ls"}}</tool_call>'
    convo = _conv({"from": "assistant", "value": variant_payload})

    calls = iter_tool_calls_from_conversations(convo["conversations"])
    assert len(calls) == 1
    only_call = calls[0]
    assert only_call.name == "bash"
    assert only_call.input == {"cmd": "ls"}
|
||||
|
||||
|
||||
def test_tool_error_marks_call_failed() -> None:
    """A tool response with `"status":"error"` marks the call failed and exposes the stderr text."""
    failing_call = {
        "from": "assistant",
        "value": '<tool_call>{"name":"bash","arguments":{"cmd":"nonsense"}}</tool_call>',
    }
    error_response = {
        "from": "tool",
        "value": '<tool_response>{"stderr":"command not found","status":"error"}</tool_response>',
    }
    convo = _conv(failing_call, error_response)

    calls = iter_tool_calls_from_conversations(convo["conversations"])
    assert len(calls) == 1
    only_call = calls[0]
    assert only_call.success is False
    assert "command not found" in only_call.error
|
||||
|
||||
|
||||
def test_orphan_tool_response_not_silently_dropped() -> None:
    """A tool response with no preceding call still appears as a tool-role message."""
    orphan = {
        "from": "tool",
        "value": '<tool_response>{"stdout":"nothing to pair with"}</tool_response>',
    }
    transcript = parse_conversation(_conv(orphan))

    # No calls, but one tool-role transcript message surfaces the output.
    assert transcript.tool_call_sequence == []
    tool_messages = []
    for message in transcript.messages:
        if message.role == "tool":
            tool_messages.append(message)
    assert tool_messages
    assert "nothing to pair" in tool_messages[0].tool_result_content
|
||||
|
||||
|
||||
def test_parser_output_annotates_with_canonical_families() -> None:
    """A parsed `str_replace_based_edit_tool` call is annotated with the `"edit"` family."""
    edit_call = (
        '<tool_call>{"name":"str_replace_based_edit_tool",'
        '"arguments":{"path":"main.py","old":"a","new":"b"}}</tool_call>'
    )
    transcript = parse_conversation(_conv({"from": "assistant", "value": edit_call}))

    # Running the existing trajectory classifier over the parsed
    # transcript should assign a canonical family tag to every call.
    annotated = annotate_transcript_tool_calls(transcript)
    families = [call.family for call in annotated.tool_call_sequence]
    assert all(f for f in families), f"expected every call to get a family tag, got {families}"
    assert families == ["edit"]
|
||||
|
||||
|
||||
def test_parse_chat_messages_pairs_tool_results() -> None:
    """Chat-format messages yield one call paired with the tool message sharing its id."""
    terminal_call = {
        "id": "call-1",
        "function": {
            "name": "terminal",
            "arguments": "{\"command\":\"ls\"}",
        },
    }
    chat_log = [
        {"role": "user", "content": "List files"},
        {
            "role": "assistant",
            "content": "I'll inspect.",
            "tool_calls": [terminal_call],
        },
        {"role": "tool", "tool_call_id": "call-1", "content": "main.py"},
        {"role": "assistant", "content": "Found main.py"},
    ]
    transcript = parse_chat_messages(chat_log)

    calls = transcript.tool_call_sequence
    assert len(calls) == 1
    only_call = calls[0]
    assert only_call.name == "terminal"
    assert only_call.input == {"command": "ls"}
    assert only_call.output == "main.py"
    last_assistant = transcript.assistant_messages[-1]
    assert last_assistant.text == "Found main.py"
|
||||
@ -4,6 +4,7 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
import clawbench.tasks as tasks_module
|
||||
from clawbench.client import GatewayConfig
|
||||
from clawbench.environment import verify_completion
|
||||
from clawbench.harness import BenchmarkHarness
|
||||
@ -12,14 +13,8 @@ from clawbench.services import build_runtime_values, start_background_services,
|
||||
from clawbench.tasks import load_all_tasks
|
||||
from clawbench.trajectory import evaluate_trajectory
|
||||
|
||||
# The task set is moving to a private holdout; the public repo will ship a
|
||||
# different task set soon. Until then, skip integration tests that need
|
||||
# specific task ids when the tasks directory isn't present.
|
||||
_TASKS_DIR = Path(__file__).resolve().parent.parent / "tasks"
|
||||
pytestmark = pytest.mark.skipif(
|
||||
not _TASKS_DIR.exists(),
|
||||
reason="tasks/ directory not present (private holdout — public set TBD)",
|
||||
)
|
||||
PUBLIC_TASKS_DIR = Path(__file__).resolve().parent.parent / "tasks-public"
|
||||
tasks_module.TASKS_DIR = PUBLIC_TASKS_DIR
|
||||
|
||||
|
||||
class DummyClient:
|
||||
@ -28,8 +23,13 @@ class DummyClient:
|
||||
|
||||
|
||||
def _prepare_workspace(task_id: str, tmp_path: Path) -> tuple[Path, object]:
|
||||
task = next(task for task in load_all_tasks() if task.id == task_id)
|
||||
harness = BenchmarkHarness(gateway_config=GatewayConfig(), model="test-model", randomize_order=False)
|
||||
task = next(task for task in load_all_tasks(tasks_dir=PUBLIC_TASKS_DIR) if task.id == task_id)
|
||||
harness = BenchmarkHarness(
|
||||
gateway_config=GatewayConfig(),
|
||||
model="test-model",
|
||||
randomize_order=False,
|
||||
tasks_dir=PUBLIC_TASKS_DIR,
|
||||
)
|
||||
workspace = tmp_path / task_id
|
||||
workspace.mkdir(parents=True, exist_ok=True)
|
||||
harness._setup_workspace(task, workspace)
|
||||
@ -57,50 +57,6 @@ async def test_python_completion_check_passes_after_fix(tmp_path: Path):
|
||||
|
||||
assert result.score == 1.0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_node_completion_check_passes_after_fix(tmp_path: Path):
|
||||
workspace, task = _prepare_workspace("t2-node-search-patch", tmp_path)
|
||||
# After hardening, render.js also exports emptyNote() with a legitimate
|
||||
# empty body. The scoped fix only patches normalizeNote's body and must
|
||||
# leave emptyNote alone.
|
||||
(workspace / "src" / "render.js").write_text(
|
||||
"function normalizeNote(note) {\n"
|
||||
" return {\n"
|
||||
" title: note.title.trim(),\n"
|
||||
" body: note.body.trim(),\n"
|
||||
" };\n"
|
||||
"}\n\n"
|
||||
"function emptyNote() {\n"
|
||||
" return {\n"
|
||||
" title: \"\",\n"
|
||||
" body: \"\",\n"
|
||||
" };\n"
|
||||
"}\n\n"
|
||||
"module.exports = { normalizeNote, emptyNote };\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
(workspace / "src" / "search.js").write_text(
|
||||
"function filterNotes(notes, query) {\n"
|
||||
" const needle = query.trim().toLowerCase();\n"
|
||||
" return notes.filter((note) => note.title.toLowerCase().includes(needle) || note.body.toLowerCase().includes(needle));\n"
|
||||
"}\n\n"
|
||||
"module.exports = { filterNotes };\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
runtime_values = build_runtime_values(workspace=workspace, repo_root=Path.cwd())
|
||||
result = await verify_completion(
|
||||
task.completion,
|
||||
workspace=workspace,
|
||||
client=DummyClient(), # type: ignore[arg-type]
|
||||
session_key="",
|
||||
runtime_values=runtime_values,
|
||||
)
|
||||
|
||||
assert result.score == 1.0
|
||||
|
||||
|
||||
def _playwright_available() -> bool:
|
||||
if not shutil.which("node"):
|
||||
return False
|
||||
@ -156,7 +112,10 @@ async def test_browser_completion_check_passes_after_fix(tmp_path: Path):
|
||||
|
||||
|
||||
def test_memory_task_trajectory_requires_memory_tool():
|
||||
task = next(task for task in load_all_tasks() if task.id == "t4-memory-recall-continuation")
|
||||
task = next(
|
||||
task for task in load_all_tasks(tasks_dir=PUBLIC_TASKS_DIR)
|
||||
if task.id == "t4-memory-recall-continuation"
|
||||
)
|
||||
transcript = Transcript(
|
||||
messages=[
|
||||
TranscriptMessage(role="assistant", tool_calls=[ToolCall(name="exec", input={"command": "cat docs/release_notes.md"}, success=True)]),
|
||||
@ -172,7 +131,10 @@ def test_memory_task_trajectory_requires_memory_tool():
|
||||
|
||||
|
||||
def test_delegation_task_trajectory_requires_delegate_family():
|
||||
task = next(task for task in load_all_tasks() if task.id == "t4-delegation-repair")
|
||||
task = next(
|
||||
task for task in load_all_tasks(tasks_dir=PUBLIC_TASKS_DIR)
|
||||
if task.id == "t4-delegation-repair"
|
||||
)
|
||||
transcript = Transcript(
|
||||
messages=[
|
||||
TranscriptMessage(role="assistant", tool_calls=[ToolCall(name="exec", input={"command": "rg billing ."}, success=True)]),
|
||||
|
||||
444
tests/test_openclaw_adapter.py
Normal file
444
tests/test_openclaw_adapter.py
Normal file
@ -0,0 +1,444 @@
|
||||
"""Tests for `OpenClawAdapter` — exercised against a stub gateway.
|
||||
|
||||
This validates the adapter wiring (lifecycle + state-query resolution)
|
||||
in isolation, before the harness is rewired through it. The stub
|
||||
`GatewayClient` records every call and produces canned responses so
|
||||
the adapter's branches are covered end-to-end without a real gateway.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
from clawbench.adapters import get_adapter
|
||||
from clawbench.adapters.base import AdapterContext, StateQueryResult
|
||||
from clawbench.adapters.openclaw import OpenClawAdapter, OpenClawAdapterConfig
|
||||
from clawbench.canonical import (
|
||||
AdapterCapability,
|
||||
CanonicalTask,
|
||||
StateQuery,
|
||||
)
|
||||
from clawbench.canonical.convert import from_task_definition
|
||||
from clawbench.schemas import (
|
||||
CompletionSpec,
|
||||
ExecutionCheck,
|
||||
FileState,
|
||||
GatewayAssertion,
|
||||
MemoryState,
|
||||
SessionState,
|
||||
SimulatedUser,
|
||||
TaskDefinition,
|
||||
TaskFamily,
|
||||
TaskSetup,
|
||||
Tier,
|
||||
Transcript,
|
||||
UserTurn,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Stub GatewayClient
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class _StubGateway:
    """Minimal GatewayClient stand-in for adapter tests.

    Every public call is logged in `.calls` as a `(name, payload)` tuple.
    `send_and_wait` hands back `.send_transcript`, and the verification
    RPCs answer from `.rpc_responses`, keyed by method name.
    """

    def __init__(self) -> None:
        """Start with an empty call log, no canned RPC responses and an empty transcript."""
        self.calls: list[tuple[str, dict[str, Any]]] = []
        self.rpc_responses: dict[str, dict[str, Any]] = {}
        self.send_transcript = Transcript()

    def _record(self, event: str, payload: dict[str, Any]) -> None:
        """Append one `(event, payload)` entry to the call log."""
        self.calls.append((event, payload))

    async def __aenter__(self) -> "_StubGateway":
        self._record("__aenter__", {})
        return self

    async def __aexit__(self, *exc: object) -> None:
        self._record("__aexit__", {})

    async def create_agent(self, *, name: str, workspace: str) -> str:
        """Log the request and return the fixed id `"agent-stub"`."""
        self._record("create_agent", {"name": name, "workspace": workspace})
        return "agent-stub"

    async def create_session(self, *, model: str, agent_id: str, label: str) -> str:
        """Log the request and return a session key derived from `label`."""
        payload = {"model": model, "agent_id": agent_id, "label": label}
        self._record("create_session", payload)
        return f"session-{label}"

    async def subscribe(self, session_key: str) -> None:
        self._record("subscribe", {"session_key": session_key})

    async def send_and_wait(
        self,
        session_key: str,
        message: str,
        *,
        timeout: float,
    ) -> Transcript:
        """Log the message and return the canned `send_transcript`."""
        payload = {"session_key": session_key, "message": message, "timeout": timeout}
        self._record("send_and_wait", payload)
        return self.send_transcript

    async def delete_session(self, session_key: str) -> None:
        self._record("delete_session", {"session_key": session_key})

    async def delete_agent(self, agent_id: str, *, delete_files: bool) -> None:
        payload = {"agent_id": agent_id, "delete_files": delete_files}
        self._record("delete_agent", payload)

    async def get_effective_tools(self, session_key: str) -> dict[str, Any]:
        """Return the `"tools.effective"` response, or a default bash + browser tool list."""
        self._record("get_effective_tools", {"session_key": session_key})
        default_tools = {"groups": [{"tools": [{"id": "bash"}, {"id": "browser"}]}]}
        return self.rpc_responses.get("tools.effective", default_tools)

    async def _rpc(self, method: str, params: dict[str, Any]) -> dict[str, Any]:
        """Log a copy of `params` and return the canned response for `method`.

        Raises:
            RuntimeError: if no response has been set for `method`.
        """
        self._record(f"_rpc:{method}", dict(params))
        if method not in self.rpc_responses:
            raise RuntimeError(f"stub gateway: no response set for {method}")
        return self.rpc_responses[method]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _coding_task() -> CanonicalTask:
    """Build a single-turn coding task with one file check and one execution check."""
    single_turn_user = SimulatedUser(
        max_turns=1,
        turns=[UserTurn(message="Do the task.")],
    )
    completion = CompletionSpec(
        files=[FileState(path="out.txt", exists=True)],
        execution_checks=[ExecutionCheck(name="ok", command="true")],
    )
    definition = TaskDefinition(
        id="oa-adapter-test",
        name="OA adapter test",
        tier=Tier.TIER1,
        family=TaskFamily.CODING,
        surface="coding",
        setup=TaskSetup(),
        user=single_turn_user,
        completion=completion,
    )
    return from_task_definition(definition)
|
||||
|
||||
|
||||
def _mixed_state_task() -> CanonicalTask:
    """Build a task with a gateway pre-check plus memory and session completion checks."""
    no_agents_yet = GatewayAssertion(
        method="agents.list",
        assert_path="$.count",
        assert_equals=0,
    )
    setup = TaskSetup(pre_check_gateway=[no_agents_yet])
    single_turn_user = SimulatedUser(max_turns=1, turns=[UserTurn(message="go")])
    completion = CompletionSpec(
        memory=[MemoryState(key_pattern="stack", exists=True, value_contains=["React"])],
        session=SessionState(should_exist=True, model_should_be="opus"),
    )
    definition = TaskDefinition(
        id="oa-adapter-state-test",
        name="OA state test",
        tier=Tier.TIER2,
        family=TaskFamily.MULTI_TOOL,
        surface="tools",
        setup=setup,
        user=single_turn_user,
        completion=completion,
    )
    return from_task_definition(definition)
|
||||
|
||||
|
||||
def _make_adapter_and_gateway() -> tuple[OpenClawAdapter, _StubGateway]:
    """Return an `OpenClawAdapter` whose client factory yields a fresh `_StubGateway`.

    The same stub instance is returned alongside the adapter so tests can
    inspect the calls it recorded.
    """
    stub = _StubGateway()
    adapter = OpenClawAdapter(OpenClawAdapterConfig(model="test-model"))

    def _hand_out_stub() -> _StubGateway:
        return stub

    adapter._client_factory = _hand_out_stub  # type: ignore[assignment]
    return adapter, stub
|
||||
|
||||
|
||||
def _make_ctx(task: CanonicalTask, workspace: Path) -> AdapterContext:
    """Create an ``AdapterContext`` for ``task`` using ``workspace``.

    The context starts with empty runtime values, run index 0, the
    ``test-model`` model name, and a fresh ``Transcript``.
    """
    context = AdapterContext(
        task=task,
        workspace=workspace,
        runtime_values={},
        run_index=0,
        model="test-model",
        transcript=Transcript(),
    )
    return context
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Registration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_openclaw_adapter_is_registered() -> None:
    """Looking up ``"openclaw"`` in the adapter registry yields ``OpenClawAdapter``."""
    assert get_adapter("openclaw") is OpenClawAdapter
|
||||
|
||||
|
||||
def test_openclaw_declares_full_capability_set() -> None:
    """``OpenClawAdapter.capabilities`` must contain each capability listed here."""
    required = (
        AdapterCapability.FILES,
        AdapterCapability.EXECUTION,
        AdapterCapability.MEMORY,
        AdapterCapability.SESSION,
        AdapterCapability.CRON,
        AdapterCapability.GATEWAY_RPC,
        AdapterCapability.BROWSER,
    )
    declared = OpenClawAdapter.capabilities
    for capability in required:
        assert capability in declared
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Lifecycle
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_setup_realizes_memory_seed_files(tmp_path: Path) -> None:
    """``setup()`` writes seeded memory into the workspace and creates an agent."""
    seed_entry = {
        "key": "event profile",
        "value": "Vegetarian food, quiet rooms, and no stairs.",
    }
    definition = TaskDefinition(
        id="oa-seeded-memory",
        name="OA seeded memory",
        tier=Tier.TIER2,
        family=TaskFamily.MULTI_TOOL,
        surface="tools",
        setup=TaskSetup(memory_seed=[seed_entry]),
        user=SimulatedUser(max_turns=1, turns=[UserTurn(message="go")]),
    )
    task = from_task_definition(definition)
    adapter, gateway = _make_adapter_and_gateway()

    async def _run_setup() -> None:
        async with adapter:
            context = _make_ctx(task, tmp_path)
            await adapter.setup(context)

    asyncio.run(_run_setup())

    # The seed key appears exactly once in the top-level MEMORY.md ...
    index_text = (tmp_path / "MEMORY.md").read_text(encoding="utf-8")
    assert index_text.count("event profile") == 1
    # ... and the seed value lands in a per-key file under memory/.
    entry_text = (tmp_path / "memory" / "event_profile.md").read_text(encoding="utf-8")
    assert "Vegetarian food" in entry_text
    recorded = [call[0] for call in gateway.calls]
    assert "create_agent" in recorded
|
||||
|
||||
|
||||
def test_run_phase_creates_session_subscribes_and_drives_simulator(tmp_path: Path) -> None:
    """A setup → run_phase → teardown cycle issues every expected gateway call."""
    task = _coding_task()
    adapter, gateway = _make_adapter_and_gateway()

    async def _lifecycle() -> None:
        async with adapter:
            context = _make_ctx(task, tmp_path)
            await adapter.setup(context)
            phase_result = await adapter.run_phase(task.phases[0], context)
            assert phase_result.error is None
            await adapter.teardown(context)

    asyncio.run(_lifecycle())

    recorded = [name for name, _ in gateway.calls]
    # Each of these calls must have been made. Only presence is checked;
    # the relative order of the calls is not asserted.
    expected_calls = (
        "create_agent",
        "create_session",
        "subscribe",
        "send_and_wait",
        "delete_session",
        "delete_agent",
    )
    for expected in expected_calls:
        assert expected in recorded
    # The first send_and_wait call should carry the rendered user turn text.
    send_args = next(args for name, args in gateway.calls if name == "send_and_wait")
    assert send_args["message"] == "Do the task."
|
||||
|
||||
|
||||
def test_run_phase_fails_fast_without_setup(tmp_path: Path) -> None:
    """``run_phase`` without a prior ``setup()`` reports an error mentioning ``agent_id``."""
    adapter = _make_adapter_and_gateway()[0]
    task = _coding_task()

    async def _run_without_setup() -> None:
        async with adapter:
            context = _make_ctx(task, tmp_path)
            # setup() is intentionally skipped; the phase result should
            # describe the failure instead of completing normally.
            phase_result = await adapter.run_phase(task.phases[0], context)
            assert phase_result.completed_normally is False
            assert phase_result.error
            assert "agent_id" in phase_result.error

    asyncio.run(_run_without_setup())
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# State queries
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_memory_query_uses_memory_search_primary_path(tmp_path: Path) -> None:
    """A memory query passes when the stubbed ``memory.search`` RPC returns a match."""
    task = _mixed_state_task()
    adapter, gateway = _make_adapter_and_gateway()
    search_payload = {"entries": [{"value": "stack = React, Node, Postgres"}]}
    gateway.rpc_responses["memory.search"] = {"payload": search_payload}

    memory_query = StateQuery(
        kind="memory",
        predicate="exists",
        selector={"key_pattern": "stack"},
        expected={"value_contains": ["React"]},
        required_capability=AdapterCapability.MEMORY,
    )

    async def _verify() -> StateQueryResult:
        async with adapter:
            context = _make_ctx(task, tmp_path)
            await adapter.setup(context)
            return await adapter.verify_state_query(memory_query, context)

    outcome = asyncio.run(_verify())
    assert outcome.ok is True
    assert outcome.detail == "OK"
|
||||
|
||||
|
||||
def test_memory_query_falls_back_to_workspace_on_rpc_failure(tmp_path: Path) -> None:
    """A memory query still passes from workspace files when the RPC is unavailable."""
    task = _mixed_state_task()
    # The stub gets no memory.search response, so the RPC path raises and
    # the adapter has to fall back to the workspace.
    adapter, _ = _make_adapter_and_gateway()
    # Seed MEMORY.md in the workspace so the fallback has something to find.
    memory_file = tmp_path / "MEMORY.md"
    memory_file.write_text("stack: React, Node, Postgres", encoding="utf-8")

    memory_query = StateQuery(
        kind="memory",
        predicate="exists",
        selector={"key_pattern": "stack"},
        expected={"value_contains": ["React"]},
        required_capability=AdapterCapability.MEMORY,
    )

    async def _verify() -> StateQueryResult:
        async with adapter:
            context = _make_ctx(task, tmp_path)
            await adapter.setup(context)
            return await adapter.verify_state_query(memory_query, context)

    outcome = asyncio.run(_verify())
    assert outcome.ok is True
|
||||
|
||||
|
||||
def test_session_query_uses_sessions_resolve(tmp_path: Path) -> None:
    """A session query passes when ``sessions.resolve`` reports a model containing ``opus``."""
    task = _mixed_state_task()
    adapter, gateway = _make_adapter_and_gateway()
    gateway.rpc_responses["sessions.resolve"] = {"payload": {"model": "claude-opus-4"}}

    session_query = StateQuery(
        kind="session",
        predicate="exists",
        selector={},
        expected={"model": "opus"},
        required_capability=AdapterCapability.SESSION,
    )

    async def _verify() -> StateQueryResult:
        async with adapter:
            context = _make_ctx(task, tmp_path)
            await adapter.setup(context)
            # No phase has run, so set the session key the query will look up.
            context.adapter_state["last_session_key"] = "some-session"
            return await adapter.verify_state_query(session_query, context)

    outcome = asyncio.run(_verify())
    assert outcome.ok is True
|
||||
|
||||
|
||||
def test_gateway_query_resolves_json_path(tmp_path: Path) -> None:
    """A custom gateway query compares the value at ``$.count`` with the expected 3."""
    task = _mixed_state_task()
    adapter, gateway = _make_adapter_and_gateway()
    gateway.rpc_responses["memory.list"] = {"payload": {"count": 3}}

    rpc_selector = {"method": "memory.list", "params": {}, "assert_path": "$.count"}
    gateway_query = StateQuery(
        kind="custom",
        predicate="equals",
        selector=rpc_selector,
        expected={"equals": 3, "exists": True},
        required_capability=AdapterCapability.GATEWAY_RPC,
    )

    async def _verify() -> StateQueryResult:
        async with adapter:
            context = _make_ctx(task, tmp_path)
            await adapter.setup(context)
            return await adapter.verify_state_query(gateway_query, context)

    outcome = asyncio.run(_verify())
    assert outcome.ok is True
|
||||
|
||||
|
||||
def test_cron_query_returns_false_when_no_jobs(tmp_path: Path) -> None:
    """A cron ``exists`` query is not OK when ``cron.list`` returns an empty job list."""
    task = _mixed_state_task()
    adapter, gateway = _make_adapter_and_gateway()
    gateway.rpc_responses["cron.list"] = {"payload": {"jobs": []}}

    cron_query = StateQuery(
        kind="cron",
        predicate="exists",
        selector={"description_contains": "daily"},
        expected={},
        required_capability=AdapterCapability.CRON,
    )

    async def _verify() -> StateQueryResult:
        async with adapter:
            context = _make_ctx(task, tmp_path)
            await adapter.setup(context)
            return await adapter.verify_state_query(cron_query, context)

    outcome = asyncio.run(_verify())
    assert outcome.ok is False
|
||||
|
||||
|
||||
def test_pre_run_queries_evaluated_during_setup(tmp_path: Path) -> None:
    """``setup()`` records a pre-run failure when the gateway pre-check does not hold."""
    task = _mixed_state_task()
    adapter, gateway = _make_adapter_and_gateway()
    # The task's pre-check expects $.count == 0; answer 99 to force a mismatch.
    gateway.rpc_responses["agents.list"] = {"payload": {"count": 99}}

    async def _collect_failures() -> list[str]:
        async with adapter:
            context = _make_ctx(task, tmp_path)
            await adapter.setup(context)
            return context.adapter_state.get("pre_run_failures", [])

    recorded_failures = asyncio.run(_collect_failures())
    assert recorded_failures, "pre-run gateway assertion should have failed"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Requires-context guard
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_client_accessor_errors_when_not_in_context() -> None:
    """Reading ``adapter.client`` without entering the adapter raises ``RuntimeError``."""
    adapter = _make_adapter_and_gateway()[0]
    with pytest.raises(RuntimeError):
        _ = adapter.client
|
||||
63
tests/test_public_surface.py
Normal file
63
tests/test_public_surface.py
Normal file
@ -0,0 +1,63 @@
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
|
||||
# Directory two levels above this file's resolved path; the tests below
# resolve README, manifest and script paths relative to it.
REPO_ROOT = Path(__file__).resolve().parent.parent
|
||||
# Matches task-id-like tokens: a word-boundary "t", one digit 1-6, a hyphen,
# then one or more lowercase letters, digits, or hyphens
# (e.g. "t1-bugfix-discount").
TASK_ID_RE = re.compile(r"\bt[1-6]-[a-z0-9-]+")
|
||||
|
||||
|
||||
def _public_task_ids() -> set[str]:
    """Return the ``id`` of every entry under ``tasks`` in the public manifest."""
    manifest_path = REPO_ROOT / "tasks-public" / "MANIFEST.yaml"
    manifest = yaml.safe_load(manifest_path.read_text(encoding="utf-8"))
    ids: set[str] = set()
    for entry in manifest["tasks"]:
        ids.add(entry["id"])
    return ids
|
||||
|
||||
|
||||
def _mentioned_task_ids(path: Path) -> set[str]:
    """Return the distinct ``TASK_ID_RE`` matches found in the file at ``path``.

    The file is read as UTF-8 and undecodable bytes are ignored.
    """
    text = path.read_text(encoding="utf-8", errors="ignore")
    return {match.group(0) for match in TASK_ID_RE.finditer(text)}
|
||||
|
||||
|
||||
def test_public_docs_only_reference_public_task_ids():
    """Public-facing docs must not mention task ids missing from the public manifest."""
    allowed = _public_task_ids()
    public_dir = REPO_ROOT / "tasks-public"
    doc_paths = (
        REPO_ROOT / "README.md",
        REPO_ROOT / "SPACE_README.md",
        public_dir / "README.md",
        public_dir / "MANIFEST.yaml",
    )

    # Map each offending doc (repo-relative path) to the unknown ids it mentions.
    leaked: dict[str, list[str]] = {}
    for doc in doc_paths:
        unknown = sorted(_mentioned_task_ids(doc).difference(allowed))
        if not unknown:
            continue
        leaked[str(doc.relative_to(REPO_ROOT))] = unknown

    assert leaked == {}
|
||||
|
||||
|
||||
def test_reusable_scripts_do_not_embed_private_task_ids():
    """Top-level ``.py``/``.sh`` files in ``scripts/`` must only mention public task ids."""
    allowed = _public_task_ids()
    script_paths = [
        candidate
        for candidate in sorted((REPO_ROOT / "scripts").glob("*"))
        if candidate.is_file() and candidate.suffix in {".py", ".sh"}
    ]

    # Map each offending script (repo-relative path) to the unknown ids it mentions.
    leaked: dict[str, list[str]] = {}
    for script in script_paths:
        unknown = sorted(_mentioned_task_ids(script).difference(allowed))
        if unknown:
            leaked[str(script.relative_to(REPO_ROOT))] = unknown

    assert leaked == {}
|
||||
|
||||
|
||||
def test_public_docs_match_manifest_task_count():
    """README and SPACE_README must advertise the manifest's task count.

    The manifest's ``task_count`` field has to equal the number of listed
    tasks and the pinned release size of 19. Every doc-string check below is
    derived from ``task_count`` so the literal count lives in exactly one
    place.
    """
    manifest_path = REPO_ROOT / "tasks-public" / "MANIFEST.yaml"
    manifest = yaml.safe_load(manifest_path.read_text(encoding="utf-8"))
    task_count = int(manifest["task_count"])
    # Single pin for the release size; update here when the public set changes.
    assert task_count == len(manifest["tasks"]) == 19

    readme = (REPO_ROOT / "README.md").read_text(encoding="utf-8")
    space_readme = (REPO_ROOT / "SPACE_README.md").read_text(encoding="utf-8")

    assert f"Core v1: {task_count} tasks" in readme
    assert f"tasks : {task_count}" in space_readme
    # NOTE(review): the +8 / +1 offsets look like guards against specific
    # stale figures that must not reappear in the docs — confirm their origin.
    assert f"Core v1: {task_count + 8} tasks" not in readme
    assert f"tasks : {task_count + 1}" not in space_readme
|
||||
@ -1,37 +1,47 @@
|
||||
from pathlib import Path
|
||||
|
||||
import clawbench.tasks as tasks_module
|
||||
from clawbench.client import GatewayConfig
|
||||
from clawbench.harness import BenchmarkHarness
|
||||
from clawbench.tasks import load_all_tasks
|
||||
|
||||
PUBLIC_TASKS_DIR = Path(__file__).resolve().parent.parent / "tasks-public"
|
||||
tasks_module.TASKS_DIR = PUBLIC_TASKS_DIR
|
||||
|
||||
|
||||
def test_load_all_tasks_returns_full_corpus():
|
||||
tasks = load_all_tasks()
|
||||
# Public Core release has 19 tasks; full private dev set has 40.
|
||||
# Either must cover tiers 1-5 and carry capability/subset/judge metadata.
|
||||
assert len(tasks) >= 19
|
||||
tasks = load_all_tasks(tasks_dir=PUBLIC_TASKS_DIR)
|
||||
|
||||
assert len(tasks) == 19
|
||||
assert {task.tier.value for task in tasks} == {"tier1", "tier2", "tier3", "tier4", "tier5"}
|
||||
assert any(task.capabilities for task in tasks)
|
||||
assert any(task.subsets for task in tasks)
|
||||
assert any(task.scenario is not None for task in tasks)
|
||||
assert any("ambiguous" in [variant.value for variant in task.prompt_variants] for task in tasks)
|
||||
assert sum(1 for task in tasks if task.judge is not None) >= 6
|
||||
assert sum(1 for task in tasks if task.judge is not None) >= 5
|
||||
assert all(task.pool.value == "public_dev" for task in tasks)
|
||||
assert all(task.setup.asset_packs for task in tasks)
|
||||
|
||||
|
||||
def test_public_tasks_match_core_v1_manifest_shape():
    """The public task set has the expected size, anchor ids, and tier/family mix."""
    tasks = load_all_tasks(tasks_dir=PUBLIC_TASKS_DIR)
    known_ids = {task.id for task in tasks}

    assert len(tasks) == 19
    for required_id in ("t1-bugfix-discount", "t5-hallucination-resistant-evidence"):
        assert required_id in known_ids
    tier4_tasks = [task for task in tasks if task.tier.value == "tier4"]
    assert len(tier4_tasks) == 5
    browser_tasks = [task for task in tasks if task.family.value == "browser"]
    assert len(browser_tasks) == 2
    # At least one task must carry the memory_continuation capability.
    assert any(
        capability.value == "memory_continuation"
        for task in tasks
        for capability in task.capabilities
    )
|
||||
|
||||
|
||||
def test_load_all_tasks_supports_pool_subset_and_capability_filters():
|
||||
hard_tasks = load_all_tasks(subsets=["hard"])
|
||||
consensus_tasks = load_all_tasks(subsets=["consensus"])
|
||||
bugfix_tasks = load_all_tasks(capabilities=["bugfix"])
|
||||
coding_scene_tasks = load_all_tasks(scenario="coding_dev_assist")
|
||||
ambiguous_tasks = load_all_tasks(prompt_variant="ambiguous")
|
||||
bugfix_tasks = load_all_tasks(tasks_dir=PUBLIC_TASKS_DIR, capabilities=["bugfix"])
|
||||
coding_scene_tasks = load_all_tasks(tasks_dir=PUBLIC_TASKS_DIR, scenario="coding_dev_assist")
|
||||
ambiguous_tasks = load_all_tasks(tasks_dir=PUBLIC_TASKS_DIR, prompt_variant="ambiguous")
|
||||
|
||||
assert hard_tasks
|
||||
assert consensus_tasks
|
||||
assert bugfix_tasks
|
||||
assert coding_scene_tasks
|
||||
assert ambiguous_tasks
|
||||
assert all("hard" in [subset.value for subset in task.subsets] for task in hard_tasks)
|
||||
assert all("consensus" in [subset.value for subset in task.subsets] for task in consensus_tasks)
|
||||
assert all("bugfix" in [capability.value for capability in task.capabilities] for task in bugfix_tasks)
|
||||
assert all(task.scenario and task.scenario.value == "coding_dev_assist" for task in coding_scene_tasks)
|
||||
assert all("ambiguous" in [variant.value for variant in task.prompt_variants] for task in ambiguous_tasks)
|
||||
@ -42,8 +52,16 @@ def test_workspace_setup_preserves_nested_asset_paths(tmp_path: Path):
|
||||
# passes whether the dev has private tasks/ or only the public release.
|
||||
# t4-browser-research-and-code has both flat files (report_client.py,
|
||||
# serve_docs.py) and nested dirs (docs/, tests/).
|
||||
task = next(task for task in load_all_tasks() if task.id == "t4-browser-research-and-code")
|
||||
harness = BenchmarkHarness(gateway_config=GatewayConfig(), model="test-model", randomize_order=False)
|
||||
task = next(
|
||||
task for task in load_all_tasks(tasks_dir=PUBLIC_TASKS_DIR)
|
||||
if task.id == "t4-browser-research-and-code"
|
||||
)
|
||||
harness = BenchmarkHarness(
|
||||
gateway_config=GatewayConfig(),
|
||||
model="test-model",
|
||||
randomize_order=False,
|
||||
tasks_dir=PUBLIC_TASKS_DIR,
|
||||
)
|
||||
workspace = tmp_path / "workspace"
|
||||
workspace.mkdir()
|
||||
|
||||
@ -57,7 +75,7 @@ def test_workspace_setup_preserves_nested_asset_paths(tmp_path: Path):
|
||||
def test_selected_tasks_include_judge_rubrics():
|
||||
# All assertions use task IDs from the Core v1 public set so CI
|
||||
# (without the private tasks/) reproduces locally.
|
||||
tasks = {task.id: task for task in load_all_tasks()}
|
||||
tasks = {task.id: task for task in load_all_tasks(tasks_dir=PUBLIC_TASKS_DIR)}
|
||||
|
||||
assert tasks["t1-bugfix-discount"].judge is not None
|
||||
assert tasks["t3-feature-export"].judge is not None
|
||||
|
||||
@ -1,38 +1,12 @@
|
||||
import pytest
|
||||
|
||||
from clawbench.schemas import BenchmarkResult
|
||||
from clawbench.upload import _json_column, _submission_shard_name, upload_result
|
||||
from clawbench.upload import upload_result
|
||||
|
||||
|
||||
def test_submission_shard_name_sanitizes_ids():
|
||||
assert _submission_shard_name("abc/def:ghi") == "abc-def-ghi.parquet"
|
||||
assert _submission_shard_name("...") == "submission.parquet"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_upload_result_writes_append_only_submission_shard(monkeypatch):
|
||||
uploads = []
|
||||
ensured = []
|
||||
uploaded_rows = []
|
||||
|
||||
class FakeApi:
|
||||
def __init__(self, token: str) -> None:
|
||||
self.token = token
|
||||
|
||||
def upload_file(self, *, path_or_fileobj: str, path_in_repo: str, repo_id: str, repo_type: str) -> None:
|
||||
import pandas as pd
|
||||
|
||||
uploads.append((path_or_fileobj, path_in_repo, repo_id, repo_type))
|
||||
uploaded_rows.extend(pd.read_parquet(path_or_fileobj).to_dict(orient="records"))
|
||||
|
||||
monkeypatch.setattr("huggingface_hub.HfApi", FakeApi)
|
||||
monkeypatch.setattr(
|
||||
"clawbench.upload.ensure_dataset_repo",
|
||||
lambda api, repo_id: ensured.append((api.token, repo_id)),
|
||||
)
|
||||
|
||||
result = BenchmarkResult(
|
||||
submission_id="run/123",
|
||||
def _result(submission_id: str = "run/123") -> BenchmarkResult:
|
||||
return BenchmarkResult(
|
||||
submission_id=submission_id,
|
||||
model="anthropic/claude-sonnet-4-6",
|
||||
provider="anthropic",
|
||||
timestamp="2026-04-28T00:00:00+00:00",
|
||||
@ -45,19 +19,58 @@ async def test_upload_result_writes_append_only_submission_shard(monkeypatch):
|
||||
overall_pass_hat_k=1.0,
|
||||
)
|
||||
|
||||
url = await upload_result(result, dataset_repo="openclaw/clawbench-results", token="hf_test")
|
||||
|
||||
@pytest.mark.asyncio
async def test_upload_result_requires_token(monkeypatch):
    """With ``HF_TOKEN`` unset and no token argument, ``upload_result`` raises."""
    monkeypatch.delenv("HF_TOKEN", raising=False)
    result = _result()

    with pytest.raises(RuntimeError, match="HF_TOKEN not set"):
        await upload_result(result, dataset_repo="openclaw/clawbench-results")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_upload_result_appends_and_deduplicates_submissions(monkeypatch):
|
||||
ensured = []
|
||||
pushed = []
|
||||
|
||||
class FakeApi:
|
||||
def __init__(self, token: str) -> None:
|
||||
self.token = token
|
||||
|
||||
class FakeDataset:
|
||||
def __init__(self, rows):
|
||||
self.rows = rows
|
||||
|
||||
@classmethod
|
||||
def from_list(cls, rows):
|
||||
return cls(rows)
|
||||
|
||||
def push_to_hub(self, repo_id: str, *, split: str, token: str) -> None:
|
||||
pushed.append((repo_id, split, token, self.rows))
|
||||
|
||||
monkeypatch.setattr("huggingface_hub.HfApi", FakeApi)
|
||||
monkeypatch.setattr("datasets.Dataset", FakeDataset)
|
||||
monkeypatch.setattr(
|
||||
"datasets.load_dataset",
|
||||
lambda *args, **kwargs: [
|
||||
{"submission_id": "old-run", "model": "old-model"},
|
||||
{"submission_id": "run/123", "model": "stale-model"},
|
||||
],
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"clawbench.upload.ensure_dataset_repo",
|
||||
lambda api, repo_id: ensured.append((api.token, repo_id)),
|
||||
)
|
||||
|
||||
url = await upload_result(_result(), dataset_repo="openclaw/clawbench-results", token="hf_test")
|
||||
|
||||
assert url == "https://huggingface.co/datasets/openclaw/clawbench-results"
|
||||
assert ensured == [("hf_test", "openclaw/clawbench-results")]
|
||||
assert len(uploads) == 1
|
||||
local_path, path_in_repo, repo_id, repo_type = uploads[0]
|
||||
assert local_path.endswith("run-123.parquet")
|
||||
assert path_in_repo == "data/submissions/run-123.parquet"
|
||||
assert len(pushed) == 1
|
||||
|
||||
repo_id, split, token, rows = pushed[0]
|
||||
assert repo_id == "openclaw/clawbench-results"
|
||||
assert repo_type == "dataset"
|
||||
assert uploaded_rows[0]["overall_delivery_outcome_counts"] == "{}"
|
||||
assert uploaded_rows[0]["task_results"] == "[]"
|
||||
|
||||
|
||||
def test_json_column_is_stable_and_compact():
|
||||
assert _json_column({"b": 2, "a": 1}) == '{"a":1,"b":2}'
|
||||
assert split == "submissions"
|
||||
assert token == "hf_test"
|
||||
assert [row["submission_id"] for row in rows] == ["old-run", "run/123"]
|
||||
assert rows[-1]["model"] == "anthropic/claude-sonnet-4-6"
|
||||
|
||||
@ -45,7 +45,12 @@ def test_configure_browser_runtime_sets_benchmark_safe_openclaw_config(monkeypat
|
||||
assert json.loads(config_path.read_text(encoding="utf-8")) == {
|
||||
"agents": {"defaults": {"skipBootstrap": True}},
|
||||
"browser": {"headless": True, "noSandbox": True},
|
||||
"tools": {"exec": {"host": "gateway", "security": "full", "ask": "off"}},
|
||||
"approvals": {"exec": {"enabled": False}},
|
||||
}
|
||||
approvals = json.loads((state_dir / "exec-approvals.json").read_text(encoding="utf-8"))
|
||||
assert approvals["defaults"] == {"security": "full", "ask": "off", "askFallback": "full"}
|
||||
assert approvals["agents"]["*"] == {"security": "full", "ask": "off", "askFallback": "full"}
|
||||
|
||||
|
||||
def test_configure_browser_runtime_pins_subagents_to_active_model(monkeypatch):
|
||||
@ -72,6 +77,8 @@ def test_configure_browser_runtime_pins_subagents_to_active_model(monkeypatch):
|
||||
}
|
||||
},
|
||||
"browser": {"headless": True, "noSandbox": True},
|
||||
"tools": {"exec": {"host": "gateway", "security": "full", "ask": "off"}},
|
||||
"approvals": {"exec": {"enabled": False}},
|
||||
}
|
||||
|
||||
|
||||
@ -169,6 +176,11 @@ def test_materialize_lane_runtime_spaces_ports_and_copies_auth(tmp_path: Path, m
|
||||
assert lane1.port == GATEWAY_PORT + GATEWAY_PORT_SPACING
|
||||
assert lane1.state_dir is not None
|
||||
assert (lane1.state_dir / "agents" / "main" / "agent" / "auth-profiles.json").exists()
|
||||
lane_cfg = json.loads((lane1.state_dir / "openclaw.json").read_text(encoding="utf-8"))
|
||||
assert lane_cfg["tools"]["exec"] == {"host": "gateway", "security": "full", "ask": "off"}
|
||||
assert lane_cfg["approvals"]["exec"] == {"enabled": False}
|
||||
lane_approvals = json.loads((lane1.state_dir / "exec-approvals.json").read_text(encoding="utf-8"))
|
||||
assert lane_approvals["defaults"] == {"security": "full", "ask": "off", "askFallback": "full"}
|
||||
|
||||
|
||||
def test_job_progress_tracker_drops_finished_parallel_lane():
|
||||
|
||||
Loading…
Reference in New Issue
Block a user