clawbench: per-sweep cache archiving + generic sweep templates

- scripts/_archive_cache.sh: snapshot run_cache/<model>/ to
  run_cache_archive/<sweep_tag>/ at sweep exit with metadata.json.
  Sourced by sweep scripts so transcripts survive the next sweep's
  cache wipe and stay available for audits.
- scripts/container_sweep_single.sh: base multi-model sweep.
  Adds CACHE_SUB entries for claude-opus-4-7 / claude-sonnet-4-7 so
  their caches are force-cleared at sweep start. Calls archive helper
  on exit.
- scripts/container_sweep_minimal.sh: 1-run-per-task variant for fast
  fix validation (~20 min) instead of full 3-run sweep (~60 min).
- Dockerfile.main: parametrized clawbench-on-openclaw image with
  ARG BASE for pinning to any openclaw tag.
- scripts/git_checkpoint.py + README: documented checkpoint workflow
  for tagging known-good states during risky work.
- .gitignore: un-ignore scripts/, keep targeted ignores for
  __pycache__, .tmp, .local.py.

Co-Authored-By: Claude Opus 4 (1M context) <noreply@anthropic.com>
This commit is contained in:
scoootscooob 2026-04-18 12:46:45 -07:00
parent fe8fef7795
commit 8a5be9c686
7 changed files with 479 additions and 1 deletions

4
.gitignore vendored
View File

@ -12,4 +12,6 @@ data/
.DS_Store
.clawbench/
reports/
scripts/
scripts/__pycache__/
scripts/*.tmp
scripts/*.local.py

51
Dockerfile.main Normal file
View File

@ -0,0 +1,51 @@
# ClawBench HF Docker Space - main variant
# Layer the benchmark harness on top of the locally-built OpenClaw image
# from upstream/main. Includes native claude-opus-4-7 support (landed in
# commit 628b454eff "feat: default Anthropic to Opus 4.7"), so no patch
# step is required here.

# BASE selects the OpenClaw image to build on. It is declared before FROM so
# it can be overridden at build time: --build-arg BASE=<image:tag>.
ARG BASE=openclaw-main-opus47:latest
FROM ${BASE}

# Run the system-level install steps below as root; the image switches back
# to the unprivileged `node` user before CMD.
USER root
ENV DEBIAN_FRONTEND=noninteractive

# pip installs the harness further down; python-is-python3 provides the plain
# `python` command that CMD invokes. The apt lists are removed in the same
# layer to keep the image small.
RUN apt-get update && \
apt-get install -y python3-pip python-is-python3 && \
rm -rf /var/lib/apt/lists/*

# Make /app reachable as /openclaw as well.
# NOTE(review): presumably /app is where the base image keeps OpenClaw; confirm against the base image.
RUN ln -s /app /openclaw

# Install Playwright's Chromium (pinned to 1.59.1) under /ms-playwright, then
# symlink the first `chrome` file found there (sorted by path) to
# /usr/bin/chromium. `test -x` fails the build if no executable was found.
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
RUN npx -y playwright@1.59.1 install --with-deps chromium && \
CHROME_PATH="$(find /ms-playwright -path '*/chrome' -type f | sort | head -n 1)" && \
test -x "$CHROME_PATH" && \
ln -sf "$CHROME_PATH" /usr/bin/chromium

# Copy the project metadata, harness package, tasks, baselines and app entry
# point (owned by node), then pip-install the project. This step still runs as
# root; --break-system-packages lets pip install into the distro-managed Python.
ENV HOME=/home/node PATH=/home/node/.local/bin:$PATH
WORKDIR /home/node/app
COPY --chown=node:node pyproject.toml README.md ./
COPY --chown=node:node clawbench/ clawbench/
COPY --chown=node:node tasks/ tasks/
COPY --chown=node:node baselines/ baselines/
COPY --chown=node:node app.py .
RUN python3 -m pip install --break-system-packages --no-cache-dir .

# Pre-create the result/queue directories and the OpenClaw state directories,
# hand them to node, and make them world-writable.
# NOTE(review): mode 777 is presumably there so the container also works under
# an arbitrary runtime UID — confirm this is still required.
RUN mkdir -p \
/data/results \
/data/queue \
/home/node/.openclaw/agents/dev \
/home/node/.openclaw/agents/main/agent && \
chown -R node:node /data /home/node/.openclaw && \
chmod -R 777 /data /home/node/.openclaw

# Drop privileges for runtime.
USER node

# Runtime configuration. The consumers of these variables are not visible in
# this file; 18789 matches the gateway port used by the sweep scripts.
ENV GATEWAY_PORT=18789
ENV OPENCLAW_HOME=/home/node
ENV OPENCLAW_STATE_DIR=/home/node/.openclaw

# NOTE(review): 7860 is presumably the port app.py listens on — confirm in app.py.
EXPOSE 7860
CMD ["python", "app.py"]

View File

@ -293,6 +293,28 @@ but writes to wrong paths or misses format constraints. This gap is where
profile-level improvements (workspace-aware prompts, path-checking pre-flight
calls, retry wrappers) have the most leverage.
### Version control checkpoints
Git is already the source of truth for this repo, but the safest workflow is:
```bash
# Start risky work on its own branch
git switch -c codex/<short-topic>
# Commit small checkpoints as you go
git add -A
git commit -m "Checkpoint: describe the working state"
# Mark a known-good version with an annotated tag
python3 scripts/git_checkpoint.py "before-profile-tuning"
# Push the branch and tags so recovery is not only local
git push -u origin HEAD
git push origin --tags
```
The checkpoint script refuses to tag a dirty worktree by default, so every saved version points at a reproducible commit instead of a half-finished local state.
### Docker (recommended for reproducibility)
```bash

58
scripts/_archive_cache.sh Executable file
View File

@ -0,0 +1,58 @@
#!/bin/bash
# Shared helper sourced by container_sweep_*.sh scripts to snapshot the
# per-model run_cache after a sweep completes. Called at END of each sweep.
#
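# Reads these env vars (normally set by the parent sweep script). None is
# strictly required: an empty/unset CACHE_SUB skips the archive, the others
# fall back to the defaults shown or to an empty string in metadata.json.
# CLAWBENCH_RUN_CACHE_DIR - e.g. /data/run_cache (default: /data/run_cache)
# CACHE_SUB - e.g. openai_gpt-5.4 (archive is skipped when empty)
# SWEEP_OUT_TAG - e.g. v2026-4-18-pr68627-gpt54 (default: untagged)
# SWEEP_LABEL - e.g. gpt54
# SWEEP_MODEL - e.g. openai/gpt-5.4
# SWEEP_LOGDIR - e.g. /data/drift_2026-04-18-pr68627-gpt54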
#
# Writes snapshot to: /data/run_cache_archive/<SWEEP_OUT_TAG>/<CACHE_SUB>/
# Also writes a metadata.json with sweep label/model/timestamp for indexing.
# archive_run_cache: copy the current model's run cache to
# /data/run_cache_archive/<SWEEP_OUT_TAG>/<CACHE_SUB>/ and record the snapshot
# under "models" in /data/run_cache_archive/<SWEEP_OUT_TAG>/metadata.json.
#
# Reads: CACHE_SUB, CLAWBENCH_RUN_CACHE_DIR, SWEEP_OUT_TAG, SWEEP_LABEL,
#        SWEEP_MODEL, SWEEP_LOGDIR (all optional; see defaults below).
# Returns: 0 when the snapshot was written or the archive was skipped,
#          1 when the copy or the metadata update failed.
archive_run_cache() {
  if [ -z "${CACHE_SUB:-}" ]; then
    echo "[archive] skipped: no CACHE_SUB configured"
    return 0
  fi

  local src="${CLAWBENCH_RUN_CACHE_DIR:-/data/run_cache}/$CACHE_SUB"
  if [ ! -d "$src" ]; then
    echo "[archive] skipped: cache dir $src missing"
    return 0
  fi

  local dest_root="/data/run_cache_archive/${SWEEP_OUT_TAG:-untagged}"
  local dest="$dest_root/$CACHE_SUB"
  if ! mkdir -p "$dest_root"; then
    echo "[archive] ERROR: cannot create $dest_root" >&2
    return 1
  fi
  rm -rf "$dest"  # idempotent — re-running replaces prior snapshot for this tag
  if ! cp -r "$src" "$dest"; then
    # Do not record a snapshot in metadata.json that was never (fully) written.
    echo "[archive] ERROR: failed to copy $src to $dest" >&2
    return 1
  fi

  # Write a small metadata.json alongside for quick lookup.
  # Values are handed to Python through the environment and the heredoc
  # delimiter is quoted, so quotes or backslashes in paths/labels can never be
  # interpreted as Python source.
  local meta="$dest_root/metadata.json"
  local rc=0
  if ! ARCHIVE_META_PATH="$meta" \
    ARCHIVE_SRC_DIR="$src" \
    ARCHIVE_CACHE_SUB="$CACHE_SUB" \
    ARCHIVE_SWEEP_LABEL="${SWEEP_LABEL:-}" \
    ARCHIVE_SWEEP_MODEL="${SWEEP_MODEL:-}" \
    ARCHIVE_SWEEP_OUT_TAG="${SWEEP_OUT_TAG:-}" \
    ARCHIVE_SWEEP_LOGDIR="${SWEEP_LOGDIR:-}" \
    python3 - <<'PYEOF'
import datetime
import json
import os
import sys

env = os.environ
meta_path = env["ARCHIVE_META_PATH"]
src = env["ARCHIVE_SRC_DIR"]

# Merge with existing (a single tag may cover multiple models on the same sweep)
existing = {}
if os.path.exists(meta_path):
    try:
        with open(meta_path, encoding="utf-8") as f:
            existing = json.load(f)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError is a ValueError; start over but say so.
        print(f"[archive] warning: ignoring unreadable {meta_path}: {exc}", file=sys.stderr)
        existing = {}
if not isinstance(existing, dict):
    existing = {}
models = existing.get("models")
if not isinstance(models, dict):
    models = existing["models"] = {}

# Count entries named run* one level below the cache dir; plain files at the
# top level are skipped instead of crashing os.listdir().
run_count = 0
if os.path.isdir(src):
    for entry in os.listdir(src):
        entry_dir = os.path.join(src, entry)
        if os.path.isdir(entry_dir):
            run_count += sum(1 for r in os.listdir(entry_dir) if r.startswith("run"))

models[env["ARCHIVE_CACHE_SUB"]] = {
    "sweep_label": env.get("ARCHIVE_SWEEP_LABEL", ""),
    "sweep_model": env.get("ARCHIVE_SWEEP_MODEL", ""),
    "sweep_out_tag": env.get("ARCHIVE_SWEEP_OUT_TAG", ""),
    "sweep_logdir": env.get("ARCHIVE_SWEEP_LOGDIR", ""),
    "archived_at": datetime.datetime.now(datetime.timezone.utc).isoformat(),
    "run_count": run_count,
}

# Write to a temp file and rename so readers never see a half-written file.
tmp_path = f"{meta_path}.{os.getpid()}.tmp"
with open(tmp_path, "w", encoding="utf-8") as f:
    json.dump(existing, f, indent=2)
os.replace(tmp_path, meta_path)
PYEOF
  then
    echo "[archive] WARNING: snapshot copied but $meta could not be updated" >&2
    rc=1
  fi

  local runs
  runs=$(find "$dest" -name "run*.json" 2>/dev/null | wc -l | tr -d ' ')
  echo "[archive] saved $runs transcripts to $dest"
  return "$rc"
}

View File

@ -0,0 +1,98 @@
#!/bin/bash
# Minimal single-model sweep — 1 run per task (not 3) for fast validation.
# Used to quickly test if an openrouter-stream fix actually works without
# committing to a full 60-minute 3-run sweep.
#
# Invocation (from host):
#   docker run -d --name clawbench-<LABEL> \
#     -e SWEEP_LABEL=<label> -e SWEEP_MODEL=<routed-model> \
#     -e SWEEP_PROFILE=<abs-profile-path> \
#     -e SWEEP_LOGDIR=<output-dir-in-container> \
#     -e SWEEP_OUT_TAG=<tag> \
#     -v .../scripts:/home/node/app/scripts:ro \
#     -v .../data:/data \
#     -v .../data/container-home-openclaw:/home/node/.openclaw \
#     -v .../profiles:/home/node/app/profiles:ro \
#     --memory 8g \
#     <image> \
#     bash /home/node/app/scripts/container_sweep_minimal.sh
#
# Exit status: that of `clawbench run`, or 1 when /data is unavailable or the
# gateway does not answer its health check within 120 one-second attempts.
set -u

: "${SWEEP_LABEL:?SWEEP_LABEL required}"
: "${SWEEP_MODEL:?SWEEP_MODEL required}"
: "${SWEEP_PROFILE:?SWEEP_PROFILE required}"
: "${SWEEP_LOGDIR:?SWEEP_LOGDIR required}"
: "${SWEEP_OUT_TAG:?SWEEP_OUT_TAG required}"

cd /data || { echo "ERROR: cannot cd to /data"; exit 1; }
mkdir -p "$SWEEP_LOGDIR"

export OPENCLAW_GATEWAY_TOKEN="local-dev-token-for-testing"
export CLAWBENCH_RUN_CACHE_DIR="/data/run_cache"
mkdir -p "$CLAWBENCH_RUN_CACHE_DIR"
export NODE_OPTIONS="--max-old-space-size=4096"

# Map model -> run_cache subdir. Kept in sync with container_sweep_single.sh so
# the same models get their cache cleared (and archived) in both sweeps.
case "$SWEEP_MODEL" in
  anthropic/claude-opus-4-7)       CACHE_SUB="anthropic_claude-opus-4-7" ;;
  anthropic/claude-sonnet-4-7)     CACHE_SUB="anthropic_claude-sonnet-4-7" ;;
  anthropic/claude-opus-4-6)       CACHE_SUB="anthropic_claude-opus-4-6" ;;
  anthropic/claude-sonnet-4-6)     CACHE_SUB="anthropic_claude-sonnet-4-6" ;;
  openai/gpt-5.4)                  CACHE_SUB="openai_gpt-5.4" ;;
  openai/gpt-5.2)                  CACHE_SUB="openai_gpt-5.2" ;;
  openrouter/z-ai/glm-5.1)         CACHE_SUB="openrouter_z-ai_glm-5.1" ;;
  openrouter/minimax/minimax-m2.7) CACHE_SUB="openrouter_minimax_minimax-m2.7" ;;
  openrouter/moonshotai/kimi-k2.5) CACHE_SUB="openrouter_moonshotai_kimi-k2.5" ;;
  *)                               CACHE_SUB="" ;;
esac

# Clear cache for target model so results come from fresh runs, not replays.
if [ -z "$CACHE_SUB" ]; then
  # Make the gap visible instead of silently validating against a stale cache.
  echo "WARNING: no run_cache mapping for $SWEEP_MODEL; cache not cleared and will not be archived"
elif [ -d "$CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB" ]; then
  echo "clearing cache: $CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB"
  rm -rf "$CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB"
fi

OUT="$SWEEP_LOGDIR/docker_${SWEEP_LABEL}_${SWEEP_OUT_TAG}.json"
LOG="$SWEEP_LOGDIR/docker_${SWEEP_LABEL}_${SWEEP_OUT_TAG}.log"
GWLOG="$SWEEP_LOGDIR/gateway_${SWEEP_LABEL}.log"
rm -f "$OUT"

echo "===== MINIMAL SWEEP START $(date '+%Y-%m-%d %H:%M:%S') ====="
echo "label: $SWEEP_LABEL"
echo "model: $SWEEP_MODEL"
echo "profile: $SWEEP_PROFILE"
echo "out: $OUT"
echo "runs: 1 per task (MINIMAL)"

# Stop the background gateway and reap it; used on every exit path after start.
stop_gateway() {
  kill "$GATEWAY_PID" 2>/dev/null
  wait "$GATEWAY_PID" 2>/dev/null
}

echo "Starting gateway on :18789 (heap=4GB) ..."
openclaw gateway --port 18789 > "$GWLOG" 2>&1 &
GATEWAY_PID=$!

ready=0
for i in $(seq 1 120); do
  if curl -sf -H "Authorization: Bearer $OPENCLAW_GATEWAY_TOKEN" http://127.0.0.1:18789/health > /dev/null 2>&1; then
    echo "Gateway healthy after ${i}s"
    ready=1
    break
  fi
  sleep 1
done
if [ "$ready" -ne 1 ]; then
  echo "ERROR: gateway failed to come up"
  tail -30 "$GWLOG"
  stop_gateway
  exit 1
fi

echo "===== $(date '+%H:%M:%S') starting $SWEEP_LABEL ($SWEEP_MODEL) ====="
clawbench run \
  --model "$SWEEP_MODEL" \
  --runs 1 \
  --concurrency 4 \
  --profile "$SWEEP_PROFILE" \
  --judge-model "anthropic/claude-sonnet-4-6" \
  -o "$OUT" \
  > "$LOG" 2>&1
status=$?
echo "===== $(date '+%H:%M:%S') done $SWEEP_LABEL (exit $status) ====="

# Archive the cache for future audits. Archiving is best-effort: a failure is
# reported but never changes the sweep's exit status. The three outcomes are
# kept apart so a failed archive is not misreported as a missing helper.
ARCHIVE_HELPER="$(dirname "$0")/_archive_cache.sh"
if [ ! -f "$ARCHIVE_HELPER" ]; then
  echo "[archive] helper missing, skipping"
# shellcheck disable=SC1090,SC1091
elif ! source "$ARCHIVE_HELPER"; then
  echo "[archive] WARNING: could not source $ARCHIVE_HELPER, skipping"
else
  archive_run_cache || echo "[archive] WARNING: archive_run_cache failed (exit $?)"
fi

stop_gateway
exit "$status"

133
scripts/container_sweep_single.sh Executable file
View File

@ -0,0 +1,133 @@
#!/bin/bash
# Single-model sweep with fresh gateway + bumped Node heap to prevent OOM.
#
# Invocation (from host):
#   docker run -d --name clawbench-sweep-<LABEL> \
#     -e SWEEP_LABEL=<label> -e SWEEP_MODEL=<routed-model> -e SWEEP_PROFILE=<abs-profile-path> \
#     -v .../scripts:/home/node/app/scripts:ro \
#     -v .../data:/data \
#     -v .../data/container-home-openclaw:/home/node/.openclaw \
#     -v .../profiles:/home/node/app/profiles:ro \
#     --memory 8g \
#     clawbench-clawbench:latest \
#     bash /home/node/app/scripts/container_sweep_single.sh
#
# Differences vs container_sweep.sh:
# - Bumps gateway Node.js heap via NODE_OPTIONS=--max-old-space-size=4096 (prevents 2GB OOM we saw at ~4h)
# - One model per container (no shared-gateway drift between models)
# - Force-clears run_cache for THIS model before running (prevents cache-replay masking)
# - Writes to the same $LOGDIR/docker_${label}_${SWEEP_OUT_TAG}.json as the original sweep
#   so generate_drift_report.py picks it up without changes
#
# Exit status: that of `clawbench run`, or 1 when /data is unavailable or the
# gateway does not answer its health check within 120 one-second attempts.
set -u

: "${SWEEP_LABEL:?SWEEP_LABEL required (e.g. glm, minimax, kimi)}"
: "${SWEEP_MODEL:?SWEEP_MODEL required (e.g. openrouter/z-ai/glm-5.1)}"
: "${SWEEP_PROFILE:?SWEEP_PROFILE required (absolute path in container)}"

# Optional overrides (defaults target the v4.14 drift sweep):
# SWEEP_LOGDIR — where JSONs and logs go (default /data/drift_2026-04-14)
# SWEEP_OUT_TAG — tag embedded in output filename (default v2026-4-14)
: "${SWEEP_LOGDIR:=/data/drift_2026-04-14}"
: "${SWEEP_OUT_TAG:=v2026-4-14}"

cd /data || { echo "ERROR: cannot cd to /data"; exit 1; }
LOGDIR="$SWEEP_LOGDIR"
mkdir -p "$LOGDIR"

export OPENCLAW_GATEWAY_TOKEN="local-dev-token-for-testing"
export CLAWBENCH_RUN_CACHE_DIR="/data/run_cache"
mkdir -p "$CLAWBENCH_RUN_CACHE_DIR"

# OOM fix: give the gateway Node process a 4GB old-space ceiling instead of the default ~2GB.
# Scoped via env so we don't stomp on other Node processes (clawbench itself is python).
export NODE_OPTIONS="--max-old-space-size=4096"

# Map model -> cache subdir (matches what clawbench writes)
case "$SWEEP_MODEL" in
  anthropic/claude-opus-4-7)       CACHE_SUB="anthropic_claude-opus-4-7" ;;
  anthropic/claude-sonnet-4-7)     CACHE_SUB="anthropic_claude-sonnet-4-7" ;;
  anthropic/claude-opus-4-6)       CACHE_SUB="anthropic_claude-opus-4-6" ;;
  anthropic/claude-sonnet-4-6)     CACHE_SUB="anthropic_claude-sonnet-4-6" ;;
  openai/gpt-5.4)                  CACHE_SUB="openai_gpt-5.4" ;;
  openai/gpt-5.2)                  CACHE_SUB="openai_gpt-5.2" ;;
  openrouter/z-ai/glm-5.1)         CACHE_SUB="openrouter_z-ai_glm-5.1" ;;
  openrouter/minimax/minimax-m2.7) CACHE_SUB="openrouter_minimax_minimax-m2.7" ;;
  openrouter/moonshotai/kimi-k2.5) CACHE_SUB="openrouter_moonshotai_kimi-k2.5" ;;
  *)                               CACHE_SUB="" ;;
esac

OUT="$LOGDIR/docker_${SWEEP_LABEL}_${SWEEP_OUT_TAG}.json"
LOG="$LOGDIR/docker_${SWEEP_LABEL}_${SWEEP_OUT_TAG}.log"
GWLOG="$LOGDIR/gateway_${SWEEP_LABEL}.log"

echo "===== SINGLE-MODEL SWEEP START $(date '+%Y-%m-%d %H:%M:%S') ====="
echo "label: $SWEEP_LABEL"
echo "model: $SWEEP_MODEL"
echo "profile: $SWEEP_PROFILE"
echo "out: $OUT"
echo "gwlog: $GWLOG"
echo "NODE_OPTIONS: $NODE_OPTIONS"

# Force-clear this model's run_cache so we actually re-run (no replays)
if [ -z "$CACHE_SUB" ]; then
  # Make the gap visible instead of silently sweeping against a stale cache.
  echo "WARNING: no run_cache mapping for $SWEEP_MODEL; cache not cleared and will not be archived"
elif [ -d "$CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB" ]; then
  echo "clearing cache: $CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB"
  rm -rf "$CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB"
fi

# Also remove any stale result JSON so we don't skip-on-idempotence
if [ -f "$OUT" ]; then
  echo "removing stale result: $OUT"
  rm -f "$OUT"
fi

# Stop the background gateway and reap it; used on every exit path after start.
stop_gateway() {
  kill "$GATEWAY_PID" 2>/dev/null
  wait "$GATEWAY_PID" 2>/dev/null
}

# Start gateway with bumped heap
echo "Starting gateway on :18789 (heap=4GB) ..."
openclaw gateway --port 18789 > "$GWLOG" 2>&1 &
GATEWAY_PID=$!
echo "gateway pid=$GATEWAY_PID"

ready=0
for i in $(seq 1 120); do
  if curl -sf -H "Authorization: Bearer $OPENCLAW_GATEWAY_TOKEN" http://127.0.0.1:18789/health > /dev/null 2>&1; then
    echo "Gateway healthy after ${i}s"
    ready=1
    break
  fi
  sleep 1
done
if [ "$ready" -ne 1 ]; then
  echo "ERROR: gateway failed to come up within 120s"
  tail -30 "$GWLOG"
  stop_gateway
  exit 1
fi

echo "===== $(date '+%H:%M:%S') starting $SWEEP_LABEL ($SWEEP_MODEL) ====="
clawbench run \
  --model "$SWEEP_MODEL" \
  --runs 3 \
  --concurrency 4 \
  --profile "$SWEEP_PROFILE" \
  --judge-model "anthropic/claude-sonnet-4-6" \
  -o "$OUT" \
  > "$LOG" 2>&1
status=$?
if [ "$status" -eq 0 ]; then
  echo "===== $(date '+%H:%M:%S') done $SWEEP_LABEL (exit 0) ====="
else
  echo "===== $(date '+%H:%M:%S') FAILED $SWEEP_LABEL (exit $status) ====="
  tail -20 "$LOG"
fi

# Archive the cache for future audits (preserves transcripts per sweep tag).
# Archiving is best-effort: a failure is reported but never changes the sweep's
# exit status. The three outcomes are kept apart so a failed archive is not
# misreported as a missing helper.
ARCHIVE_HELPER="$(dirname "$0")/_archive_cache.sh"
if [ ! -f "$ARCHIVE_HELPER" ]; then
  echo "[archive] helper missing, skipping"
# shellcheck disable=SC1090,SC1091
elif ! source "$ARCHIVE_HELPER"; then
  echo "[archive] WARNING: could not source $ARCHIVE_HELPER, skipping"
else
  archive_run_cache || echo "[archive] WARNING: archive_run_cache failed (exit $?)"
fi

echo ""
echo "===== SINGLE-MODEL SWEEP END $(date '+%Y-%m-%d %H:%M:%S') ====="
stop_gateway
echo "gateway stopped"
exit "$status"

114
scripts/git_checkpoint.py Normal file
View File

@ -0,0 +1,114 @@
#!/usr/bin/env python3
"""Create an annotated git checkpoint tag for a clean working tree."""
from __future__ import annotations
import argparse
import re
import subprocess
import sys
from datetime import datetime
from pathlib import Path
def run_git(args: list[str], repo_root: Path, capture_output: bool = True) -> subprocess.CompletedProcess[str]:
    """Run ``git <args>`` with *repo_root* as the working directory.

    Output is handled as text. ``check=True`` is always passed, so a non-zero
    git exit status raises ``subprocess.CalledProcessError``. With
    ``capture_output=False`` git's stdout/stderr are not captured.
    """
    command = ["git"]
    command.extend(args)
    return subprocess.run(
        command,
        capture_output=capture_output,
        text=True,
        check=True,
        cwd=repo_root,
    )
def repo_root() -> Path:
    """Return the top-level directory of the git repository containing the CWD.

    Runs ``git rev-parse --show-toplevel`` in the current working directory.

    Raises:
        SystemExit: if the ``git`` executable cannot be started, or if git
            reports an error (e.g. the current directory is not in a repo).
    """
    try:
        result = subprocess.run(
            ["git", "rev-parse", "--show-toplevel"],
            check=True,
            text=True,
            capture_output=True,
        )
    except FileNotFoundError as exc:
        # Without this, a machine with no git installed gets a raw traceback.
        raise SystemExit("git executable not found on PATH.") from exc
    except subprocess.CalledProcessError as exc:
        raise SystemExit("Not inside a git repository.") from exc
    return Path(result.stdout.strip())
def sanitize_label(label: str) -> str:
    """Turn a human-readable checkpoint name into a tag-safe slug.

    The label is stripped and lower-cased, every run of characters outside
    ``a-z0-9`` becomes a single ``-``, and leading/trailing dashes are removed.
    The result is limited to 48 characters and never ends in ``-``.

    Raises:
        SystemExit: if nothing is left after sanitising (the label contains no
            ASCII letter or digit).
    """
    slug = re.sub(r"[^a-z0-9]+", "-", label.strip().lower()).strip("-")
    if not slug:
        raise SystemExit("Checkpoint name must contain at least one letter or number.")
    # Truncation can cut right after a separator; strip it so the tag name does
    # not end in a dangling dash. The slug starts with an alphanumeric, so the
    # result is never empty.
    return slug[:48].rstrip("-")
def ensure_clean_worktree(root: Path) -> None:
    """Abort unless ``git status --porcelain`` prints nothing for *root*.

    Raises:
        SystemExit: if the porcelain status output is non-empty.
    """
    porcelain = run_git(["status", "--porcelain"], root).stdout
    if not porcelain.strip():
        return
    raise SystemExit(
        "Working tree is not clean. Commit or stash your changes first, or rerun with --allow-dirty."
    )
def current_branch(root: Path) -> str:
    """Return the stripped output of ``git rev-parse --abbrev-ref HEAD`` in *root*."""
    completed = run_git(["rev-parse", "--abbrev-ref", "HEAD"], root)
    return completed.stdout.strip()
def tag_exists(root: Path, tag_name: str) -> bool:
    """Return True if ``git tag --list <tag_name>`` prints exactly *tag_name*.

    The output is compared for equality after stripping, so a listing with
    several matches or none yields False.
    """
    listing = run_git(["tag", "--list", tag_name], root).stdout
    return tag_name == listing.strip()
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the checkpoint script.

    The parser takes one positional ``name`` and two boolean switches,
    ``--allow-dirty`` and ``--dry-run``, both off by default.
    """
    cli = argparse.ArgumentParser(
        description="Create an annotated checkpoint tag for the current HEAD commit."
    )
    cli.add_argument("name", help="Human-readable checkpoint name, e.g. 'before benchmark rerun'.")

    switches = (
        ("--allow-dirty", "Allow tagging even if the working tree has uncommitted changes."),
        ("--dry-run", "Print the tag that would be created without modifying git state."),
    )
    for flag, help_text in switches:
        cli.add_argument(flag, action="store_true", help=help_text)
    return cli
def main() -> int:
    """Create (or, with ``--dry-run``, only print) a checkpoint tag for HEAD.

    The tag is named ``checkpoint/<YYYYmmdd-HHMMSS>-<slug>`` using the local
    time and the sanitised checkpoint name. Unless ``--allow-dirty`` is given,
    the working tree must be clean.

    Returns:
        0 on success.

    Raises:
        SystemExit: if the tree is dirty, the name is unusable, or the tag
            already exists.
    """
    options = build_parser().parse_args()
    root = repo_root()
    if not options.allow_dirty:
        ensure_clean_worktree(root)

    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    branch = current_branch(root)
    slug = sanitize_label(options.name)
    tag_name = f"checkpoint/{timestamp}-{slug}"
    if tag_exists(root, tag_name):
        raise SystemExit(f"Checkpoint tag already exists: {tag_name}")

    message = f"Checkpoint '{options.name}' from branch '{branch}' at {timestamp}"
    if options.dry_run:
        # Report what would happen without touching git state.
        print(tag_name, message, sep="\n")
        return 0

    # Output is not captured here, so git writes straight to the terminal.
    run_git(["tag", "-a", tag_name, "-m", message], root, capture_output=False)
    print(f"Created {tag_name}")
    print(f"Push it with: git push origin {tag_name}")
    return 0
if __name__ == "__main__":
    # Script entry point: exit with main()'s return value. If a git command
    # fails, forward whatever stderr was captured and exit with git's status.
    try:
        exit_code = main()
    except subprocess.CalledProcessError as git_error:
        captured = git_error.stderr
        if captured:
            sys.stderr.write(captured)
        raise SystemExit(git_error.returncode) from git_error
    raise SystemExit(exit_code)