clawbench/scripts/container_sweep_single.sh
scoootscooob 01a31e55fb sweep: per-container state isolation + qwen model-id fix
scripts/container_sweep_single.sh: clone pristine OpenClaw state to
/tmp/ per sweep before starting the gateway. Carries over config
(openclaw.json, identity/, devices/, exec-approvals.json, tasks/,
subagents/, flows/, cron/) but leaves runtime dirs (agents/,
workspace*/, logs/, memory/, cache/) empty. Sets OPENCLAW_STATE_DIR
to the isolated dir so the gateway writes to /tmp instead of the
shared host mount. Fixes the cascading "RPC agents.create timed out
after 60s" failures caused by 4k+ stale agents accumulating across
sequential sweeps.

profiles/frontier_qwen_3_6.yaml: fix base_model from
openrouter/qwen/qwen-3.6-plus (with dash) to openrouter/qwen/qwen3.6-plus
(no dash). The dashed slug is unknown to OpenRouter and silently fails;
the no-dash version is the canonical slug.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 19:48:30 -07:00

176 lines
7.0 KiB
Bash
Executable File

#!/bin/bash
# Single-model sweep with fresh gateway + bumped Node heap to prevent OOM.
#
# Invocation (from host):
# docker run -d --name clawbench-sweep-<LABEL> \
# -e SWEEP_LABEL=<label> -e SWEEP_MODEL=<routed-model> -e SWEEP_PROFILE=<abs-profile-path> \
# -v .../scripts:/home/node/app/scripts:ro \
# -v .../data:/data \
# -v .../data/container-home-openclaw:/home/node/.openclaw \
# -v .../profiles:/home/node/app/profiles:ro \
# --memory 8g \
# clawbench-clawbench:latest \
# bash /home/node/app/scripts/container_sweep_single.sh
#
# Differences vs container_sweep.sh:
# - Bumps gateway Node.js heap via NODE_OPTIONS=--max-old-space-size=4096 (prevents 2GB OOM we saw at ~4h)
# - One model per container (no shared-gateway drift between models)
# - Force-clears run_cache for THIS model before running (prevents cache-replay masking)
# - Writes to the same $LOGDIR/docker_${label}_${SWEEP_OUT_TAG}.json as the original sweep
# so generate_drift_report.py picks it up without changes
# Treat any reference to an unset variable as a fatal error.
set -u

# Required inputs: `${VAR:?msg}` prints msg and aborts when VAR is unset or empty.
: "${SWEEP_LABEL:?SWEEP_LABEL required (e.g. glm, minimax, kimi)}"
: "${SWEEP_MODEL:?SWEEP_MODEL required (e.g. openrouter/z-ai/glm-5.1)}"
: "${SWEEP_PROFILE:?SWEEP_PROFILE required (absolute path in container)}"

# Optional overrides (defaults target the v4.14 drift sweep):
#   SWEEP_LOGDIR  — where JSONs and logs go (default /data/drift_2026-04-14)
#   SWEEP_OUT_TAG — tag embedded in output filename (default v2026-4-14)
: "${SWEEP_LOGDIR:=/data/drift_2026-04-14}"
: "${SWEEP_OUT_TAG:=v2026-4-14}"

# Fail fast when the data volume is unusable: continuing from the wrong
# directory would run the whole sweep and write its results somewhere they
# are not expected.
cd /data || {
  echo "ERROR: cannot cd to /data (is the data volume mounted?)"
  exit 1
}

LOGDIR="$SWEEP_LOGDIR"
# Every later redirect (gateway log, clawbench log) targets $LOGDIR, so an
# uncreatable log dir is fatal rather than something to discover 120s later.
mkdir -p "$LOGDIR" || {
  echo "ERROR: cannot create log dir $LOGDIR"
  exit 1
}

export OPENCLAW_GATEWAY_TOKEN="local-dev-token-for-testing"
export CLAWBENCH_RUN_CACHE_DIR="/data/run_cache"
mkdir -p "$CLAWBENCH_RUN_CACHE_DIR"

# OOM fix: give the gateway Node process a 4GB old-space ceiling instead of the default ~2GB.
# Scoped via env so we don't stomp on other Node processes (clawbench itself is python).
export NODE_OPTIONS="--max-old-space-size=4096"
# State-dir isolation: the shared /home/node/.openclaw mount accumulates cruft
# across sweeps (agents/, workspace/, logs/, memory/, stale openclaw.json.*.tmp)
# which triggers gateway hot-reload churn and cascading `RPC agents.create timed
# out after 60s` failures. Give each sweep a pristine state dir that carries
# over only the config (openclaw.json, identity/, devices/, exec-approvals.json,
# tasks/, subagents/, flows/, cron/) and leaves runtime state empty.
SRC_STATE="/home/node/.openclaw"

# mktemp -d always creates a brand-new, empty directory. A name built from the
# label and $$ can collide with a directory left behind by an earlier run that
# exited before cleanup; `cp -r` would then nest (identity/identity) and stale
# runtime state would survive, defeating the isolation.
FRESH_STATE="$(mktemp -d "/tmp/openclaw-state-${SWEEP_LABEL}-XXXXXX")" || {
  echo "[state-isolate] ERROR: could not create an isolated state dir under /tmp"
  exit 1
}

# Remove the isolated state dir on every exit path, including early exits.
# If a gateway was started (GATEWAY_PID set), signal it first so it is not left
# running against a deleted state dir. No `wait` here: the trap must never
# block the script's exit. The script's exit status is not affected.
cleanup_isolated_state() {
  if [ -n "${GATEWAY_PID:-}" ]; then
    kill "$GATEWAY_PID" 2>/dev/null
  fi
  if [ -n "${FRESH_STATE:-}" ] && [ -d "$FRESH_STATE" ]; then
    rm -rf -- "$FRESH_STATE"
  fi
}
trap cleanup_isolated_state EXIT

echo "[state-isolate] cloning config from $SRC_STATE to $FRESH_STATE"

# Copy only the canonical config files (skip the .tmp/.bak/.clobbered/.pre-*
# cruft that can confuse the loader).
for cfg_file in openclaw.json exec-approvals.json; do
  if [ -f "$SRC_STATE/$cfg_file" ]; then
    cp -- "$SRC_STATE/$cfg_file" "$FRESH_STATE/$cfg_file" \
      || echo "[state-isolate] WARNING: failed to copy $cfg_file"
  fi
done

# Carry over static config dirs — these are read-mostly and don't accumulate
# per-run cruft. SKIP: agents/ workspace*/ logs/ memory/ cache/ browser/ canvas/
# which all grow unboundedly across sweeps.
for d in identity devices tasks subagents flows cron; do
  if [ -d "$SRC_STATE/$d" ]; then
    cp -r -- "$SRC_STATE/$d" "$FRESH_STATE/$d" \
      || echo "[state-isolate] WARNING: failed to copy $d/"
  fi
done

# Ensure runtime dirs exist but are empty.
mkdir -p \
  "$FRESH_STATE/agents" \
  "$FRESH_STATE/workspace" \
  "$FRESH_STATE/logs" \
  "$FRESH_STATE/memory" \
  "$FRESH_STATE/cache"

export OPENCLAW_STATE_DIR="$FRESH_STATE"
echo "[state-isolate] OPENCLAW_STATE_DIR=$OPENCLAW_STATE_DIR"
du -sh "$FRESH_STATE" 2>/dev/null | sed 's/^/[state-isolate] size: /'
# Map the routed model id -> run_cache subdir (matches what clawbench writes).
# For every model listed here the subdir is the model id with each "/" replaced
# by "_", so the value is derived from the key instead of being typed twice.
# Arguments: $1 - routed model id (e.g. openrouter/z-ai/glm-5.1)
# Outputs:   the cache subdir name on stdout
# Returns:   0 for a known model, 1 (and no output) for an unmapped one
cache_subdir_for_model() {
  case "$1" in
    anthropic/claude-opus-4-7 | anthropic/claude-sonnet-4-7 | \
    anthropic/claude-opus-4-6 | anthropic/claude-sonnet-4-6 | \
    openai/gpt-5.4 | openai/gpt-5.2 | \
    google/gemini-3.1-pro-preview | \
    openrouter/z-ai/glm-5.1 | \
    openrouter/qwen/qwen3.6-plus | \
    openrouter/minimax/minimax-m2.7 | \
    openrouter/moonshotai/kimi-k2.5)
      printf '%s\n' "${1//\//_}"
      ;;
    # kimi-k2.6 is not yet supported in the openclaw version under test — skip.
    *)
      return 1
      ;;
  esac
}

# An unmapped model leaves CACHE_SUB empty, which means its run_cache is NOT
# force-cleared. Say so explicitly: a silent skip can mask cache replays.
if ! CACHE_SUB="$(cache_subdir_for_model "$SWEEP_MODEL")"; then
  CACHE_SUB=""
  echo "WARNING: no run_cache mapping for model '$SWEEP_MODEL'; its cache will NOT be force-cleared"
fi
# Per-sweep artifact paths: the result JSON and the clawbench log share one
# stem; the gateway log is keyed by label only.
artifact_stem="docker_${SWEEP_LABEL}_${SWEEP_OUT_TAG}"
OUT="$LOGDIR/${artifact_stem}.json"
LOG="$LOGDIR/${artifact_stem}.log"
GWLOG="$LOGDIR/gateway_${SWEEP_LABEL}.log"

# Start banner: echo the effective configuration so the container log records
# exactly what this sweep ran with.
printf '===== SINGLE-MODEL SWEEP START %s =====\n' "$(date '+%Y-%m-%d %H:%M:%S')"
printf '%s: %s\n' \
  "label" "$SWEEP_LABEL" \
  "model" "$SWEEP_MODEL" \
  "profile" "$SWEEP_PROFILE" \
  "out" "$OUT" \
  "gwlog" "$GWLOG" \
  "NODE_OPTIONS" "$NODE_OPTIONS"
# Drop this model's run_cache directory so the sweep actually re-runs instead
# of replaying cached results. Skipped when no cache subdir is mapped
# (CACHE_SUB empty) or the directory does not exist.
if [[ -n "$CACHE_SUB" ]]; then
  model_cache_dir="$CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB"
  if [[ -d "$model_cache_dir" ]]; then
    printf 'clearing cache: %s\n' "$model_cache_dir"
    rm -rf "$model_cache_dir"
  fi
fi

# Likewise delete a previous result JSON so the run is not skipped as already done.
if [[ -f "$OUT" ]]; then
  printf 'removing stale result: %s\n' "$OUT"
  rm -f "$OUT"
fi
# Start the gateway in the background with its output captured in $GWLOG.
# The bumped heap comes from the NODE_OPTIONS value exported earlier.
echo "Starting gateway on :18789 (heap=4GB) ..."
openclaw gateway --port 18789 > "$GWLOG" 2>&1 &
GATEWAY_PID=$!
echo "gateway pid=$GATEWAY_PID"

# Poll /health up to 120 times, one second apart. --max-time bounds each
# probe: without it a connection that is accepted but never answered would
# block curl indefinitely and the 120-attempt budget would never expire.
ready=0
for ((i = 1; i <= 120; i++)); do
  if curl -sf --max-time 5 \
      -H "Authorization: Bearer $OPENCLAW_GATEWAY_TOKEN" \
      http://127.0.0.1:18789/health > /dev/null 2>&1; then
    echo "Gateway healthy after ${i}s"
    ready=1
    break
  fi
  sleep 1
done

if [ "$ready" -ne 1 ]; then
  echo "ERROR: gateway failed to come up within 120s"
  tail -30 "$GWLOG"
  # Do not leave the gateway we started running after giving up on it.
  # Signal only (no wait) so a wedged process cannot block the exit.
  kill "$GATEWAY_PID" 2>/dev/null
  exit 1
fi
# Run the benchmark for this one model. All clawbench output goes to $LOG; its
# exit code is kept in $status so the script can finish teardown and still
# exit with it.
echo "===== $(date '+%H:%M:%S') starting $SWEEP_LABEL ($SWEEP_MODEL) ====="
bench_args=(
  run
  --model "$SWEEP_MODEL"
  --runs 3
  --concurrency 4
  --profile "$SWEEP_PROFILE"
  --judge-model "anthropic/claude-sonnet-4-6"
  -o "$OUT"
)
if clawbench "${bench_args[@]}" > "$LOG" 2>&1; then
  status=0
  echo "===== $(date '+%H:%M:%S') done $SWEEP_LABEL (exit 0) ====="
else
  # Inside the else branch $? is still the exit code of the failed command.
  status=$?
  echo "===== $(date '+%H:%M:%S') FAILED $SWEEP_LABEL (exit $status) ====="
  # Surface the end of the log in the container output for quick triage.
  tail -20 "$LOG"
fi
# Archive the cache for future audits (preserves transcripts per sweep tag).
# Best-effort: no failure here aborts the sweep or changes its exit status, but
# each failure mode is reported distinctly. A single `a && b || c` chain would
# print "helper missing" even when the helper loaded and archiving itself failed.
# Arguments: $1 - path to the helper script that defines archive_run_cache
# Outputs:   a one-line "[archive] ..." notice on stdout when a step is skipped or fails
# Returns:   always 0
archive_cache_best_effort() {
  local helper=$1
  if [ ! -f "$helper" ]; then
    echo "[archive] helper missing, skipping"
    return 0
  fi
  # shellcheck disable=SC1090
  if ! source "$helper" 2>/dev/null; then
    echo "[archive] helper failed to load, skipping"
    return 0
  fi
  if ! archive_run_cache; then
    echo "[archive] archive_run_cache failed, continuing"
  fi
  return 0
}
archive_cache_best_effort "$(dirname "$0")/_archive_cache.sh"
printf '\n'
printf '===== SINGLE-MODEL SWEEP END %s =====\n' "$(date '+%Y-%m-%d %H:%M:%S')"

# Stop the background gateway and reap it; errors are silenced in case it is
# already gone.
{
  kill "$GATEWAY_PID"
  wait "$GATEWAY_PID"
} 2>/dev/null
echo "gateway stopped"

# Clean up the isolated state dir (don't accumulate /tmp cruft across sweeps).
isolated_dir="${FRESH_STATE:-}"
if [[ -n "$isolated_dir" && -d "$isolated_dir" ]]; then
  echo "[state-isolate] removing $isolated_dir"
  rm -rf "$isolated_dir"
fi

# Exit with clawbench's status so the container's exit code reflects the run.
exit $status