scripts/container_sweep_single.sh: clone pristine OpenClaw state to /tmp/ per sweep before starting the gateway. Carries over config (openclaw.json, identity/, devices/, exec-approvals.json, tasks/, subagents/, flows/, cron/) but leaves runtime dirs (agents/, workspace*/, logs/, memory/, cache/) empty. Sets OPENCLAW_STATE_DIR to the isolated dir so the gateway writes to /tmp instead of the shared host mount. Fixes the cascading "RPC agents.create timed out after 60s" failures caused by 4k+ stale agents accumulating across sequential sweeps. profiles/frontier_qwen_3_6.yaml: fix base_model from openrouter/qwen/qwen-3.6-plus (with dash) to openrouter/qwen/qwen3.6-plus (no dash). The dashed slug is unknown to OpenRouter and silently fails; the no-dash version is the canonical slug. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
176 lines
7.0 KiB
Bash
Executable File
176 lines
7.0 KiB
Bash
Executable File
#!/bin/bash
|
|
# Single-model sweep with fresh gateway + bumped Node heap to prevent OOM.
|
|
#
|
|
# Invocation (from host):
|
|
# docker run -d --name clawbench-sweep-<LABEL> \
|
|
# -e SWEEP_LABEL=<label> -e SWEEP_MODEL=<routed-model> -e SWEEP_PROFILE=<abs-profile-path> \
|
|
# -v .../scripts:/home/node/app/scripts:ro \
|
|
# -v .../data:/data \
|
|
# -v .../data/container-home-openclaw:/home/node/.openclaw \
|
|
# -v .../profiles:/home/node/app/profiles:ro \
|
|
# --memory 8g \
|
|
# clawbench-clawbench:latest \
|
|
# bash /home/node/app/scripts/container_sweep_single.sh
|
|
#
|
|
# Differences vs container_sweep.sh:
|
|
# - Bumps gateway Node.js heap via NODE_OPTIONS=--max-old-space-size=4096 (prevents 2GB OOM we saw at ~4h)
|
|
# - One model per container (no shared-gateway drift between models)
|
|
# - Force-clears run_cache for THIS model before running (prevents cache-replay masking)
|
|
# - Writes to the same $LOGDIR/docker_${label}_${SWEEP_OUT_TAG}.json as the original sweep
|
|
# so generate_drift_report.py picks it up without changes
|
|
|
|
# Treat any reference to an unset variable as a fatal error.
set -o nounset

# Required inputs: the script aborts with the message shown if any of these
# is unset or empty.
: "${SWEEP_LABEL:?SWEEP_LABEL required (e.g. glm, minimax, kimi)}"
: "${SWEEP_MODEL:?SWEEP_MODEL required (e.g. openrouter/z-ai/glm-5.1)}"
: "${SWEEP_PROFILE:?SWEEP_PROFILE required (absolute path in container)}"

# Optional overrides (defaults target the v4.14 drift sweep):
#   SWEEP_LOGDIR  — where JSONs and logs go (default /data/drift_2026-04-14)
#   SWEEP_OUT_TAG — tag embedded in output filename (default v2026-4-14)
# An unset or empty value falls back to the default.
SWEEP_LOGDIR="${SWEEP_LOGDIR:-/data/drift_2026-04-14}"
SWEEP_OUT_TAG="${SWEEP_OUT_TAG:-v2026-4-14}"
|
|
|
|
# Work from the /data mount. Abort if it is unavailable: carrying on from
# whatever directory we happened to start in would run the sweep in the wrong
# place.
cd /data || {
  echo "ERROR: cannot cd to /data (is the data volume mounted?)" >&2
  exit 1
}

# All result JSONs and logs for this sweep are written under LOGDIR.
LOGDIR="$SWEEP_LOGDIR"
mkdir -p "$LOGDIR" || {
  echo "ERROR: cannot create log dir $LOGDIR" >&2
  exit 1
}

# Token sent as the Bearer credential when probing the gateway health endpoint.
export OPENCLAW_GATEWAY_TOKEN="local-dev-token-for-testing"

# Location of the clawbench run cache; created up front so later steps can
# assume it exists.
export CLAWBENCH_RUN_CACHE_DIR="/data/run_cache"
mkdir -p "$CLAWBENCH_RUN_CACHE_DIR" || {
  echo "ERROR: cannot create run cache dir $CLAWBENCH_RUN_CACHE_DIR" >&2
  exit 1
}

# OOM fix: give the gateway Node process a 4GB old-space ceiling instead of the
# default ~2GB. The variable is exported, so every Node process launched from
# this script inherits it; per the original note clawbench itself is python, so
# in practice only the gateway is affected.
export NODE_OPTIONS="--max-old-space-size=4096"
|
|
|
|
# State-dir isolation: the shared /home/node/.openclaw mount accumulates cruft
# across sweeps (agents/, workspace/, logs/, memory/, stale openclaw.json.*.tmp)
# which triggers gateway hot-reload churn and cascading `RPC agents.create timed
# out after 60s` failures. Give each sweep a pristine state dir that carries
# over only the config (openclaw.json, identity/, devices/, exec-approvals.json,
# tasks/, subagents/, flows/, cron/) and leaves runtime state empty.
#
# Inputs:  SWEEP_LABEL (embedded in the temp dir name);
#          SWEEP_SRC_STATE (optional; source state dir, default
#          /home/node/.openclaw).
# Outputs: SRC_STATE, FRESH_STATE, exported OPENCLAW_STATE_DIR.
SRC_STATE="${SWEEP_SRC_STATE:-/home/node/.openclaw}"

# mktemp guarantees a brand-new, uniquely named directory. A "$$"-based name is
# predictable and repeats whenever the script runs with the same PID, in which
# case a leftover directory from an aborted run would be silently reused and
# `cp -r` would nest the config dirs inside the stale copies.
FRESH_STATE="$(mktemp -d "/tmp/openclaw-state-${SWEEP_LABEL}-XXXXXX")" || {
  echo "[state-isolate] ERROR: could not create isolated state dir under /tmp" >&2
  exit 1
}

# Remove the isolated dir on every exit path, including early aborts. The `:?`
# guard keeps rm from ever running against an empty path.
trap 'rm -rf -- "${FRESH_STATE:?}"' EXIT

echo "[state-isolate] cloning config from $SRC_STATE to $FRESH_STATE"

# Copy only the canonical config files; the .tmp/.bak/.clobbered/.pre-* siblings
# can confuse the loader and are deliberately left behind.
for state_file in openclaw.json exec-approvals.json; do
  if [ -f "$SRC_STATE/$state_file" ]; then
    cp -- "$SRC_STATE/$state_file" "$FRESH_STATE/$state_file" \
      || echo "[state-isolate] WARNING: failed to copy $state_file" >&2
  fi
done

# Carry over static config dirs — these are read-mostly and don't accumulate
# per-run cruft. SKIP: agents/ workspace*/ logs/ memory/ cache/ browser/ canvas/
# which all grow unboundedly across sweeps.
for d in identity devices tasks subagents flows cron; do
  if [ -d "$SRC_STATE/$d" ]; then
    cp -r -- "$SRC_STATE/$d" "$FRESH_STATE/$d" \
      || echo "[state-isolate] WARNING: failed to copy $d/" >&2
  fi
done

# Ensure runtime dirs exist but are empty.
mkdir -p \
  "$FRESH_STATE/agents" \
  "$FRESH_STATE/workspace" \
  "$FRESH_STATE/logs" \
  "$FRESH_STATE/memory" \
  "$FRESH_STATE/cache"

# Point the gateway at the isolated copy instead of the shared mount.
export OPENCLAW_STATE_DIR="$FRESH_STATE"
echo "[state-isolate] OPENCLAW_STATE_DIR=$OPENCLAW_STATE_DIR"
du -sh "$FRESH_STATE" 2>/dev/null | sed 's/^/[state-isolate] size: /'
|
|
|
|
# Resolve CACHE_SUB, the run_cache subdirectory for the model under test (per
# the original note, this matches what clawbench writes). Only models on the
# allowlist below get a cache subdir; for each of them the name is the model
# slug with every "/" replaced by "_". Any other model leaves CACHE_SUB empty,
# which makes the later cache-clearing step a no-op.
# kimi-k2.6 is not yet supported in the openclaw version under test — skip.
CACHE_SUB=""
for known_model in \
  anthropic/claude-opus-4-7 \
  anthropic/claude-sonnet-4-7 \
  anthropic/claude-opus-4-6 \
  anthropic/claude-sonnet-4-6 \
  openai/gpt-5.4 \
  openai/gpt-5.2 \
  google/gemini-3.1-pro-preview \
  openrouter/z-ai/glm-5.1 \
  openrouter/qwen/qwen3.6-plus \
  openrouter/minimax/minimax-m2.7 \
  openrouter/moonshotai/kimi-k2.5
do
  if [ "$SWEEP_MODEL" = "$known_model" ]; then
    CACHE_SUB="${SWEEP_MODEL//\//_}"
    break
  fi
done
unset known_model
|
|
|
|
# Per-sweep artefact paths, all under LOGDIR:
#   OUT   — result JSON written by clawbench
#   LOG   — clawbench stdout/stderr
#   GWLOG — gateway stdout/stderr
sweep_stem="docker_${SWEEP_LABEL}_${SWEEP_OUT_TAG}"
OUT="$LOGDIR/${sweep_stem}.json"
LOG="$LOGDIR/${sweep_stem}.log"
GWLOG="$LOGDIR/gateway_${SWEEP_LABEL}.log"

# Start-of-sweep banner: timestamp followed by the effective settings.
printf '===== SINGLE-MODEL SWEEP START %s =====\n' "$(date '+%Y-%m-%d %H:%M:%S')"
printf '%s\n' \
  "label: $SWEEP_LABEL" \
  "model: $SWEEP_MODEL" \
  "profile: $SWEEP_PROFILE" \
  "out: $OUT" \
  "gwlog: $GWLOG" \
  "NODE_OPTIONS: $NODE_OPTIONS"
|
|
|
|
# Force a genuine re-run for this model: drop its run_cache subdirectory so
# nothing is replayed. Skipped when no cache subdir was resolved (CACHE_SUB is
# empty) or when the directory does not exist.
model_cache_dir="$CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB"
if [[ -n "$CACHE_SUB" && -d "$model_cache_dir" ]]; then
  echo "clearing cache: $model_cache_dir"
  rm -rf "$model_cache_dir"
fi

# Likewise drop any result JSON left by a previous sweep so the run is not
# skipped on idempotence.
if [[ -f "$OUT" ]]; then
  echo "removing stale result: $OUT"
  rm -f "$OUT"
fi
|
|
|
|
# Start the gateway in the background (it inherits the bumped heap from
# NODE_OPTIONS) and send its stdout/stderr to GWLOG. GATEWAY_PID is kept so the
# process can be stopped and reaped later.
echo "Starting gateway on :18789 (heap=4GB) ..."
openclaw gateway --port 18789 > "$GWLOG" 2>&1 &
GATEWAY_PID=$!
echo "gateway pid=$GATEWAY_PID"

# Poll the health endpoint once per second, for up to 120 attempts.
# --max-time bounds each probe so a connection that is accepted but never
# answered cannot stall the loop indefinitely.
ready=0
for ((i = 1; i <= 120; i++)); do
  if curl -sf --max-time 5 \
    -H "Authorization: Bearer $OPENCLAW_GATEWAY_TOKEN" \
    http://127.0.0.1:18789/health > /dev/null 2>&1; then
    echo "Gateway healthy after ${i}s"
    ready=1
    break
  fi
  sleep 1
done

if [ "$ready" -ne 1 ]; then
  echo "ERROR: gateway failed to come up within 120s"
  tail -n 30 "$GWLOG"
  # Stop the half-started gateway before bailing out; otherwise it can linger,
  # keep :18789 bound, and answer the health probe of a later sweep.
  kill "$GATEWAY_PID" 2>/dev/null
  wait "$GATEWAY_PID" 2>/dev/null
  exit 1
fi
|
|
|
|
echo "===== $(date '+%H:%M:%S') starting $SWEEP_LABEL ($SWEEP_MODEL) ====="

# Run the benchmark: 3 runs per task, 4 concurrent workers, judged by
# claude-sonnet-4-6. All clawbench output goes to LOG; its exit code is kept in
# `status`.
clawbench_args=(
  --model "$SWEEP_MODEL"
  --runs 3
  --concurrency 4
  --profile "$SWEEP_PROFILE"
  --judge-model "anthropic/claude-sonnet-4-6"
  -o "$OUT"
)
clawbench run "${clawbench_args[@]}" > "$LOG" 2>&1
status=$?

# Report the outcome; on failure also show the tail of the clawbench log.
finished_at="$(date '+%H:%M:%S')"
case "$status" in
  0)
    echo "===== $finished_at done $SWEEP_LABEL (exit 0) ====="
    ;;
  *)
    echo "===== $finished_at FAILED $SWEEP_LABEL (exit $status) ====="
    tail -n 20 "$LOG"
    ;;
esac
|
|
|
|
# Archive the cache for future audits (preserves transcripts per sweep tag).
# Best-effort: none of the outcomes below changes the sweep's exit status.
# The three cases are reported separately so that a failing archive is not
# mislabelled as a missing helper.
archive_helper="$(dirname "$0")/_archive_cache.sh"
if [ -f "$archive_helper" ]; then
  # shellcheck disable=SC1090,SC1091
  if source "$archive_helper"; then
    archive_run_cache \
      || echo "[archive] archive_run_cache failed (exit $?), continuing"
  else
    echo "[archive] failed to load $archive_helper, skipping"
  fi
else
  echo "[archive] helper missing, skipping"
fi
|
|
|
|
printf '\n'
echo "===== SINGLE-MODEL SWEEP END $(date '+%Y-%m-%d %H:%M:%S') ====="

# Stop the gateway and reap it; errors are ignored in case it already exited.
kill "$GATEWAY_PID" 2>/dev/null
wait "$GATEWAY_PID" 2>/dev/null
echo "gateway stopped"

# Clean up the isolated state dir (don't accumulate /tmp cruft across sweeps).
if [[ -n "${FRESH_STATE:-}" && -d "${FRESH_STATE:-}" ]]; then
  echo "[state-isolate] removing $FRESH_STATE"
  rm -rf "$FRESH_STATE"
fi

# Propagate the clawbench exit code as the script's own.
exit "$status"
|