clawbench: per-sweep cache archiving + generic sweep templates
- scripts/_archive_cache.sh: snapshot run_cache/<model>/ to run_cache_archive/<sweep_tag>/<model>/ at sweep exit, with a per-tag metadata.json. Sourced by sweep scripts so transcripts survive the next sweep's cache wipe and stay available for audits.
- scripts/container_sweep_single.sh: base multi-model sweep. Adds CACHE_SUB entries for claude-opus-4-7 / claude-sonnet-4-7 so their caches are force-cleared at sweep start. Calls the archive helper on exit.
- scripts/container_sweep_minimal.sh: 1-run-per-task variant for fast fix validation (~20 min) instead of the full 3-run sweep (~60 min).
- Dockerfile.main: parametrized clawbench-on-openclaw image with ARG BASE for pinning to any openclaw tag.
- scripts/git_checkpoint.py + README: documented checkpoint workflow for tagging known-good states during risky work.
- .gitignore: un-ignore scripts/, keep targeted ignores for __pycache__, .tmp, .local.py.

Co-Authored-By: Claude Opus 4 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
fe8fef7795
commit
8a5be9c686
4
.gitignore
vendored
4
.gitignore
vendored
@ -12,4 +12,6 @@ data/
|
||||
.DS_Store
|
||||
.clawbench/
|
||||
reports/
|
||||
scripts/
|
||||
scripts/__pycache__/
|
||||
scripts/*.tmp
|
||||
scripts/*.local.py
|
||||
|
||||
51
Dockerfile.main
Normal file
51
Dockerfile.main
Normal file
@ -0,0 +1,51 @@
|
||||
# ClawBench HF Docker Space - main variant
#
# Layers the benchmark harness on top of a locally built OpenClaw image from
# upstream/main. That base already includes native claude-opus-4-7 support
# (commit 628b454eff "feat: default Anthropic to Opus 4.7"), so this file
# needs no patch step.
#
# Pin a different OpenClaw build with:
#   docker build -f Dockerfile.main --build-arg BASE=<image:tag> .

ARG BASE=openclaw-main-opus47:latest
FROM ${BASE}

# Package installation and the symlinks below need root.
USER root

# pip installs the harness; python-is-python3 supplies the `python` command
# that CMD invokes.
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update \
    && apt-get install -y python3-pip python-is-python3 \
    && rm -rf /var/lib/apt/lists/*

# Expose the base image's /app under the /openclaw alias.
RUN ln -s /app /openclaw

# Install Playwright's Chromium into a fixed location, then publish the first
# (sorted) `chrome` binary found there as /usr/bin/chromium. The build fails
# if no executable binary is found.
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
RUN npx -y playwright@1.59.1 install --with-deps chromium \
    && CHROME_PATH="$(find /ms-playwright -path '*/chrome' -type f | sort | head -n 1)" \
    && test -x "$CHROME_PATH" \
    && ln -sf "$CHROME_PATH" /usr/bin/chromium

ENV HOME=/home/node
ENV PATH=/home/node/.local/bin:$PATH
WORKDIR /home/node/app

# Harness sources, owned by the unprivileged runtime user.
COPY --chown=node:node pyproject.toml README.md ./
COPY --chown=node:node clawbench/ clawbench/
COPY --chown=node:node tasks/ tasks/
COPY --chown=node:node baselines/ baselines/
COPY --chown=node:node app.py .

# --break-system-packages: install into the system interpreter, no venv.
RUN python3 -m pip install --break-system-packages --no-cache-dir .

# Pre-create the data and OpenClaw state directories and open them up so the
# runtime user (and bind mounts placed over them) can write.
RUN mkdir -p \
        /data/results \
        /data/queue \
        /home/node/.openclaw/agents/dev \
        /home/node/.openclaw/agents/main/agent \
    && chown -R node:node /data /home/node/.openclaw \
    && chmod -R 777 /data /home/node/.openclaw

# Drop privileges for runtime.
USER node

ENV GATEWAY_PORT=18789 \
    OPENCLAW_HOME=/home/node \
    OPENCLAW_STATE_DIR=/home/node/.openclaw

# NOTE(review): presumably the port app.py serves on — confirm against app.py.
EXPOSE 7860
CMD ["python", "app.py"]
|
||||
22
README.md
22
README.md
@ -293,6 +293,28 @@ but writes to wrong paths or misses format constraints. This gap is where
|
||||
profile-level improvements (workspace-aware prompts, path-checking pre-flight
|
||||
calls, retry wrappers) have the most leverage.
|
||||
|
||||
### Version control checkpoints
|
||||
|
||||
Git is already the source of truth for this repo, but the safest workflow is:
|
||||
|
||||
```bash
|
||||
# Start risky work on its own branch
|
||||
git switch -c codex/<short-topic>
|
||||
|
||||
# Commit small checkpoints as you go
|
||||
git add -A
|
||||
git commit -m "Checkpoint: describe the working state"
|
||||
|
||||
# Mark a known-good version with an annotated tag
|
||||
python3 scripts/git_checkpoint.py "before-profile-tuning"
|
||||
|
||||
# Push the branch and tags so recovery is not only local
|
||||
git push -u origin HEAD
|
||||
git push origin --tags
|
||||
```
|
||||
|
||||
The checkpoint script refuses to tag a dirty worktree by default, so every saved version points at a reproducible commit instead of a half-finished local state.
|
||||
|
||||
### Docker (recommended for reproducibility)
|
||||
|
||||
```bash
|
||||
|
||||
58
scripts/_archive_cache.sh
Executable file
58
scripts/_archive_cache.sh
Executable file
@ -0,0 +1,58 @@
|
||||
#!/bin/bash
|
||||
# Shared helper sourced by container_sweep_*.sh scripts to snapshot the
|
||||
# per-model run_cache after a sweep completes. Called at END of each sweep.
|
||||
#
|
||||
# Requires these env vars (already set by parent script):
|
||||
# CLAWBENCH_RUN_CACHE_DIR - e.g. /data/run_cache
|
||||
# CACHE_SUB - e.g. openai_gpt-5.4
|
||||
# SWEEP_OUT_TAG - e.g. v2026-4-18-pr68627-gpt54
|
||||
# SWEEP_LABEL - e.g. gpt54
|
||||
# SWEEP_LOGDIR - e.g. /data/drift_2026-04-18-pr68627-gpt54
|
||||
#
|
||||
# Writes snapshot to: /data/run_cache_archive/<SWEEP_OUT_TAG>/<CACHE_SUB>/
|
||||
# Also writes a metadata.json with sweep label/model/timestamp for indexing.
|
||||
|
||||
#######################################
# Snapshot one model's run cache into the per-sweep archive and record it
# in the tag-level metadata.json.
#
# Globals (read):
#   CACHE_SUB                        cache subdirectory of the model; when
#                                    empty/unset the archive is skipped
#   CLAWBENCH_RUN_CACHE_DIR          cache root (default /data/run_cache)
#   CLAWBENCH_RUN_CACHE_ARCHIVE_DIR  archive root
#                                    (default /data/run_cache_archive)
#   SWEEP_OUT_TAG                    archive subdirectory (default "untagged")
#   SWEEP_LABEL, SWEEP_MODEL, SWEEP_LOGDIR
#                                    copied verbatim into metadata.json
# Outputs:
#   Progress lines on stdout, errors/warnings on stderr.
# Returns:
#   0 when the snapshot was copied or there was nothing to archive;
#   1 when the archive directory could not be created or the copy failed.
#   A metadata.json failure only produces a warning: the transcripts are
#   the part that matters for audits.
#######################################
archive_run_cache() {
  if [ -z "${CACHE_SUB:-}" ]; then
    echo "[archive] skipped: no CACHE_SUB configured"
    return 0
  fi

  local src="${CLAWBENCH_RUN_CACHE_DIR:-/data/run_cache}/$CACHE_SUB"
  if [ ! -d "$src" ]; then
    echo "[archive] skipped: cache dir $src missing"
    return 0
  fi

  local archive_root="${CLAWBENCH_RUN_CACHE_ARCHIVE_DIR:-/data/run_cache_archive}"
  local dest_root="$archive_root/${SWEEP_OUT_TAG:-untagged}"
  local dest="$dest_root/$CACHE_SUB"

  if ! mkdir -p -- "$dest_root"; then
    echo "[archive] ERROR: cannot create $dest_root" >&2
    return 1
  fi

  # Idempotent: re-running replaces the prior snapshot for this tag/model.
  rm -rf -- "$dest"
  if ! cp -r -- "$src" "$dest"; then
    echo "[archive] ERROR: failed to copy $src to $dest" >&2
    return 1
  fi

  # Write/merge metadata.json next to the snapshots. Values travel through
  # the environment and the heredoc is quoted, so quotes or backslashes in
  # labels and paths can never be interpreted as Python source.
  if ! ARCHIVE_META_PATH="$dest_root/metadata.json" \
    ARCHIVE_SRC="$src" \
    ARCHIVE_CACHE_SUB="$CACHE_SUB" \
    ARCHIVE_SWEEP_LABEL="${SWEEP_LABEL:-}" \
    ARCHIVE_SWEEP_MODEL="${SWEEP_MODEL:-}" \
    ARCHIVE_SWEEP_OUT_TAG="${SWEEP_OUT_TAG:-}" \
    ARCHIVE_SWEEP_LOGDIR="${SWEEP_LOGDIR:-}" \
    python3 - <<'PYEOF'
import datetime
import json
import os

env = os.environ
meta_path = env["ARCHIVE_META_PATH"]
src = env["ARCHIVE_SRC"]

# Merge with existing (a single tag may cover multiple models on one sweep).
existing = {}
if os.path.exists(meta_path):
    try:
        with open(meta_path) as f:
            existing = json.load(f)
    except Exception:
        existing = {}
if not isinstance(existing, dict):
    existing = {}
models = existing.get("models")
if not isinstance(models, dict):
    models = {}
    existing["models"] = models

# Count run* entries one level below the cache dir; plain files sitting at
# the top level are skipped instead of crashing os.listdir().
run_count = 0
if os.path.isdir(src):
    for entry in os.listdir(src):
        entry_dir = os.path.join(src, entry)
        if os.path.isdir(entry_dir):
            run_count += sum(1 for r in os.listdir(entry_dir) if r.startswith("run"))

models[env["ARCHIVE_CACHE_SUB"]] = {
    "sweep_label": env.get("ARCHIVE_SWEEP_LABEL", ""),
    "sweep_model": env.get("ARCHIVE_SWEEP_MODEL", ""),
    "sweep_out_tag": env.get("ARCHIVE_SWEEP_OUT_TAG", ""),
    "sweep_logdir": env.get("ARCHIVE_SWEEP_LOGDIR", ""),
    "archived_at": datetime.datetime.now(datetime.timezone.utc).isoformat(),
    "run_count": run_count,
}

# Write to a temp file and rename so a crash never leaves truncated JSON.
tmp_path = meta_path + ".tmp"
with open(tmp_path, "w") as f:
    json.dump(existing, f, indent=2)
os.replace(tmp_path, meta_path)
PYEOF
  then
    echo "[archive] WARNING: could not update $dest_root/metadata.json" >&2
  fi

  local runs
  runs=$(find "$dest" -name "run*.json" 2>/dev/null | wc -l | tr -d ' ')
  echo "[archive] saved $runs transcripts to $dest"
}
|
||||
98
scripts/container_sweep_minimal.sh
Executable file
98
scripts/container_sweep_minimal.sh
Executable file
@ -0,0 +1,98 @@
|
||||
#!/bin/bash
# Minimal single-model sweep — 1 run per task (not 3) for fast validation.
# Used to quickly test if an openrouter-stream fix actually works without
# committing to a full 60-minute 3-run sweep.
#
# Invocation (from host):
#   docker run -d --name clawbench-<LABEL> \
#     -e SWEEP_LABEL=<label> -e SWEEP_MODEL=<routed-model> \
#     -e SWEEP_PROFILE=<abs-profile-path> \
#     -e SWEEP_LOGDIR=<output-dir-in-container> \
#     -e SWEEP_OUT_TAG=<tag> \
#     -v .../scripts:/home/node/app/scripts:ro \
#     -v .../data:/data \
#     -v .../data/container-home-openclaw:/home/node/.openclaw \
#     -v .../profiles:/home/node/app/profiles:ro \
#     --memory 8g \
#     <image> \
#     bash /home/node/app/scripts/container_sweep_minimal.sh
#
# Exit status: the exit status of `clawbench run`, or 1 when /data is not
# reachable or the gateway never becomes healthy.

set -u

: "${SWEEP_LABEL:?SWEEP_LABEL required}"
: "${SWEEP_MODEL:?SWEEP_MODEL required}"
: "${SWEEP_PROFILE:?SWEEP_PROFILE required}"
: "${SWEEP_LOGDIR:?SWEEP_LOGDIR required}"
: "${SWEEP_OUT_TAG:?SWEEP_OUT_TAG required}"

cd /data || { echo "ERROR: cannot cd to /data" >&2; exit 1; }
mkdir -p "$SWEEP_LOGDIR"

export OPENCLAW_GATEWAY_TOKEN="local-dev-token-for-testing"
export CLAWBENCH_RUN_CACHE_DIR="/data/run_cache"
mkdir -p "$CLAWBENCH_RUN_CACHE_DIR"
export NODE_OPTIONS="--max-old-space-size=4096"

# Map model -> cache subdir. Kept identical to container_sweep_single.sh so
# every supported model gets its cache cleared here and archived on exit.
case "$SWEEP_MODEL" in
  anthropic/claude-opus-4-7)       CACHE_SUB="anthropic_claude-opus-4-7" ;;
  anthropic/claude-sonnet-4-7)     CACHE_SUB="anthropic_claude-sonnet-4-7" ;;
  anthropic/claude-opus-4-6)       CACHE_SUB="anthropic_claude-opus-4-6" ;;
  anthropic/claude-sonnet-4-6)     CACHE_SUB="anthropic_claude-sonnet-4-6" ;;
  openai/gpt-5.4)                  CACHE_SUB="openai_gpt-5.4" ;;
  openai/gpt-5.2)                  CACHE_SUB="openai_gpt-5.2" ;;
  openrouter/z-ai/glm-5.1)         CACHE_SUB="openrouter_z-ai_glm-5.1" ;;
  openrouter/minimax/minimax-m2.7) CACHE_SUB="openrouter_minimax_minimax-m2.7" ;;
  openrouter/moonshotai/kimi-k2.5) CACHE_SUB="openrouter_moonshotai_kimi-k2.5" ;;
  *)                               CACHE_SUB="" ;;
esac

# Clear cache for target model so tasks are actually re-run.
if [ -n "$CACHE_SUB" ] && [ -d "$CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB" ]; then
  echo "clearing cache: $CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB"
  # :? guards make an accidental `rm -rf /` impossible if a variable is empty.
  rm -rf -- "${CLAWBENCH_RUN_CACHE_DIR:?}/${CACHE_SUB:?}"
fi

OUT="$SWEEP_LOGDIR/docker_${SWEEP_LABEL}_${SWEEP_OUT_TAG}.json"
LOG="$SWEEP_LOGDIR/docker_${SWEEP_LABEL}_${SWEEP_OUT_TAG}.log"
GWLOG="$SWEEP_LOGDIR/gateway_${SWEEP_LABEL}.log"

rm -f -- "$OUT"

echo "===== MINIMAL SWEEP START $(date '+%Y-%m-%d %H:%M:%S') ====="
echo "label: $SWEEP_LABEL"
echo "model: $SWEEP_MODEL"
echo "profile: $SWEEP_PROFILE"
echo "out: $OUT"
echo "runs: 1 per task (MINIMAL)"

# Stop the background gateway on every exit path (including the early
# `exit 1` below). The script's own exit status is left untouched.
GATEWAY_PID=""
stop_gateway() {
  if [ -n "$GATEWAY_PID" ]; then
    kill "$GATEWAY_PID" 2>/dev/null
    wait "$GATEWAY_PID" 2>/dev/null
    GATEWAY_PID=""
  fi
}
trap stop_gateway EXIT

echo "Starting gateway on :18789 (heap=4GB) ..."
openclaw gateway --port 18789 > "$GWLOG" 2>&1 &
GATEWAY_PID=$!

# Poll the health endpoint once per second for up to 120 seconds.
ready=0
for ((i = 1; i <= 120; i++)); do
  if curl -sf -H "Authorization: Bearer $OPENCLAW_GATEWAY_TOKEN" \
      http://127.0.0.1:18789/health > /dev/null 2>&1; then
    echo "Gateway healthy after ${i}s"
    ready=1
    break
  fi
  sleep 1
done
if [ "$ready" -ne 1 ]; then
  echo "ERROR: gateway failed to come up"
  tail -30 "$GWLOG"
  exit 1
fi

echo "===== $(date '+%H:%M:%S') starting $SWEEP_LABEL ($SWEEP_MODEL) ====="
clawbench run \
  --model "$SWEEP_MODEL" \
  --runs 1 \
  --concurrency 4 \
  --profile "$SWEEP_PROFILE" \
  --judge-model "anthropic/claude-sonnet-4-6" \
  -o "$OUT" \
  > "$LOG" 2>&1
status=$?

echo "===== $(date '+%H:%M:%S') done $SWEEP_LABEL (exit $status) ====="

# Archive the cache for future audits. A missing helper and a failed
# archive are reported separately; neither changes the sweep's exit status.
archive_helper="$(dirname "$0")/_archive_cache.sh"
if [ -r "$archive_helper" ]; then
  # shellcheck disable=SC1090,SC1091
  source "$archive_helper"
  archive_run_cache || echo "[archive] WARNING: archive_run_cache failed (exit $?)" >&2
else
  echo "[archive] helper missing, skipping"
fi

stop_gateway
exit "$status"
|
||||
133
scripts/container_sweep_single.sh
Executable file
133
scripts/container_sweep_single.sh
Executable file
@ -0,0 +1,133 @@
|
||||
#!/bin/bash
# Single-model sweep with fresh gateway + bumped Node heap to prevent OOM.
#
# Invocation (from host):
#   docker run -d --name clawbench-sweep-<LABEL> \
#     -e SWEEP_LABEL=<label> -e SWEEP_MODEL=<routed-model> -e SWEEP_PROFILE=<abs-profile-path> \
#     -v .../scripts:/home/node/app/scripts:ro \
#     -v .../data:/data \
#     -v .../data/container-home-openclaw:/home/node/.openclaw \
#     -v .../profiles:/home/node/app/profiles:ro \
#     --memory 8g \
#     clawbench-clawbench:latest \
#     bash /home/node/app/scripts/container_sweep_single.sh
#
# Differences vs container_sweep.sh:
#   - Bumps gateway Node.js heap via NODE_OPTIONS=--max-old-space-size=4096 (prevents 2GB OOM we saw at ~4h)
#   - One model per container (no shared-gateway drift between models)
#   - Force-clears run_cache for THIS model before running (prevents cache-replay masking)
#   - Writes to the same $LOGDIR/docker_${label}_${SWEEP_OUT_TAG}.json as the original sweep
#     so generate_drift_report.py picks it up without changes
#
# Exit status: the exit status of `clawbench run`, or 1 when /data is not
# reachable or the gateway never becomes healthy.

set -u

: "${SWEEP_LABEL:?SWEEP_LABEL required (e.g. glm, minimax, kimi)}"
: "${SWEEP_MODEL:?SWEEP_MODEL required (e.g. openrouter/z-ai/glm-5.1)}"
: "${SWEEP_PROFILE:?SWEEP_PROFILE required (absolute path in container)}"

# Optional overrides (defaults target the v4.14 drift sweep):
#   SWEEP_LOGDIR  — where JSONs and logs go (default /data/drift_2026-04-14)
#   SWEEP_OUT_TAG — tag embedded in output filename (default v2026-4-14)
: "${SWEEP_LOGDIR:=/data/drift_2026-04-14}"
: "${SWEEP_OUT_TAG:=v2026-4-14}"

cd /data || { echo "ERROR: cannot cd to /data" >&2; exit 1; }

LOGDIR="$SWEEP_LOGDIR"
mkdir -p "$LOGDIR"

export OPENCLAW_GATEWAY_TOKEN="local-dev-token-for-testing"
export CLAWBENCH_RUN_CACHE_DIR="/data/run_cache"
mkdir -p "$CLAWBENCH_RUN_CACHE_DIR"

# OOM fix: give the gateway Node process a 4GB old-space ceiling instead of the default ~2GB.
# Exported, so it reaches every Node process started from this script
# (clawbench itself is python).
export NODE_OPTIONS="--max-old-space-size=4096"

# Map model -> cache subdir (matches what clawbench writes)
case "$SWEEP_MODEL" in
  anthropic/claude-opus-4-7)       CACHE_SUB="anthropic_claude-opus-4-7" ;;
  anthropic/claude-sonnet-4-7)     CACHE_SUB="anthropic_claude-sonnet-4-7" ;;
  anthropic/claude-opus-4-6)       CACHE_SUB="anthropic_claude-opus-4-6" ;;
  anthropic/claude-sonnet-4-6)     CACHE_SUB="anthropic_claude-sonnet-4-6" ;;
  openai/gpt-5.4)                  CACHE_SUB="openai_gpt-5.4" ;;
  openai/gpt-5.2)                  CACHE_SUB="openai_gpt-5.2" ;;
  openrouter/z-ai/glm-5.1)         CACHE_SUB="openrouter_z-ai_glm-5.1" ;;
  openrouter/minimax/minimax-m2.7) CACHE_SUB="openrouter_minimax_minimax-m2.7" ;;
  openrouter/moonshotai/kimi-k2.5) CACHE_SUB="openrouter_moonshotai_kimi-k2.5" ;;
  *)                               CACHE_SUB="" ;;
esac

OUT="$LOGDIR/docker_${SWEEP_LABEL}_${SWEEP_OUT_TAG}.json"
LOG="$LOGDIR/docker_${SWEEP_LABEL}_${SWEEP_OUT_TAG}.log"
GWLOG="$LOGDIR/gateway_${SWEEP_LABEL}.log"

echo "===== SINGLE-MODEL SWEEP START $(date '+%Y-%m-%d %H:%M:%S') ====="
echo "label: $SWEEP_LABEL"
echo "model: $SWEEP_MODEL"
echo "profile: $SWEEP_PROFILE"
echo "out: $OUT"
echo "gwlog: $GWLOG"
echo "NODE_OPTIONS: $NODE_OPTIONS"

# Force-clear this model's run_cache so we actually re-run (no replays)
if [ -n "$CACHE_SUB" ] && [ -d "$CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB" ]; then
  echo "clearing cache: $CLAWBENCH_RUN_CACHE_DIR/$CACHE_SUB"
  # :? guards make an accidental `rm -rf /` impossible if a variable is empty.
  rm -rf -- "${CLAWBENCH_RUN_CACHE_DIR:?}/${CACHE_SUB:?}"
fi

# Also remove any stale result JSON so we don't skip-on-idempotence
if [ -f "$OUT" ]; then
  echo "removing stale result: $OUT"
  rm -f -- "$OUT"
fi

# Stop the background gateway on every exit path (including the early
# `exit 1` below). The script's own exit status is left untouched.
GATEWAY_PID=""
stop_gateway() {
  if [ -n "$GATEWAY_PID" ]; then
    kill "$GATEWAY_PID" 2>/dev/null
    wait "$GATEWAY_PID" 2>/dev/null
    GATEWAY_PID=""
  fi
}
trap stop_gateway EXIT

# Start gateway with bumped heap
echo "Starting gateway on :18789 (heap=4GB) ..."
openclaw gateway --port 18789 > "$GWLOG" 2>&1 &
GATEWAY_PID=$!
echo "gateway pid=$GATEWAY_PID"

# Poll the health endpoint once per second for up to 120 seconds.
ready=0
for ((i = 1; i <= 120; i++)); do
  if curl -sf -H "Authorization: Bearer $OPENCLAW_GATEWAY_TOKEN" \
      http://127.0.0.1:18789/health > /dev/null 2>&1; then
    echo "Gateway healthy after ${i}s"
    ready=1
    break
  fi
  sleep 1
done
if [ "$ready" -ne 1 ]; then
  echo "ERROR: gateway failed to come up within 120s"
  tail -30 "$GWLOG"
  exit 1
fi

echo "===== $(date '+%H:%M:%S') starting $SWEEP_LABEL ($SWEEP_MODEL) ====="
clawbench run \
  --model "$SWEEP_MODEL" \
  --runs 3 \
  --concurrency 4 \
  --profile "$SWEEP_PROFILE" \
  --judge-model "anthropic/claude-sonnet-4-6" \
  -o "$OUT" \
  > "$LOG" 2>&1
status=$?

if [ "$status" -eq 0 ]; then
  echo "===== $(date '+%H:%M:%S') done $SWEEP_LABEL (exit 0) ====="
else
  echo "===== $(date '+%H:%M:%S') FAILED $SWEEP_LABEL (exit $status) ====="
  tail -20 "$LOG"
fi

# Archive the cache for future audits (preserves transcripts per sweep tag).
# A missing helper and a failed archive are reported separately; neither
# changes the sweep's exit status.
archive_helper="$(dirname "$0")/_archive_cache.sh"
if [ -r "$archive_helper" ]; then
  # shellcheck disable=SC1090,SC1091
  source "$archive_helper"
  archive_run_cache || echo "[archive] WARNING: archive_run_cache failed (exit $?)" >&2
else
  echo "[archive] helper missing, skipping"
fi

echo ""
echo "===== SINGLE-MODEL SWEEP END $(date '+%Y-%m-%d %H:%M:%S') ====="
stop_gateway
echo "gateway stopped"
exit "$status"
|
||||
114
scripts/git_checkpoint.py
Normal file
114
scripts/git_checkpoint.py
Normal file
@ -0,0 +1,114 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Create an annotated git checkpoint tag for a clean working tree."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def run_git(args: list[str], repo_root: Path, capture_output: bool = True) -> subprocess.CompletedProcess[str]:
    """Run ``git <args>`` with ``repo_root`` as the working directory.

    Args:
        args: Arguments placed after the ``git`` executable name.
        repo_root: Directory the git process is started in.
        capture_output: When true, stdout/stderr are captured on the returned
            object; when false they go to the parent's streams.

    Returns:
        The completed process, with text-mode output.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status.
    """
    command = ["git"] + list(args)
    options = dict(cwd=repo_root, check=True, text=True, capture_output=capture_output)
    return subprocess.run(command, **options)
|
||||
|
||||
|
||||
def repo_root() -> Path:
    """Return the top-level directory of the git repository containing the cwd.

    Returns:
        The path printed by ``git rev-parse --show-toplevel``.

    Raises:
        SystemExit: With a readable message when the git executable cannot be
            found, or when the current directory is not inside a repository.
    """
    try:
        result = subprocess.run(
            ["git", "rev-parse", "--show-toplevel"],
            check=True,
            text=True,
            capture_output=True,
        )
    except FileNotFoundError as exc:
        # git itself is missing: report it cleanly instead of a traceback.
        raise SystemExit("git executable not found on PATH.") from exc
    except subprocess.CalledProcessError as exc:
        raise SystemExit("Not inside a git repository.") from exc
    return Path(result.stdout.strip())
|
||||
|
||||
|
||||
def sanitize_label(label: str) -> str:
    """Turn a human-readable checkpoint name into a tag-safe slug.

    The label is lower-cased, every run of characters outside ``a-z0-9``
    becomes a single hyphen, and the result is limited to 48 characters.

    Args:
        label: Free-form checkpoint name supplied by the user.

    Returns:
        A slug of at most 48 characters that neither starts nor ends with a
        hyphen.

    Raises:
        SystemExit: If the label contains no ASCII letter or digit.
    """
    slug = re.sub(r"[^a-z0-9]+", "-", label.strip().lower()).strip("-")
    if not slug:
        raise SystemExit("Checkpoint name must contain at least one letter or number.")
    # Truncation can cut right after a hyphen; strip again so the slug
    # never ends with a dangling "-".
    return slug[:48].rstrip("-")
|
||||
|
||||
|
||||
def ensure_clean_worktree(root: Path) -> None:
    """Abort unless ``git status --porcelain`` reports nothing for ``root``.

    Args:
        root: Repository directory to inspect.

    Raises:
        SystemExit: If the porcelain status output is non-empty.
    """
    porcelain = run_git(["status", "--porcelain"], root).stdout
    if not porcelain.strip():
        return
    raise SystemExit(
        "Working tree is not clean. Commit or stash your changes first, or rerun with --allow-dirty."
    )
|
||||
|
||||
|
||||
def current_branch(root: Path) -> str:
    """Return the output of ``git rev-parse --abbrev-ref HEAD`` for ``root``.

    Args:
        root: Repository directory to query.

    Returns:
        The abbreviated ref name of HEAD with surrounding whitespace removed.
    """
    completed = run_git(["rev-parse", "--abbrev-ref", "HEAD"], root)
    name = completed.stdout.strip()
    return name
|
||||
|
||||
|
||||
def tag_exists(root: Path, tag_name: str) -> bool:
    """Report whether ``git tag --list <tag_name>`` prints exactly that tag.

    Args:
        root: Repository directory to query.
        tag_name: Tag name passed to ``git tag --list``.

    Returns:
        True when the stripped command output equals ``tag_name``.
    """
    listing = run_git(["tag", "--list", tag_name], root).stdout
    return tag_name == listing.strip()
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the checkpoint script.

    Returns:
        A parser with one positional ``name`` argument and the boolean
        ``--allow-dirty`` and ``--dry-run`` flags.
    """
    cli = argparse.ArgumentParser(
        description="Create an annotated checkpoint tag for the current HEAD commit."
    )
    cli.add_argument("name", help="Human-readable checkpoint name, e.g. 'before benchmark rerun'.")

    # Both optional switches are plain store_true flags.
    switches = (
        ("--allow-dirty", "Allow tagging even if the working tree has uncommitted changes."),
        ("--dry-run", "Print the tag that would be created without modifying git state."),
    )
    for flag, help_text in switches:
        cli.add_argument(flag, action="store_true", help=help_text)
    return cli
|
||||
|
||||
|
||||
def main() -> int:
    """Create (or, with ``--dry-run``, only print) a checkpoint tag for HEAD.

    The tag is named ``checkpoint/<YYYYmmdd-HHMMSS>-<slug>`` and its
    annotation records the original name, the current branch and the
    timestamp.

    Returns:
        0 on success.

    Raises:
        SystemExit: If the worktree is dirty (unless ``--allow-dirty``), the
            name yields an empty slug, or the tag already exists.
    """
    options = build_parser().parse_args()

    root = repo_root()
    if not options.allow_dirty:
        ensure_clean_worktree(root)

    stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    branch_name = current_branch(root)
    tag_name = "checkpoint/{}-{}".format(stamp, sanitize_label(options.name))

    if tag_exists(root, tag_name):
        raise SystemExit(f"Checkpoint tag already exists: {tag_name}")

    annotation = f"Checkpoint '{options.name}' from branch '{branch_name}' at {stamp}"

    if not options.dry_run:
        # Not captured, so git's own output reaches the terminal.
        run_git(["tag", "-a", tag_name, "-m", annotation], root, capture_output=False)
        print(f"Created {tag_name}")
        print(f"Push it with: git push origin {tag_name}")
        return 0

    print(tag_name)
    print(annotation)
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: exit with main()'s return code; when a git command
    # fails, replay its captured stderr (if any) and exit with git's status.
    try:
        exit_code = main()
    except subprocess.CalledProcessError as exc:
        captured = exc.stderr
        if captured:
            sys.stderr.write(captured)
        raise SystemExit(exc.returncode) from exc
    raise SystemExit(exit_code)
|
||||
Loading…
Reference in New Issue
Block a user