From cce89d828b4a689def01d4c3e09f6034d5b7316c Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 2 May 2026 18:34:01 -0700 Subject: [PATCH] feat: add crabbox validation wiring --- .agents/skills/blacksmith-testbox/SKILL.md | 5 + .agents/skills/crabbox/SKILL.md | 96 ++++++++++++ .crabbox.yaml | 47 ++++++ .github/workflows/README.md | 16 ++ .github/workflows/crabbox-hydrate.yml | 166 +++++++++++++++++++++ tests/test_blacksmith_setup.py | 40 +++++ 6 files changed, 370 insertions(+) create mode 100644 .agents/skills/crabbox/SKILL.md create mode 100644 .crabbox.yaml create mode 100644 .github/workflows/crabbox-hydrate.yml diff --git a/.agents/skills/blacksmith-testbox/SKILL.md b/.agents/skills/blacksmith-testbox/SKILL.md index 3e780ea..5a6943a 100644 --- a/.agents/skills/blacksmith-testbox/SKILL.md +++ b/.agents/skills/blacksmith-testbox/SKILL.md @@ -10,6 +10,11 @@ agent dotfiles, Docker, or a benchmark run that is too heavy for the local machine. Keep normal unit-test iteration local unless the user asks for Testbox proof. +Crabbox is the sibling lane for reusable owned-capacity proof. Use +`.agents/skills/crabbox/SKILL.md` and `.crabbox.yaml` when ClawBench needs +AWS-backed reusable boxes or Crabbox sync/log/result inspection. Keep this +skill focused on Blacksmith CI parity. + ## Warmup Run from the repository root: diff --git a/.agents/skills/crabbox/SKILL.md b/.agents/skills/crabbox/SKILL.md new file mode 100644 index 0000000..f501e11 --- /dev/null +++ b/.agents/skills/crabbox/SKILL.md @@ -0,0 +1,96 @@ +--- +name: crabbox +description: Use Crabbox for ClawBench remote Linux validation, warmed reusable boxes, GitHub Actions hydration, sync timing, logs, results, caches, and lease cleanup. +--- + +# Crabbox + +Use Crabbox when ClawBench needs remote Linux proof on owned capacity, a large +runner class, reusable warm state, or a Blacksmith alternative. + +## Before Running + +- Run from the repo root. Crabbox sync mirrors the current checkout. +- Prefer local targeted tests for tight edit loops. +- Prefer Blacksmith Testbox when the task explicitly asks for Blacksmith or a + Blacksmith-specific CI comparison. +- Use Crabbox for broad ClawBench gates when owned AWS capacity is the right + remote lane. +- Check `.crabbox.yaml` for repo defaults before adding flags. +- Sanity-check the selected binary before remote work. Prefer the local + `openclaw/crabbox` checkout when present because the user PATH shim can be + stale: `command -v crabbox; ../crabbox/bin/crabbox --version`. +- Install with `brew install openclaw/tap/crabbox`; auth is required before use: + `crabbox login --url https://crabbox.openclaw.ai --provider aws`. +- On macOS the user config is `~/Library/Application Support/crabbox/config.yaml`; + it must include `broker.url`, `broker.token`, and usually `provider: aws`. + +## ClawBench Flow + +AWS/owned-capacity flow for Python tests: + +```sh +crabbox warmup --idle-timeout 90m +crabbox actions hydrate --id +crabbox run --id --timing-json --shell -- "python -m pytest -q" +``` + +For commands that need hydrated HF/provider credentials or agent dotfiles, use +the helper installed by the hydration workflow: + +```sh +crabbox run --id --timing-json --shell -- "clawbench-testbox-env python -m pytest -q" +crabbox run --id --timing-json --shell -- "clawbench-testbox-env clawbench run --model anthropic/claude-sonnet-4-6 --adapter simulated" +``` + +Blacksmith-backed Crabbox flow can delegate setup to the existing Testbox +workflow: + +```sh +crabbox run --provider blacksmith-testbox --blacksmith-org openclaw --blacksmith-workflow .github/workflows/ci-check-testbox.yml --blacksmith-job check --blacksmith-ref main --idle-timeout 90m --timing-json --shell -- "python -m pytest -q" +``` + +Stop boxes you created before handoff: + +```sh +crabbox stop +``` + +## Useful Commands + +```sh +crabbox status --id --wait +crabbox inspect --id --json +crabbox sync-plan +crabbox history --lease +crabbox logs +crabbox results +crabbox cache stats --id +crabbox ssh --id +``` + +Use `--debug` on `run` when measuring sync timing. +Use `--timing-json` on warmup, hydrate, and run when comparing AWS and +blacksmith-testbox timings. +Use `--market spot|on-demand` on AWS warmup or one-shot run when testing quota +or capacity behavior without changing `.crabbox.yaml`. + +## Hydration Boundary + +`.github/workflows/crabbox-hydrate.yml` is repo-specific on purpose. It owns +ClawBench checkout, setup-python, pip install, provider/HF env hydration, +agent-dotfile restoration, ready marker, and keepalive. Crabbox owns runner +registration, workflow dispatch, SSH sync, command execution, logs/results, +local lease claims, and idle cleanup. + +Do not add ClawBench-specific setup to Crabbox. Put repo setup in the hydration +workflow and generic lease/sync behavior in Crabbox. + +## Cleanup + +Crabbox has coordinator-owned idle expiry and local lease claims, so ClawBench +does not need a custom ledger. Default idle timeout is 30 minutes unless config +or flags set a different value. Still stop boxes you created when done. +If `crabbox list` prints `orphan=no-active-lease`, treat it as an operator +review hint; do not delete `keep=true` machines without checking provider and +coordinator state. diff --git a/.crabbox.yaml b/.crabbox.yaml new file mode 100644 index 0000000..42f3009 --- /dev/null +++ b/.crabbox.yaml @@ -0,0 +1,47 @@ +profile: clawbench-check +provider: aws +class: beast +capacity: + market: spot + strategy: most-available + fallback: on-demand-after-120s + regions: + - eu-west-1 +actions: + workflow: .github/workflows/crabbox-hydrate.yml + job: hydrate + ref: main + runnerLabels: + - crabbox + - clawbench + runnerVersion: latest + ephemeral: true +aws: + region: eu-west-1 + rootGB: 400 +sync: + delete: true + checksum: false + gitSeed: true + fingerprint: true + baseRef: main + exclude: + - .artifacts + - .codex + - .DS_Store + - .pytest_cache + - .ruff_cache + - .venv + - dist + - htmlcov + - playwright-report + - test-results +env: + allow: + - CI + - CLAWBENCH_* + - OPENCLAW_* + - PYTHON* +ssh: + user: crabbox + port: "2222" diff --git a/.github/workflows/README.md b/.github/workflows/README.md index c4f1523..65473c7 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -29,6 +29,22 @@ It installs ClawBench, hydrates provider/HF secrets into dotfiles from repo or org secrets, and installs `~/.local/bin/clawbench-testbox-env` for commands that need that live auth. +## `crabbox-hydrate.yml` — Crabbox Actions hydration + +This workflow exists for the Crabbox CLI from `openclaw/crabbox`: + +```bash +crabbox warmup --idle-timeout 90m +crabbox actions hydrate --id +crabbox run --id --shell -- "python -m pytest -q" +``` + +It runs on the dynamic self-hosted runner label registered by Crabbox, installs +ClawBench, hydrates the same provider/HF secrets and agent dotfiles as the +Blacksmith Testbox workflow, writes the Crabbox ready marker under +`~/.crabbox/actions/`, and keeps the job alive for follow-up SSH sync/run +commands. + ## `sync-to-hf-space.yml` — auto-mirror main to the HF Space Mirrors every push to `main` into the HF Space git remote so diff --git a/.github/workflows/crabbox-hydrate.yml b/.github/workflows/crabbox-hydrate.yml new file mode 100644 index 0000000..fe6193e --- /dev/null +++ b/.github/workflows/crabbox-hydrate.yml @@ -0,0 +1,166 @@ +name: Crabbox Hydrate + +on: + workflow_dispatch: + inputs: + crabbox_id: + description: "Crabbox lease ID" + required: true + type: string + ref: + description: "Git ref to hydrate" + required: false + type: string + crabbox_runner_label: + description: "Dynamic Crabbox runner label" + required: true + type: string + crabbox_job: + description: "Hydration job identifier expected by Crabbox" + required: false + default: "hydrate" + type: string + crabbox_keep_alive_minutes: + description: "Minutes to keep the hydrated job alive" + required: false + default: "90" + type: string + +permissions: + contents: read + +jobs: + hydrate: + name: hydrate + runs-on: [self-hosted, "${{ inputs.crabbox_runner_label }}"] + timeout-minutes: 120 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + + - name: Install project + run: | + python -m pip install --upgrade pip + python -m pip install -e . + + - name: Prepare Crabbox shell + shell: bash + run: | + set -euo pipefail + git fetch --no-tags --depth=50 origin "+refs/heads/main:refs/remotes/origin/main" + python_dir="$(dirname "$(python -c 'import sys; print(sys.executable)')")" + sudo ln -sf "$python_dir/python" /usr/local/bin/python + sudo ln -sf "$python_dir/python" /usr/local/bin/python3 + sudo ln -sf "$python_dir/pip" /usr/local/bin/pip + sudo ln -sf "$python_dir/pip" /usr/local/bin/pip3 + sudo ln -sf "$python_dir/pytest" /usr/local/bin/pytest + + - name: Hydrate Crabbox env helper + shell: bash + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_USERNAME: ${{ secrets.HF_USERNAME }} + CLAWBENCH_QUEUE_DATASET: ${{ vars.CLAWBENCH_QUEUE_DATASET || 'openclaw/clawbench-results' }} + CLAWBENCH_JUDGE_MODEL: ${{ vars.CLAWBENCH_JUDGE_MODEL }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + ANTHROPIC_API_KEY_OLD: ${{ secrets.ANTHROPIC_API_KEY_OLD }} + ANTHROPIC_API_TOKEN: ${{ secrets.ANTHROPIC_API_TOKEN }} + CEREBRAS_API_KEY: ${{ secrets.CEREBRAS_API_KEY }} + DEEPINFRA_API_KEY: ${{ secrets.DEEPINFRA_API_KEY }} + FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }} + GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} + GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} + GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }} + KIMI_API_KEY: ${{ secrets.KIMI_API_KEY }} + MINIMAX_API_KEY: ${{ secrets.MINIMAX_API_KEY }} + MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }} + MOONSHOT_API_KEY: ${{ secrets.MOONSHOT_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + OPENAI_BASE_URL: ${{ secrets.OPENAI_BASE_URL }} + OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + QWEN_API_KEY: ${{ secrets.QWEN_API_KEY }} + TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }} + XAI_API_KEY: ${{ secrets.XAI_API_KEY }} + ZAI_API_KEY: ${{ secrets.ZAI_API_KEY }} + Z_AI_API_KEY: ${{ secrets.Z_AI_API_KEY }} + OPENCLAW_CODEX_AUTH_JSON: ${{ secrets.OPENCLAW_CODEX_AUTH_JSON }} + OPENCLAW_CODEX_CONFIG_TOML: ${{ secrets.OPENCLAW_CODEX_CONFIG_TOML }} + OPENCLAW_CLAUDE_JSON: ${{ secrets.OPENCLAW_CLAUDE_JSON }} + OPENCLAW_CLAUDE_CREDENTIALS_JSON: ${{ secrets.OPENCLAW_CLAUDE_CREDENTIALS_JSON }} + OPENCLAW_CLAUDE_SETTINGS_JSON: ${{ secrets.OPENCLAW_CLAUDE_SETTINGS_JSON }} + OPENCLAW_CLAUDE_SETTINGS_LOCAL_JSON: ${{ secrets.OPENCLAW_CLAUDE_SETTINGS_LOCAL_JSON }} + OPENCLAW_GEMINI_SETTINGS_JSON: ${{ secrets.OPENCLAW_GEMINI_SETTINGS_JSON }} + CLAWBENCH_CODEX_AUTH_JSON: ${{ secrets.CLAWBENCH_CODEX_AUTH_JSON }} + CLAWBENCH_CODEX_CONFIG_TOML: ${{ secrets.CLAWBENCH_CODEX_CONFIG_TOML }} + CLAWBENCH_CLAUDE_JSON: ${{ secrets.CLAWBENCH_CLAUDE_JSON }} + CLAWBENCH_CLAUDE_CREDENTIALS_JSON: ${{ secrets.CLAWBENCH_CLAUDE_CREDENTIALS_JSON }} + CLAWBENCH_CLAUDE_SETTINGS_JSON: ${{ secrets.CLAWBENCH_CLAUDE_SETTINGS_JSON }} + CLAWBENCH_CLAUDE_SETTINGS_LOCAL_JSON: ${{ secrets.CLAWBENCH_CLAUDE_SETTINGS_LOCAL_JSON }} + CLAWBENCH_GEMINI_SETTINGS_JSON: ${{ secrets.CLAWBENCH_GEMINI_SETTINGS_JSON }} + run: | + bash scripts/ci-hydrate-testbox-env.sh + sudo ln -sf "$HOME/.local/bin/clawbench-testbox-env" /usr/local/bin/clawbench-testbox-env + + - name: Mark Crabbox ready + shell: bash + run: | + set -euo pipefail + job="${{ inputs.crabbox_job }}" + if [ -z "$job" ]; then job=hydrate; fi + mkdir -p "$HOME/.crabbox/actions" + state="$HOME/.crabbox/actions/${{ inputs.crabbox_id }}.env" + env_file="$HOME/.crabbox/actions/${{ inputs.crabbox_id }}.env.sh" + services_file="$HOME/.crabbox/actions/${{ inputs.crabbox_id }}.services" + write_export() { + key="$1" + value="${!key-}" + if [ -n "$value" ]; then + printf 'export %s=%q\n' "$key" "$value" + fi + } + { + for key in CI GITHUB_ACTIONS GITHUB_WORKSPACE GITHUB_REPOSITORY GITHUB_RUN_ID GITHUB_RUN_NUMBER GITHUB_RUN_ATTEMPT GITHUB_REF GITHUB_REF_NAME GITHUB_SHA GITHUB_EVENT_NAME GITHUB_ACTOR RUNNER_OS RUNNER_ARCH RUNNER_TEMP RUNNER_TOOL_CACHE; do + write_export "$key" + done + } > "${env_file}.tmp" + mv "${env_file}.tmp" "$env_file" + { + echo "# Docker containers visible from the hydrated runner" + docker ps --format '{{.Names}}\t{{.Image}}\t{{.Ports}}' 2>/dev/null || true + } > "${services_file}.tmp" + mv "${services_file}.tmp" "$services_file" + tmp="${state}.tmp" + { + echo "WORKSPACE=${GITHUB_WORKSPACE}" + echo "RUN_ID=${GITHUB_RUN_ID}" + echo "JOB=${job}" + echo "ENV_FILE=${env_file}" + echo "SERVICES_FILE=${services_file}" + echo "READY_AT=$(date -u +%Y-%m-%dT%H:%M:%SZ)" + } > "$tmp" + mv "$tmp" "$state" + + - name: Keep Crabbox job alive + shell: bash + run: | + set -euo pipefail + minutes="${{ inputs.crabbox_keep_alive_minutes }}" + case "$minutes" in + ''|*[!0-9]*) minutes=90 ;; + esac + stop="$HOME/.crabbox/actions/${{ inputs.crabbox_id }}.stop" + deadline=$(( $(date +%s) + minutes * 60 )) + while [ "$(date +%s)" -lt "$deadline" ]; do + if [ -f "$stop" ]; then + exit 0 + fi + sleep 15 + done diff --git a/tests/test_blacksmith_setup.py b/tests/test_blacksmith_setup.py index d28fc69..3089121 100644 --- a/tests/test_blacksmith_setup.py +++ b/tests/test_blacksmith_setup.py @@ -20,6 +20,46 @@ def test_testbox_workflow_hydrates_secrets_and_dotfiles(): assert "CLAWBENCH_CODEX_AUTH_JSON" in workflow +def test_crabbox_config_uses_actions_hydration(): + config = Path(".crabbox.yaml").read_text(encoding="utf-8") + + assert "profile: clawbench-check" in config + assert "provider: aws" in config + assert "workflow: .github/workflows/crabbox-hydrate.yml" in config + assert "job: hydrate" in config + assert "baseRef: main" in config + assert "- clawbench" in config + assert "- CLAWBENCH_*" in config + assert "- OPENCLAW_*" in config + + +def test_crabbox_workflow_hydrates_secrets_dotfiles_and_ready_marker(): + workflow = Path(".github/workflows/crabbox-hydrate.yml").read_text(encoding="utf-8") + + assert "crabbox_id:" in workflow + assert "crabbox_runner_label:" in workflow + assert 'runs-on: [self-hosted, "${{ inputs.crabbox_runner_label }}"]' in workflow + assert "actions/setup-python@v5" in workflow + assert "python -m pip install -e ." in workflow + assert "scripts/ci-hydrate-testbox-env.sh" in workflow + assert "HF_TOKEN" in workflow + assert "OPENCLAW_CODEX_AUTH_JSON" in workflow + assert "CLAWBENCH_CODEX_AUTH_JSON" in workflow + assert "/usr/local/bin/clawbench-testbox-env" in workflow + assert "$HOME/.crabbox/actions/${{ inputs.crabbox_id }}.env" in workflow + assert "crabbox_keep_alive_minutes" in workflow + + +def test_crabbox_skill_documents_clawbench_flow(): + skill = Path(".agents/skills/crabbox/SKILL.md").read_text(encoding="utf-8") + + assert "openclaw/crabbox" in skill + assert ".crabbox.yaml" in skill + assert "crabbox actions hydrate" in skill + assert "clawbench-testbox-env" in skill + assert ".github/workflows/crabbox-hydrate.yml" in skill + + def test_testbox_helper_sources_hydrated_profile(): script = Path("scripts/ci-hydrate-testbox-env.sh").read_text(encoding="utf-8")