clawbench/app.py
scoootscooob a7ee76ed0f Fix presets: only verified HF Inference models
Tested each via router.huggingface.co/v1/chat/completions:
- GLM 5.1 (754B), GLM 5, Qwen3 32B, DeepSeek R1, Kimi K2, MiniMax M2.5
- Gemma 4 26B MoE, Llama 3.3/3.1 70B
- Claude Sonnet/Opus 4.6 (via API key)
Removed models that don't work on HF free tier.
2026-04-07 13:35:07 -07:00

340 lines
12 KiB
Python

"""ClawBench HF Space — leaderboard + submission + background eval worker.
This single file is the entry point for the HF Docker Space.
It runs:
1. Gradio frontend (leaderboard + submission form + queue status)
2. Background eval worker (polls queue, runs benchmark, stores results)
All state persists via:
- /data/ directory (HF persistent storage)
- HF Dataset (openclaw/clawbench-results) for cross-restart persistence
"""
from __future__ import annotations
import asyncio
import json
import logging
import os
import threading
from pathlib import Path
import gradio as gr
import pandas as pd
# Root logging is configured once, at import time, with a timestamped format.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("clawbench.app")

# Results go under /data when that directory exists, otherwise under a
# relative ./data directory. The folder is created eagerly on import.
RESULTS_DIR = (Path("/data") if Path("/data").exists() else Path("data")) / "results"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
# ---------------------------------------------------------------------------
# Preset models for quick submission
# ---------------------------------------------------------------------------
# Maps the label shown in the preset dropdown to a provider-prefixed model ID.
# Insertion order is the order in which the presets are listed.
#
# Maintainer note: the "huggingface/..." entries were checked on 2026-04-07
# against router.huggingface.co/v1/chat/completions (HF Inference API, free
# with HF_TOKEN). The "anthropic/..." entries use ANTHROPIC_API_KEY instead.
PRESET_MODELS = {
    # Chinese open-source models
    "GLM 5.1 (754B MoE)": "huggingface/zai-org/GLM-5.1",
    "GLM 5 (400B MoE)": "huggingface/zai-org/GLM-5",
    "Qwen3 32B": "huggingface/Qwen/Qwen3-32B",
    "DeepSeek R1": "huggingface/deepseek-ai/DeepSeek-R1",
    "Kimi K2 Instruct": "huggingface/moonshotai/Kimi-K2-Instruct",
    "MiniMax M2.5": "huggingface/MiniMaxAI/MiniMax-M2.5",
    # Google open-source models
    "Gemma 4 26B MoE": "huggingface/google/gemma-4-26B-A4B-it",
    # Meta open-source models
    "Llama 3.3 70B": "huggingface/meta-llama/Llama-3.3-70B-Instruct",
    "Llama 3.1 70B": "huggingface/meta-llama/Llama-3.1-70B-Instruct",
    # Anthropic (proprietary)
    "Claude Sonnet 4.6": "anthropic/claude-sonnet-4-6",
    "Claude Opus 4.6": "anthropic/claude-opus-4-6",
}
# ---------------------------------------------------------------------------
# Background worker (starts in a thread)
# ---------------------------------------------------------------------------
from clawbench.queue import JobQueue, SubmissionRequest
# Single module-level queue instance: handed to the eval worker and used by
# the submission and queue-listing handlers in this file.
# NOTE(review): the name shadows the stdlib ``queue`` module within this file.
queue = JobQueue()
def _start_worker() -> None:
    """Run the eval worker to completion on a private event loop.

    Intended as a ``threading.Thread`` target. A fresh event loop is created
    for the calling thread, ``EvalWorker(queue).start()`` is driven on it, and
    the loop is always closed afterwards so its resources are released.

    Raises:
        Exception: Whatever ``EvalWorker.start()`` raises. The failure is
            logged through the module logger first and then re-raised.
    """
    # Deferred import: clawbench.worker is only loaded when the worker runs.
    from clawbench.worker import EvalWorker

    async def _run() -> None:
        worker = EvalWorker(queue)
        await worker.start()

    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(_run())
    except Exception:
        # Record the crash in the application log before the thread dies.
        logger.exception("Background eval worker crashed")
        raise
    finally:
        # The original never closed the loop; release it on every exit path.
        loop.close()
# Launched at import time as a daemon thread, so it does not keep the
# interpreter alive on shutdown.
worker_thread = threading.Thread(target=_start_worker, daemon=True)
worker_thread.start()
# NOTE: logged immediately after Thread.start(); this does not confirm that
# the worker itself initialised successfully.
logger.info("Background eval worker started")
# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
def load_leaderboard() -> pd.DataFrame:
    """Build the leaderboard table from the HF dataset and local result files.

    Results are collected from two best-effort sources:

    1. The ``submissions`` split of the dataset named by the
       ``CLAWBENCH_QUEUE_DATASET`` environment variable (default
       ``openclaw/clawbench-results``).
    2. Every ``*.json`` file directly inside ``RESULTS_DIR``.

    A failure in either source is logged and skipped rather than raised, so
    the UI can still render whatever could be loaded.

    Returns:
        One row per model (the row with the greatest ``Timestamp`` string
        wins), sorted by ``Composite`` descending, with a 1-based index named
        ``#``. When nothing could be loaded, an empty frame with the expected
        columns is returned.
    """
    rows = []

    # Source 1: HF Dataset.
    dataset_name = os.environ.get("CLAWBENCH_QUEUE_DATASET", "openclaw/clawbench-results")
    try:
        from datasets import load_dataset

        ds = load_dataset(dataset_name, split="submissions")
        for row in ds:
            # Isolate each row so one malformed record does not discard
            # every record that follows it.
            try:
                rows.append(_flatten_result(row))
            except Exception as exc:
                logger.warning("Skipping malformed row in dataset %s: %s", dataset_name, exc)
    except Exception as exc:
        logger.warning("Could not load leaderboard dataset %s: %s", dataset_name, exc)

    # Source 2: local result files.
    if RESULTS_DIR.exists():
        for f in sorted(RESULTS_DIR.glob("*.json")):
            try:
                # Read as UTF-8 explicitly instead of the locale default.
                data = json.loads(f.read_text(encoding="utf-8"))
                rows.append(_flatten_result(data))
            except Exception as exc:
                logger.warning("Skipping unreadable result file %s: %s", f, exc)

    if not rows:
        return pd.DataFrame(columns=[
            "Model", "Composite", "State", "Trajectory", "Behavior",
            "pass^k", "CI", "Tasks", "Timestamp",
        ])

    # Deduplicate by model, keeping the row with the latest timestamp; on a
    # tie the first row seen is kept.
    latest = {}
    for r in rows:
        key = r["Model"]
        if key not in latest or r["Timestamp"] > latest[key]["Timestamp"]:
            latest[key] = r

    df = pd.DataFrame(list(latest.values()))
    df = df.sort_values("Composite", ascending=False).reset_index(drop=True)
    # Turn the positional index into a 1-based rank.
    df.index = df.index + 1
    df.index.name = "#"
    return df
def _flatten_result(data: dict) -> dict:
tasks = data.get("task_results", [])
n_tasks = len(tasks) if isinstance(tasks, list) else 0
return {
"Model": data.get("model", ""),
"Composite": round(data.get("overall_composite", 0), 3),
"State": round(data.get("overall_state", 0), 3),
"Trajectory": round(data.get("overall_trajectory", 0), 3),
"Behavior": round(data.get("overall_behavior", 0), 3),
"pass^k": f"{data.get('overall_pass_hat_k', 0):.0%}",
"CI": f"{data.get('overall_ci_lower', 0):.2f}-{data.get('overall_ci_upper', 0):.2f}",
"Tasks": n_tasks,
"Timestamp": data.get("timestamp", "")[:16],
}
def load_queue() -> pd.DataFrame:
    """Return up to 20 jobs from the shared queue as a table.

    Returns:
        A frame with the columns ``ID``, ``Model``, ``Status``, ``Submitted``
        (first 16 characters of the submission time, or ``""`` when unset) and
        ``Runs``. The frame is empty, with those columns, when the queue
        reports no jobs.
    """
    # list_jobs is a coroutine; drive it to completion from this sync handler.
    jobs = asyncio.run(queue.list_jobs(limit=20))
    if not jobs:
        return pd.DataFrame(columns=["ID", "Model", "Status", "Submitted", "Runs"])

    return pd.DataFrame([
        {
            "ID": job.job_id,
            "Model": job.request.model,
            "Status": job.status.value,
            "Submitted": job.submitted_at[:16] if job.submitted_at else "",
            "Runs": job.request.runs_per_task,
        }
        for job in jobs
    ])
# ---------------------------------------------------------------------------
# Submission handler
# ---------------------------------------------------------------------------
def submit_model(model: str, preset: str, provider: str, runs: int, category: str | None, submitter: str) -> str:
    """Queue one model for evaluation and return a status message.

    Args:
        model: Custom model ID; only consulted when ``preset`` is not a known
            preset label.
        preset: Label from the preset dropdown.
        provider: Provider name, stripped of surrounding whitespace.
        runs: Runs per task (coerced with ``int``).
        category: Task category; ``"all"`` is translated to ``None``.
        submitter: Submitter name, stripped of surrounding whitespace.

    Returns:
        A confirmation containing the model ID and job ID, or a prompt asking
        for a model when neither a preset nor a custom ID was supplied.
    """
    # A recognised preset takes precedence over the free-text field.
    model_id = PRESET_MODELS.get(preset, "")
    if not model_id:
        model_id = model.strip()
    if not model_id:
        return "Please enter a model ID or select a preset."

    selected_category = None if category == "all" else category
    job = asyncio.run(
        queue.submit(
            SubmissionRequest(
                model=model_id,
                provider=provider.strip(),
                runs_per_task=int(runs),
                category=selected_category,
                submitter=submitter.strip(),
            )
        )
    )
    return f"Submitted [{model_id}]! Job ID: {job.job_id}. Check the Queue tab."
def submit_all_presets(runs: int, submitter: str) -> str:
    """Queue every entry in ``PRESET_MODELS`` and return a summary.

    Args:
        runs: Runs per task applied to each submission (coerced with ``int``).
        submitter: Submitter name, stripped of surrounding whitespace.

    Returns:
        A header line with the number of submissions, followed by one bullet
        per preset in the form ``label (job_id)``.
    """
    bullets = []
    for label, model_id in PRESET_MODELS.items():
        # Each submission is awaited on its own, in preset order.
        job = asyncio.run(
            queue.submit(
                SubmissionRequest(
                    model=model_id,
                    provider="",
                    runs_per_task=int(runs),
                    submitter=submitter.strip(),
                )
            )
        )
        bullets.append(f" - {label} ({job.job_id})")

    header = f"Submitted {len(bullets)} models:\n"
    return header + "\n".join(bullets)
# ---------------------------------------------------------------------------
# Gradio app
# ---------------------------------------------------------------------------
# Markdown shown at the top of the page; passed to gr.Markdown when the UI is built.
DESCRIPTION = """
# ClawBench
Rigorous benchmark for AI models as [OpenClaw](https://github.com/openclaw/openclaw) agents.
Submit a model below and it will be evaluated on HF infrastructure.
**Three axes**: Environment State | Tool Trajectory | Agent Behavior
**Primary metric**: pass^k (ALL runs must succeed)
"""
# Top-level UI definition. Components are created in display order; the
# nesting of the `with` blocks decides which tab/row each one belongs to.
with gr.Blocks(title="ClawBench", theme=gr.themes.Base()) as demo:
    gr.Markdown(DESCRIPTION)

    # --- Tab 1: leaderboard table with a manual refresh button ---
    with gr.Tab("Leaderboard"):
        refresh_btn = gr.Button("Refresh", scale=0)
        # The function itself (not its result) is passed as `value`, so the
        # table content is computed by calling load_leaderboard.
        leaderboard = gr.Dataframe(
            value=load_leaderboard,
            interactive=False,
            wrap=True,
        )
        refresh_btn.click(fn=load_leaderboard, outputs=leaderboard)

    # --- Tab 2: submission form ---
    with gr.Tab("Submit"):
        gr.Markdown("### Submit a model for evaluation")
        gr.Markdown(
            "Select a preset or enter a custom model ID. Open-source models "
            "run via HuggingFace Inference API. Proprietary models need API keys set as Space secrets."
        )
        # "(custom)" is not a key of PRESET_MODELS, so submit_model falls
        # back to the free-text model ID when it is selected.
        preset_input = gr.Dropdown(
            choices=["(custom)"] + list(PRESET_MODELS.keys()),
            value="(custom)",
            label="Preset models",
        )
        with gr.Row():
            model_input = gr.Textbox(
                label="Custom Model ID (if not using preset)",
                placeholder="e.g. huggingface/org/model-name",
                scale=3,
            )
            provider_input = gr.Textbox(
                label="Provider",
                placeholder="auto-detected from model ID",
                scale=1,
            )
        with gr.Row():
            runs_input = gr.Slider(
                minimum=1, maximum=10, value=3, step=1,
                label="Runs per task (higher = more reliable pass^k)",
            )
            category_input = gr.Dropdown(
                choices=["all", "general", "openclaw", "adversarial"],
                value="all",
                label="Category",
            )
        # NOTE(review): placed on its own line below the runs/category row —
        # confirm this matches the intended layout.
        submitter_input = gr.Textbox(
            label="Your name (optional)",
            placeholder="HF username",
        )
        with gr.Row():
            submit_btn = gr.Button("Submit Model", variant="primary")
            submit_all_btn = gr.Button("Submit All Presets", variant="secondary")
        submit_output = gr.Textbox(label="Status", interactive=False, lines=5)
        # The order of `inputs` must match submit_model's positional parameters.
        submit_btn.click(
            fn=submit_model,
            inputs=[model_input, preset_input, provider_input, runs_input, category_input, submitter_input],
            outputs=submit_output,
        )
        # "Submit All Presets" only uses the runs slider and the submitter name.
        submit_all_btn.click(
            fn=submit_all_presets,
            inputs=[runs_input, submitter_input],
            outputs=submit_output,
        )
        # Static reference table; it is maintained by hand and is not
        # generated from PRESET_MODELS.
        gr.Markdown("""
**All presets verified working on HF Inference API (free):**
| Model | Provider | Size | API |
|-------|----------|------|-----|
| GLM 5.1 | Z.ai | 754B MoE | HF free |
| GLM 5 | Z.ai | 400B MoE | HF free |
| Qwen3 32B | Alibaba | 32B | HF free |
| DeepSeek R1 | DeepSeek | 671B MoE | HF free |
| Kimi K2 Instruct | Moonshot AI | MoE | HF free |
| MiniMax M2.5 | MiniMax | MoE | HF free |
| Gemma 4 26B MoE | Google | 26B MoE | HF free |
| Llama 3.3 70B | Meta | 70B | HF free |
| Llama 3.1 70B | Meta | 70B | HF free |
| Claude Sonnet 4.6 | Anthropic | - | API key |
| Claude Opus 4.6 | Anthropic | - | API key |
""")

    # --- Tab 3: evaluation queue with a manual refresh button ---
    with gr.Tab("Queue"):
        gr.Markdown("### Evaluation Queue")
        queue_refresh = gr.Button("Refresh", scale=0)
        queue_table = gr.Dataframe(value=load_queue, interactive=False, wrap=True)
        queue_refresh.click(fn=load_queue, outputs=queue_table)

    # --- Tab 4: static methodology write-up ---
    with gr.Tab("Methodology"):
        gr.Markdown("""
## How ClawBench evaluates agents
### Axis 1: Environment State (ground truth)
After the agent runs, we query the actual environment — filesystem, memory, cron jobs, gateway state.
We **never** trust what the agent said. We verify the world changed.
### Axis 2: Trajectory (tool call graph)
Precision/recall/F1 on tool call sequences vs reference.
Plus ordering (LIS-based), efficiency scoring, and forbidden tool detection.
### Axis 3: Behavior (LLM judge)
Only for subjective quality. Judge is scoped: does NOT score completion or efficiency.
### Simulated Users
- **Static**: deterministic baseline
- **Adaptive**: LLM-generated, reacts to agent
- **Adversarial**: contradictions, impossible requests, hallucination traps
### pass^k: Production Reliability
| pass@1 | pass^5 | pass^8 |
|--------|--------|--------|
| 90% | 59% | 43% |
| 95% | 77% | 66% |
| 99% | 95% | 92% |
### Based on
- [TAU-bench](https://github.com/sierra-research/tau-bench) — POMDP, pass^k, state verification
- [SWE-bench](https://www.swebench.com/) — deterministic test-based verification
- [WebArena](https://webarena.dev/) — programmatic state assertions
- [Anthropic eval guide](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents)
""")
# Script entry point: serve the UI on all network interfaces, port 7860.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)