diff --git a/README.md b/README.md index 9197fd3..b4ca88b 100644 --- a/README.md +++ b/README.md @@ -251,6 +251,48 @@ clawbench run --model anthropic/claude-opus-4-6 --profile profiles/frontier_opus clawbench diagnose profiles/frontier_opus_4_6.yaml ``` +### Running locally with small models (Ollama) + +A single consumer GPU running an open-weight model through +[Ollama](https://ollama.com) is enough to develop plugin profiles, validate +algorithmic ideas, and submit scored results — no API keys or cloud spend +required. + +Profiles tested locally can still be submitted as pull requests with +reference results. The built-in GitHub Actions workflows in this repo only +run the test suite and deployment sync, so treat local Ollama numbers as +contributor-side evidence unless a maintainer separately reruns them on +other infrastructure. + +```bash +# Pull a model and set your gateway token +ollama pull gpt-oss:20b # or llama3.1:8b, qwen3:14b, etc. +export OPENCLAW_GATEWAY_TOKEN=<your-gateway-token> + +# Quick smoke test +clawbench run --model ollama/gpt-oss:20b --task t1-fs-quick-note --runs 1 + +# Tier-1 sweep with confidence intervals +clawbench run --model ollama/gpt-oss:20b --tier tier1 --runs 5 + +# Tier-2 sweep (run separately; the CLI accepts one --tier at a time) +clawbench run --model ollama/gpt-oss:20b --tier tier2 --runs 5 --concurrency 2 + +# Inspect the reference profile's fingerprint and historical neighbors +clawbench diagnose profiles/local_ollama_gpt_oss.yaml +``` + +**Reference contributor-side results** (gpt-oss:20b, RTX 4090, Docker sandbox, network=none): + +| Scope | Score | CI | Completion | Trajectory | Behavior | +|---|---|---|---|---|---| +| Tier-1 (6 tasks × 3 runs) | 0.397 | 0.346–0.447 | 0.056 | 0.522 | 1.000 | + +High trajectory/behavior but low completion — the model uses tools correctly +but writes to wrong paths or misses format constraints. 
This gap is where +profile-level improvements (workspace-aware prompts, path-checking pre-flight +calls, retry wrappers) have the most leverage. + ### Docker (recommended for reproducibility) ```bash diff --git a/app.py b/app.py index 6d5a7b3..429015d 100644 --- a/app.py +++ b/app.py @@ -983,7 +983,8 @@ with gr.Blocks(title="ClawBench", theme=clawbench_theme, css=CUSTOM_CSS) as demo gr.Markdown("### Submit a model for evaluation") gr.Markdown( "Select a preset or enter a custom model ID. Open-source models " - "run via HuggingFace Inference API. Proprietary models need model auth configured in the Space runtime." + "run via HuggingFace Inference API. You can also use locally hosted models " + "(for example Ollama) when your OpenClaw runtime has them configured." ) preset_input = gr.Dropdown( @@ -994,7 +995,7 @@ with gr.Blocks(title="ClawBench", theme=clawbench_theme, css=CUSTOM_CSS) as demo with gr.Row(): model_input = gr.Textbox( label="Custom Model ID (if not using preset)", - placeholder="e.g. huggingface/org/model-name", + placeholder="e.g. huggingface/org/model-name or ollama/gpt-oss:20b", scale=3, ) provider_input = gr.Textbox( diff --git a/profiles/local_ollama_gpt_oss.yaml b/profiles/local_ollama_gpt_oss.yaml new file mode 100644 index 0000000..9478872 --- /dev/null +++ b/profiles/local_ollama_gpt_oss.yaml @@ -0,0 +1,30 @@ +profile: + name: local-ollama-gpt-oss + base_model: ollama/gpt-oss:20b + notes: | + Reference profile metadata for local Ollama experiments. Change + base_model and tools_allow to match your setup, then use + `clawbench diagnose` to inspect the fingerprint and attach your + benchmark results separately in a PR. 
+ + Contributor-side baseline (gpt-oss:20b, RTX 4090, Docker sandbox, + network=none): + Tier-1+2: 0.467 (CI 0.375-0.545), T=0.666, B=0.850, C=0.202 + plugins: + enabled: + - ollama + - id: memory-lancedb + config: + dimensions: 1536 + - browser-playwright + slots: + memory: memory-lancedb + contextEngine: builtin + tools_allow: + - bash + - file_read + - file_edit + - browser_navigate + - browser_click + - memory_read + - memory_write