From ee8ff7934773bbbbf41aba10a68193faa75b9657 Mon Sep 17 00:00:00 2001
From: scoootscooob <zhentongfan@gmail.com>
Date: Thu, 16 Apr 2026 19:49:04 -0700
Subject: [PATCH] docs: fix ollama profile guidance

---
 README.md                          | 22 +++++++++++-----------
 profiles/local_ollama_gpt_oss.yaml | 10 ++++++----
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 0ea4c56..b4ca88b 100644
--- a/README.md
+++ b/README.md
@@ -258,11 +258,11 @@ A single consumer GPU running an open-weight model through
 algorithmic ideas, and submit scored results — no API keys or cloud spend
 required.
 
-Profiles tested locally can be submitted as pull requests. The official
-ClawBench CI re-evaluates merged profiles against frontier models, so
-researchers and small teams can contribute configurations and novel
-strategies (tool-routing, memory architectures, prompt scaffolding) while
-the project handles the expensive runs.
+Profiles tested locally can still be submitted as pull requests with
+reference results. The built-in GitHub Actions workflows in this repo only
+run the test suite and deployment sync, so treat local Ollama numbers as
+contributor-side evidence unless a maintainer separately reruns them on
+other infrastructure.
 
 ```bash
 # Pull a model and set your gateway token
@@ -275,18 +275,18 @@ clawbench run --model ollama/gpt-oss:20b --task t1-fs-quick-note --runs 1
 # Tier-1 sweep with confidence intervals
 clawbench run --model ollama/gpt-oss:20b --tier tier1 --runs 5
 
-# Full local eval with a plugin profile (see profiles/ for examples)
-clawbench run --model ollama/gpt-oss:20b \
-  --profile profiles/local_ollama_gpt_oss.yaml \
-  --tier tier1 --tier tier2 --runs 5 --concurrency 2
+# Tier-2 sweep (run separately; the CLI accepts one --tier at a time)
+clawbench run --model ollama/gpt-oss:20b --tier tier2 --runs 5 --concurrency 2
+
+# Inspect the reference profile's fingerprint and historical neighbors
+clawbench diagnose profiles/local_ollama_gpt_oss.yaml
 ```
 
-**Reference results** (gpt-oss:20b, RTX 4090, Docker sandbox, network=none):
+**Reference contributor-side results** (gpt-oss:20b, RTX 4090, Docker sandbox, network=none):
 
 | Scope | Score | CI | Completion | Trajectory | Behavior |
 |---|---|---|---|---|---|
 | Tier-1 (6 tasks × 3 runs) | 0.397 | 0.346–0.447 | 0.056 | 0.522 | 1.000 |
-| Tier-1+2 (14 tasks × 5 runs) | 0.467 | 0.375–0.545 | 0.202 | 0.666 | 0.850 |
 
 High trajectory/behavior but low completion — the model uses tools correctly
 but writes to wrong paths or misses format constraints. This gap is where
diff --git a/profiles/local_ollama_gpt_oss.yaml b/profiles/local_ollama_gpt_oss.yaml
index 0517d9b..9478872 100644
--- a/profiles/local_ollama_gpt_oss.yaml
+++ b/profiles/local_ollama_gpt_oss.yaml
@@ -2,11 +2,13 @@ profile:
   name: local-ollama-gpt-oss
   base_model: ollama/gpt-oss:20b
   notes: |
-    Reference profile for local Ollama models. Change base_model and
-    tools_allow to match your setup, run with --profile, and submit
-    your results as a PR.
+    Reference profile metadata for local Ollama experiments. Change
+    base_model and tools_allow to match your setup, then use
+    `clawbench diagnose` to inspect the fingerprint. Attach your
+    benchmark results separately in a PR.
 
-    Baseline (gpt-oss:20b, RTX 4090, Docker sandbox, network=none):
+    Contributor-side baseline (gpt-oss:20b, RTX 4090, Docker sandbox,
+    network=none):
       Tier-1+2: 0.467 (CI 0.375-0.545), T=0.666, B=0.850, C=0.202
   plugins:
     enabled: