From 595cdc910cf3288e78e2bebe1d409ec0ca1c8ea0 Mon Sep 17 00:00:00 2001
From: scoootscooob <zhentongfan@gmail.com>
Date: Thu, 23 Apr 2026 12:40:23 -0700
Subject: [PATCH] Add public domain scaffold and adapter diagnostics

---
 README.md                    |   4 +-
 clawbench/cli.py             |  35 +++++++-
 clawbench/diagnose_cli.py    |  52 ++++++++++-
 clawbench/harness.py         |  19 ++++
 tasks-domain/MANIFEST.yaml   | 163 +++++++++++++++++++++++++++++++++++
 tasks-domain/README.md       |  59 +++++++++++++
 tasks-public/MANIFEST.yaml   |   6 +-
 tasks-public/README.md       |  10 +--
 tests/test_harness.py        |  52 +++++++++++
 tests/test_v05_extensions.py |  17 ++++
 10 files changed, 402 insertions(+), 15 deletions(-)
 create mode 100644 tasks-domain/MANIFEST.yaml
 create mode 100644 tasks-domain/README.md

diff --git a/README.md b/README.md
index 17c6598..44c377f 100644
--- a/README.md
+++ b/README.md
@@ -245,9 +245,9 @@ Core v1 is a signal-curated public release of 19 tasks from the internal 40-task
 | Tier | Core v1 count | What it tests | Examples |
 |------|:---:|---|---|
 | **Tier 1** | 2 | Single-tool basics | Bugfix discount calc, quick file note |
-| **Tier 2** | 7 | Multi-step, 2-3 tools | Config loader repair, browser form fix, priv redaction |
+| **Tier 2** | 6 | Multi-step, 2-3 tools | Config loader repair, browser form fix, priv redaction |
 | **Tier 3** | 5 | Complex orchestration | SQL query analysis, inbox triage, data pipeline report |
-| **Tier 4** | 4 | Cross-system reasoning | Cross-repo migration, delegation repair, browser research+code |
+| **Tier 4** | 5 | Cross-system reasoning | Cross-repo migration, delegation repair, memory continuation, browser research+code |
 | **Tier 5** | 1 | Adversarial | Hallucination-resistant evidence |
 
 Full manifest: [`tasks-public/MANIFEST.yaml`](tasks-public/MANIFEST.yaml).
diff --git a/clawbench/cli.py b/clawbench/cli.py
index 523e42d..616e963 100644
--- a/clawbench/cli.py
+++ b/clawbench/cli.py
@@ -10,7 +10,7 @@ from pathlib import Path
 import click
 
 from clawbench.client import GatewayConfig
-from clawbench.harness import BenchmarkHarness
+from clawbench.harness import BenchmarkHarness, KNOWN_ADAPTERS
 
 SCENARIO_CHOICES = [
     "file_system_ops",
@@ -41,6 +41,13 @@ def cli(verbose: bool) -> None:
 
 @cli.command()
 @click.option("--model", "-m", required=True, help="Model to benchmark")
+@click.option(
+    "--adapter",
+    type=click.Choice(KNOWN_ADAPTERS),
+    default="openclaw",
+    show_default=True,
+    help="Agent harness adapter. OpenClaw is executable today; other adapters are tracked targets.",
+)
 @click.option("--gateway-token", envvar="OPENCLAW_GATEWAY_TOKEN", default="", help="Gateway auth token")
 @click.option(
     "--judge-model",
@@ -123,6 +130,7 @@ def cli(verbose: bool) -> None:
 )
 def run(
     model: str,
+    adapter: str,
     gateway_token: str,
     judge_model: str,
     runs: int,
@@ -148,6 +156,7 @@ def run(
     harness = BenchmarkHarness(
         gateway_config=gateway_config,
         model=model,
+        adapter=adapter,
         judge_model=judge_model,
         runs_per_task=runs,
         tier=tier,
@@ -178,6 +187,7 @@ def run(
         _run_v05_diagnostic(
             profile_path=profile,
             result=result,
+            task_runs=harness.last_task_runs,
             runs_per_task=runs,
             insights_dir=insights_dir,
         )
@@ -269,6 +279,7 @@ def _run_v05_diagnostic(
     *,
     profile_path: Path,
     result,
+    task_runs: dict[str, list] | None,
     runs_per_task: int,
     insights_dir: Path,
 ) -> None:
@@ -278,6 +289,7 @@ def _run_v05_diagnostic(
         DEFAULT_MANIFEST_DIR,
         DEFAULT_SUBMISSIONS_DIR,
         ensure_data_dirs,
+        infer_registration_traces_from_manifests,
         load_manifests,
         write_submission_record,
     )
@@ -291,6 +303,7 @@ def _run_v05_diagnostic(
     plugin_profile = PluginProfile.from_yaml_file(profile_path)
     plugin_ids = [e.id for e in plugin_profile.plugins]
     manifests = load_manifests(DEFAULT_MANIFEST_DIR, plugin_ids)
+    traces = infer_registration_traces_from_manifests(plugin_profile, manifests)
     db = HistoricalDatabase(path=DEFAULT_DB_PATH)
 
     # Extract per-task scores + tier map from the BenchmarkResult
@@ -301,12 +314,16 @@ def _run_v05_diagnostic(
         if getattr(task_stats, "tier", ""):
             tier_of[task_stats.task_id] = task_stats.tier
 
+    transcripts = _merge_task_transcripts_from_runs(task_runs or {})
+
     diagnostic = submit_run(
         profile=plugin_profile,
         manifests=manifests,
         db=db,
         actual_overall_score=float(result.overall_score),
         actual_per_task_scores=actual_per_task,
+        traces=traces,
+        transcripts=transcripts,
         tier_of=tier_of or None,
         n_runs_contributing=runs_per_task,
     )
@@ -329,6 +346,22 @@ def _run_v05_diagnostic(
     )
 
 
+def _merge_task_transcripts_from_runs(task_runs: dict[str, list]):
+    """Return per-task Transcripts pooling every run's messages, or None if empty."""
+    if not task_runs:
+        return None
+    from clawbench.schemas import Transcript
+
+    combined: dict[str, Transcript] = {}
+    for task_id, run_results in task_runs.items():
+        pooled = Transcript()
+        for run_result in run_results:
+            pooled.messages.extend(getattr(run_result.transcript, "messages", []))
+        if pooled.messages:
+            combined[task_id] = pooled
+    return combined if combined else None
+
+
 @cli.command()
 @click.argument("profile", type=click.Path(exists=True, path_type=Path))
 @click.option(
diff --git a/clawbench/diagnose_cli.py b/clawbench/diagnose_cli.py
index b13f6ce..548233a 100644
--- a/clawbench/diagnose_cli.py
+++ b/clawbench/diagnose_cli.py
@@ -37,7 +37,8 @@ from clawbench.diagnostic import build_diagnostic, submit_run
 from clawbench.insights import publish_insights
 from clawbench.prediction import HistoricalDatabase
 from clawbench.profile import PluginManifest, PluginProfile, RegistrationTrace
-from clawbench.schemas import Transcript
+from clawbench.schemas import ToolCall, Transcript
+from clawbench.trajectory import classify_tool_call
 
 
 DEFAULT_CLAWBENCH_ROOT = Path(".clawbench")
@@ -80,6 +81,39 @@ def load_transcripts(path: Path) -> dict[str, Transcript]:
     return out
 
 
+def infer_registration_traces_from_manifests(
+    profile: PluginProfile,
+    manifests: dict[str, PluginManifest],
+) -> dict[str, RegistrationTrace]:
+    """Build best-effort registration traces from manifest-declared tools.
+
+    Full runtime registration traces are better because they include hooks,
+    gateway methods, routes, and services. This fallback still gives the
+    diagnostic layer exact manifest-declared tool names, which is enough to
+    attribute many transcript tool calls instead of dropping all utilization
+    into the unassigned bucket. Profile plugins without a loaded manifest
+    are skipped; blank or null tool declarations are ignored.
+    """
+    traces: dict[str, RegistrationTrace] = {}
+    for entry in profile.plugins:
+        manifest = manifests.get(entry.id)
+        if manifest is None:
+            continue
+        # ``or []`` tolerates an explicit null ``tools`` value; blank names are
+        # dropped once so ``tools`` and the derived families always agree.
+        declared = manifest.contracts.get("tools") or []
+        tools = [tool for tool in declared if tool]
+        families = sorted(
+            {classify_tool_call(ToolCall(name=tool))[0] for tool in tools}
+        )
+        traces[entry.id] = RegistrationTrace(
+            plugin_id=entry.id,
+            tools=tools,
+            tool_families_seen=families,
+        )
+    return traces
+
+
 def write_submission_record(
     submissions_dir: Path, fingerprint_hash: str, report_dict: dict
 ) -> Path:
@@ -162,6 +196,7 @@ def main() -> None:
     profile = PluginProfile.from_yaml_file(args.profile)
     plugin_ids = [e.id for e in profile.plugins]
     manifests = load_manifests(args.manifests, plugin_ids)
+    traces = infer_registration_traces_from_manifests(profile, manifests)
     db = HistoricalDatabase(path=args.db)
 
     actual_overall: float | None = None
@@ -172,9 +207,16 @@ def main() -> None:
             sys.exit(2)
         results_data = json.loads(args.results.read_text(encoding="utf-8"))
         actual_overall = float(results_data.get("overall_score", 0.0))
-        actual_per_task = {
-            k: float(v) for k, v in results_data.get("per_task_score", {}).items()
-        }
+        if "per_task_score" in results_data:
+            actual_per_task = {
+                k: float(v) for k, v in results_data.get("per_task_score", {}).items()
+            }
+        else:
+            actual_per_task = {
+                str(item.get("task_id")): float(item.get("mean_task_score", 0.0))
+                for item in results_data.get("task_results", [])
+                if item.get("task_id")
+            }
 
     transcripts: dict[str, Transcript] | None = None
     if args.transcripts:
@@ -208,6 +250,7 @@ def main() -> None:
             db=db,
             actual_overall_score=actual_overall,
             actual_per_task_scores=actual_per_task,
+            traces=traces,
             transcripts=transcripts,
             tier_of=tier_of,
         )
@@ -223,6 +266,7 @@ def main() -> None:
             db=db,
             actual_overall_score=actual_overall,
             actual_per_task_scores=actual_per_task,
+            traces=traces,
             transcripts=transcripts,
             tier_of=tier_of,
         )
diff --git a/clawbench/harness.py b/clawbench/harness.py
index b955aa6..cb416df 100644
--- a/clawbench/harness.py
+++ b/clawbench/harness.py
@@ -40,6 +40,9 @@ from clawbench.tasks import get_assets_dir, load_all_tasks
 logger = logging.getLogger(__name__)
 console = Console()
 
+KNOWN_ADAPTERS = ("openclaw", "hermes", "codex", "claude-code")  # every adapter name the harness recognizes
+EXECUTABLE_ADAPTERS = {"openclaw"}  # adapters run() executes; other known ones raise ValueError
+
 
 class _NullCtx:
     """A no-op async context manager used to skip the browser semaphore
@@ -79,6 +82,7 @@ class BenchmarkHarness:
         quiet: bool = False,
         concurrency: int = 1,
         browser_concurrency: int = 1,
+        adapter: str = "openclaw",
     ) -> None:
         self.gateway_config = gateway_config
         self.model = model
@@ -102,10 +106,21 @@ class BenchmarkHarness:
         self.quiet = quiet
         self.concurrency = max(1, int(concurrency))
         self.browser_concurrency = max(1, int(browser_concurrency))
+        self.adapter = adapter
         self.repo_root = Path(__file__).parent.parent
         self.last_task_runs: dict[str, list[TaskRunResult]] = {}
 
     async def run(self) -> BenchmarkResult:
+        if self.adapter not in KNOWN_ADAPTERS:
+            raise ValueError(
+                f"Unknown adapter '{self.adapter}'. Known adapters: {', '.join(KNOWN_ADAPTERS)}"
+            )
+        if self.adapter not in EXECUTABLE_ADAPTERS:
+            raise ValueError(
+                f"Adapter '{self.adapter}' is registered as a target but is not yet wired "
+                "into the end-to-end scoring harness. Use 'openclaw' for executable runs."
+            )
+
         tasks = load_all_tasks(
             tasks_dir=self.tasks_dir,
             tier=self.tier,
@@ -129,6 +144,7 @@ class BenchmarkHarness:
         if not self.quiet:
             console.print(f"\n[bold]ClawBench v{__version__}[/bold] — {len(tasks)} tasks x {self.runs_per_task} runs")
             console.print(f"Model: [cyan]{self.model}[/cyan]")
+            console.print(f"Adapter: [cyan]{self.adapter}[/cyan]")
             if self.judge_model:
                 console.print(f"Advisory judge: [magenta]{self.judge_model}[/magenta]")
             mode = "serial" if self.concurrency == 1 else f"parallel(concurrency={self.concurrency}, browser={self.browser_concurrency})"
@@ -726,6 +742,9 @@ class BenchmarkHarness:
                 "artifact_type": self.artifact_type or "all",
                 "prompt_variant": self.prompt_variant,
                 "judge_model": self.judge_model,
+                "adapter": self.adapter,
+                "known_adapters": list(KNOWN_ADAPTERS),
+                "executable_adapters": sorted(EXECUTABLE_ADAPTERS),
                 "subsets": self.subsets,
                 "capabilities": self.capabilities,
                 "official_only": self.official_only,
diff --git a/tasks-domain/MANIFEST.yaml b/tasks-domain/MANIFEST.yaml
new file mode 100644
index 0000000..bcd1499
--- /dev/null
+++ b/tasks-domain/MANIFEST.yaml
@@ -0,0 +1,163 @@
+manifest_version: 1
+release: clawbench-domain-v0
+status: scaffold
+purpose: |
+  Domain coverage scaffold for proving that model + general harness + plugins
+  covers the jobs served by most agent SaaS products. This is not the small
+  public Core v1 benchmark. It is the planned expansion corpus.
+
+relationship_to_core_v1: |
+  tasks-public/Core v1 is the public, signal-curated reproducibility set.
+  tasks-domain is the domain coverage and ablation suite. Core v1 can stay
+  small; domain coverage should grow through templates and private variants.
+
+domains:
+  - id: crm
+    label: CRM
+    representative_jobs:
+      - lead enrichment
+      - account update from meeting notes
+      - opportunity risk summary
+      - duplicate contact cleanup
+      - follow-up task creation
+    plugin_requirements: [browser, crm_api, docs, search, memory]
+    verifier_contracts: [api_state, structured_artifact, cited_evidence]
+
+  - id: support
+    label: Support
+    representative_jobs:
+      - ticket triage
+      - macro draft with policy evidence
+      - escalation routing
+      - refund eligibility lookup
+      - customer timeline summary
+    plugin_requirements: [browser, support_api, knowledge_base, email]
+    verifier_contracts: [api_state, policy_match, cited_evidence]
+
+  - id: email_calendar
+    label: Email and calendar
+    representative_jobs:
+      - thread summarization
+      - meeting scheduling
+      - follow-up drafting
+      - conflict detection
+      - contact-aware prioritization
+    plugin_requirements: [email, calendar, contacts, memory]
+    verifier_contracts: [calendar_state, draft_content, no_duplicate_state]
+
+  - id: docs_sheets_slides
+    label: Docs, sheets, slides
+    representative_jobs:
+      - spreadsheet cleanup
+      - deck update
+      - document redaction
+      - chart generation
+      - report formatting
+    plugin_requirements: [filesystem, spreadsheet, document, slides, charting]
+    verifier_contracts: [file_structure, rendered_diff, formula_check]
+
+  - id: project_management
+    label: Project management
+    representative_jobs:
+      - issue grooming
+      - sprint status update
+      - dependency tracking
+      - stale task cleanup
+      - launch checklist synthesis
+    plugin_requirements: [pm_api, repo, docs, notifications]
+    verifier_contracts: [api_state, link_integrity, dependency_state]
+
+  - id: finance_ops
+    label: Finance ops
+    representative_jobs:
+      - invoice reconciliation
+      - expense categorization
+      - budget variance report
+      - payment exception triage
+      - tax document checklist
+    plugin_requirements: [spreadsheet, accounting_api, document, ocr]
+    verifier_contracts: [numeric_tolerance, ledger_delta, audit_trail]
+
+  - id: data_analytics
+    label: Data analytics
+    representative_jobs:
+      - SQL answer
+      - dashboard explanation
+      - ETL patch
+      - anomaly investigation
+      - chart specification
+    plugin_requirements: [database, notebook, filesystem, bi_api]
+    verifier_contracts: [query_result, execution_check, chart_spec]
+
+  - id: security_admin
+    label: Security admin
+    representative_jobs:
+      - access review
+      - incident timeline
+      - secret rotation plan
+      - policy exception review
+      - audit log evidence packet
+    plugin_requirements: [identity_api, logs, repo, policy_docs]
+    verifier_contracts: [policy_state, cited_logs, refusal_gate]
+
+  - id: ecommerce_ops
+    label: Ecommerce ops
+    representative_jobs:
+      - catalog update
+      - order exception handling
+      - promo QA
+      - inventory reconciliation
+      - returns policy response
+    plugin_requirements: [storefront_api, spreadsheet, browser, email]
+    verifier_contracts: [api_state, price_check, order_state]
+
+  - id: devtools
+    label: Devtools
+    representative_jobs:
+      - repo migration
+      - CI failure repair
+      - release note generation
+      - dependency update
+      - multi-repo contract change
+    plugin_requirements: [shell, git, filesystem, package_registry]
+    verifier_contracts: [test_pass, diff_assertion, changelog_check]
+
+  - id: research
+    label: Research
+    representative_jobs:
+      - evidence memo
+      - citation synthesis
+      - source contradiction handling
+      - market scan
+      - literature extraction
+    plugin_requirements: [browser, web_search, web_fetch, document]
+    verifier_contracts: [citation_check, no_fabrication, source_coverage]
+
+  - id: personal_ops
+    label: Personal ops
+    representative_jobs:
+      - travel planning
+      - household planning
+      - health admin summary
+      - personal finance checklist
+      - recurring reminder setup
+    plugin_requirements: [calendar, browser, memory, document]
+    verifier_contracts: [constraint_satisfaction, state_transition, refusal_gate]
+
+release_targets:
+  domain_count: 12
+  templates_per_domain: 5
+  private_variants_per_template: 3
+  runs_per_configuration: 3
+  public_templates_total: 60
+  private_variants_total: 180
+
+ablation_classes:
+  - id: model_only
+    description: Model with minimal shell/filesystem access.
+  - id: model_plus_harness
+    description: Model plus general OpenClaw-style harness, no domain plugins.
+  - id: core_plugins
+    description: Harness plus common browser, memory, filesystem, and execution plugins.
+  - id: domain_plugins
+    description: Harness plus the plugins needed for each domain state surface.
diff --git a/tasks-domain/README.md b/tasks-domain/README.md
new file mode 100644
index 0000000..3112a98
--- /dev/null
+++ b/tasks-domain/README.md
@@ -0,0 +1,59 @@
+# ClawBench Domain Suite
+
+`tasks-public/` is the small public Core v1 set. `tasks-domain/` is the
+coverage scaffold for the larger proof corpus: the domains served by most
+agent SaaS products, expressed as deterministically verifiable benchmark tasks.
+
+The claim this suite is meant to support is:
+
+> A capable model plus a general agent harness plus the right plugins can
+> cover the task domains that most agent SaaS products sell.
+
+This is intentionally not a clone of vendor products. It is a taxonomy of
+jobs, state transitions, and verifier contracts.
+
+## Domains
+
+| Domain | Representative jobs | Required plugin surface | Verification style |
+|---|---|---|---|
+| CRM | lead enrichment, account updates, meeting notes to opportunities | browser, CRM API, docs, search | API state assertions, fixture diffs |
+| Support | ticket triage, macro draft, escalation, refund lookup | browser/API, knowledge base, email | ticket state, cited evidence, policy checks |
+| Email and calendar | thread summarization, scheduling, follow-ups | mail, calendar, contacts, memory | event state, draft content, no-duplicate checks |
+| Docs, sheets, slides | spreadsheet cleanup, deck edits, document redaction | file, office docs, charting | structural file assertions, rendered diffs |
+| Project management | issue grooming, sprint updates, dependency tracking | PM API, repo, docs, notifications | issue state, links, blocked/unblocked status |
+| Finance ops | invoice reconciliation, expense coding, budget variance | spreadsheets, accounting API, OCR | ledger deltas, numeric tolerances, audit trail |
+| Data analytics | SQL, dashboard explanation, ETL patch, anomaly report | database, notebooks, BI API | query results, chart spec, report content |
+| Security admin | access review, incident timeline, secret rotation plan | identity, logs, repo, policy docs | policy state, log-derived evidence, refusal gates |
+| Ecommerce ops | catalog updates, order exception handling, promo QA | storefront API, spreadsheet, browser | product state, order workflow, price checks |
+| Devtools | repo migration, CI fix, release note, dependency update | shell, git, code, package registry | test pass, diff assertions, changelog checks |
+| Research | web evidence, citation synthesis, source contradiction | browser, web search, docs | citation verifier, no-fabrication checks |
+| Personal ops | travel, household planning, health/wellness admin | calendar, browser, memory, docs | constraint satisfaction, state updates |
+
+## Proof Standard
+
+Each domain task should declare:
+
+- `domain`: one of the domains above
+- `job`: the user-facing job being covered
+- `saas_equivalents`: examples of products whose core workflow overlaps
+- `plugin_requirements`: tool families and state surfaces needed
+- `deterministic_floor`: the verifier that must pass before any judge score
+- `holdout_variant_policy`: how private variants are generated
+- `ablation_axis`: which plugins or harness capabilities the task tests
+
+## Minimum Bar
+
+For a credible first domain release:
+
+- 12 domains
+- 5 task templates per domain
+- 3 private variants per template
+- 3 runs per configuration
+- at least 4 configuration classes:
+  - model only
+  - model plus harness
+  - model plus harness plus core plugins
+  - model plus harness plus domain plugins
+
+That yields 60 public templates (12 × 5) and 180 private variants (60 × 3) before repetitions.
+The public templates explain coverage; the private variants carry the proof.
diff --git a/tasks-public/MANIFEST.yaml b/tasks-public/MANIFEST.yaml
index 72569cf..eadedfc 100644
--- a/tasks-public/MANIFEST.yaml
+++ b/tasks-public/MANIFEST.yaml
@@ -43,12 +43,12 @@ selection_basis:
 coverage:
   tiers:
     tier1: 2
-    tier2: 7
+    tier2: 6
     tier3: 5
-    tier4: 4
+    tier4: 5
     tier5: 1
   families:
-    tools: 7
+    tools: 8
     coding: 2
     repo: 3
     browser: 2
diff --git a/tasks-public/README.md b/tasks-public/README.md
index 80ddd3b..ffa35d8 100644
--- a/tasks-public/README.md
+++ b/tasks-public/README.md
@@ -33,9 +33,9 @@ against your own configuration.
 
 | Dimension | Breakdown |
 |---|---|
-| Tiers | T1=2, T2=7, T3=5, T4=4, T5=1 |
-| Families | tools=7, coding=2, repo=3, browser=2, multi_tool=3, adversarial=1 |
-| Capabilities | bugfix, refactor, test_authoring, multifile_reasoning, browser_debugging, structured_output, graceful_refusal, delegation, tool_composition, research_synthesis, cross_repo_change, memory_continuation |
+| Tiers | T1=2, T2=6, T3=5, T4=5, T5=1 |
+| Families | tools=8, coding=2, repo=3, browser=2, multi_tool=3, adversarial=1 |
+| Capabilities | bugfix, test_authoring, multifile_reasoning, browser_debugging, structured_output, graceful_refusal, delegation, tool_composition, research_synthesis, cross_repo_change, memory_continuation |
 
 ## Directory layout
 
@@ -44,9 +44,9 @@ tasks-public/
 ├── MANIFEST.yaml          # Machine-readable task list + metadata
 ├── README.md              # This file
 ├── tier1/                 # 2 task YAMLs
-├── tier2/                 # 7 task YAMLs
+├── tier2/                 # 6 task YAMLs
 ├── tier3/                 # 5 task YAMLs
-├── tier4/                 # 4 task YAMLs
+├── tier4/                 # 5 task YAMLs
 ├── tier5/                 # 1 task YAML
 └── assets/                # 19 asset packs (verifier scripts + fixtures)
 ```
diff --git a/tests/test_harness.py b/tests/test_harness.py
index 93ab66b..85227a4 100644
--- a/tests/test_harness.py
+++ b/tests/test_harness.py
@@ -163,3 +163,55 @@ def test_compose_result_from_task_stats_supports_parallel_environment_metadata()
     assert merged_result.environment["parallel_lanes"] == 2
     assert merged_result.environment["requested_parallel_lanes"] == 3
     assert merged_result.environment["browser_tasks_serialized"] is False
+
+
+@pytest.mark.asyncio
+async def test_run_records_adapter_surface(monkeypatch):
+    """A stubbed openclaw run reports the adapter surface in its environment."""
+    only_task = next(item for item in load_all_tasks() if item.id == "t1-bugfix-discount")
+
+    async def fake_run_single(self, current_task, run_index: int):
+        passing = CompletionResult(total_assertions=1, passed_assertions=1, score=1.0)
+        return TaskRunResult(
+            task_id=current_task.id,
+            tier=current_task.tier.value,
+            family=current_task.family.value,
+            run_index=run_index,
+            run_score=1.0,
+            completion_result=passing,
+        )
+
+    monkeypatch.setattr(BenchmarkHarness, "_run_single", fake_run_single)
+    monkeypatch.setattr("clawbench.harness.load_all_tasks", lambda **_: [only_task])
+    harness_kwargs = dict(
+        gateway_config=GatewayConfig(),
+        model="test-model",
+        adapter="openclaw",
+        runs_per_task=1,
+        randomize_order=False,
+        print_report=False,
+        quiet=True,
+    )
+    environment = (await BenchmarkHarness(**harness_kwargs).run()).environment
+
+    assert environment["adapter"] == "openclaw"
+    assert "hermes" in environment["known_adapters"]
+
+
+@pytest.mark.asyncio
+async def test_run_rejects_registered_but_unwired_adapter(monkeypatch):
+    """Running with a known but non-executable adapter raises ValueError."""
+    only_task = next(item for item in load_all_tasks() if item.id == "t1-bugfix-discount")
+    monkeypatch.setattr("clawbench.harness.load_all_tasks", lambda **_: [only_task])
+    harness_kwargs = dict(
+        gateway_config=GatewayConfig(),
+        model="test-model",
+        adapter="hermes",
+        runs_per_task=1,
+        randomize_order=False,
+        print_report=False,
+        quiet=True,
+    )
+    unwired = BenchmarkHarness(**harness_kwargs)
+    with pytest.raises(ValueError, match="not yet wired"):
+        await unwired.run()
diff --git a/tests/test_v05_extensions.py b/tests/test_v05_extensions.py
index 1b9a290..e3cff45 100644
--- a/tests/test_v05_extensions.py
+++ b/tests/test_v05_extensions.py
@@ -23,6 +23,7 @@ from pathlib import Path
 sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
 
 from clawbench.diagnostic import build_diagnostic, submit_run
+from clawbench.diagnose_cli import infer_registration_traces_from_manifests
 from clawbench.factor_analysis import analyze
 from clawbench.insights import (
     compute_capability_gaps,
@@ -139,6 +140,22 @@ def test_taguchi_sn_handles_zero_score_without_crashing():
 # ---------------------------------------------------------------------------
 
 
+def test_infer_registration_traces_from_manifests_uses_declared_tools():
+    """Only plugins with a manifest get a trace, built from its declared tools."""
+    declared_tools = ["read_file", "browser_click", "memory_write"]
+    two_plugin_profile = _make_profile("p", ["alpha", "missing"])
+    available = {
+        "alpha": _make_manifest("alpha", tools=list(declared_tools)),
+    }
+
+    inferred = infer_registration_traces_from_manifests(two_plugin_profile, available)
+
+    assert set(inferred) == {"alpha"}
+    alpha_trace = inferred["alpha"]
+    assert alpha_trace.tools == declared_tools
+    assert alpha_trace.tool_families_seen == ["browser", "memory", "read"]
+
+
 def test_audit_flags_dead_weight_plugin():
     profile = _make_profile("p", ["alpha", "beta"])
     manifests = {