Add public domain scaffold and adapter diagnostics

2026-04-23 12:40:23 -07:00 · 2026-04-23 12:40:23 -07:00 · 595cdc910c
commit 595cdc910c
parent df32a5f073
10 changed files with 402 additions and 15 deletions
--- a/README.md
+++ b/README.md
@ -245,9 +245,9 @@ Core v1 is a signal-curated public release of 19 tasks from the internal 40-task
 | Tier | Core v1 count | What it tests | Examples |
 |------|:---:|---|---|
 | **Tier 1** | 2 | Single-tool basics | Bugfix discount calc, quick file note |
-| **Tier 2** | 7 | Multi-step, 2-3 tools | Config loader repair, browser form fix, priv redaction |
+| **Tier 2** | 6 | Multi-step, 2-3 tools | Config loader repair, browser form fix, priv redaction |
 | **Tier 3** | 5 | Complex orchestration | SQL query analysis, inbox triage, data pipeline report |
-| **Tier 4** | 4 | Cross-system reasoning | Cross-repo migration, delegation repair, browser research+code |
+| **Tier 4** | 5 | Cross-system reasoning | Cross-repo migration, delegation repair, memory continuation, browser research+code |
 | **Tier 5** | 1 | Adversarial | Hallucination-resistant evidence |

 Full manifest: [`tasks-public/MANIFEST.yaml`](tasks-public/MANIFEST.yaml).
--- a/clawbench/cli.py
+++ b/clawbench/cli.py
@ -10,7 +10,7 @@ from pathlib import Path
 import click

 from clawbench.client import GatewayConfig
-from clawbench.harness import BenchmarkHarness
+from clawbench.harness import BenchmarkHarness, KNOWN_ADAPTERS

 SCENARIO_CHOICES = [
    "file_system_ops",
@ -41,6 +41,13 @@ def cli(verbose: bool) -> None:

@cli.command()
@click.option("--model", "-m", required=True, help="Model to benchmark")
+@click.option(
+    "--adapter",
+    type=click.Choice(KNOWN_ADAPTERS),
+    default="openclaw",
+    show_default=True,
+    help="Agent harness adapter. OpenClaw is executable today; other adapters are tracked targets.",
+)
@click.option("--gateway-token", envvar="OPENCLAW_GATEWAY_TOKEN", default="", help="Gateway auth token")
@click.option(
    "--judge-model",
@ -123,6 +130,7 @@ def cli(verbose: bool) -> None:
 )
 def run(
    model: str,
+    adapter: str,
    gateway_token: str,
    judge_model: str,
    runs: int,
@ -148,6 +156,7 @@ def run(
    harness = BenchmarkHarness(
        gateway_config=gateway_config,
        model=model,
+        adapter=adapter,
        judge_model=judge_model,
        runs_per_task=runs,
        tier=tier,
@ -178,6 +187,7 @@ def run(
        _run_v05_diagnostic(
            profile_path=profile,
            result=result,
+            task_runs=harness.last_task_runs,
            runs_per_task=runs,
            insights_dir=insights_dir,
        )
@ -269,6 +279,7 @@ def _run_v05_diagnostic(
    *,
    profile_path: Path,
    result,
+    task_runs: dict[str, list] | None,
    runs_per_task: int,
    insights_dir: Path,
 ) -> None:
@ -278,6 +289,7 @@ def _run_v05_diagnostic(
        DEFAULT_MANIFEST_DIR,
        DEFAULT_SUBMISSIONS_DIR,
        ensure_data_dirs,
+        infer_registration_traces_from_manifests,
        load_manifests,
        write_submission_record,
    )
@ -291,6 +303,7 @@ def _run_v05_diagnostic(
    plugin_profile = PluginProfile.from_yaml_file(profile_path)
    plugin_ids = [e.id for e in plugin_profile.plugins]
    manifests = load_manifests(DEFAULT_MANIFEST_DIR, plugin_ids)
+    traces = infer_registration_traces_from_manifests(plugin_profile, manifests)
    db = HistoricalDatabase(path=DEFAULT_DB_PATH)

    # Extract per-task scores + tier map from the BenchmarkResult
@ -301,12 +314,16 @@ def _run_v05_diagnostic(
        if getattr(task_stats, "tier", ""):
            tier_of[task_stats.task_id] = task_stats.tier

+    transcripts = _merge_task_transcripts_from_runs(task_runs or {})
+
    diagnostic = submit_run(
        profile=plugin_profile,
        manifests=manifests,
        db=db,
        actual_overall_score=float(result.overall_score),
        actual_per_task_scores=actual_per_task,
+        traces=traces,
+        transcripts=transcripts,
        tier_of=tier_of or None,
        n_runs_contributing=runs_per_task,
    )
@ -329,6 +346,22 @@ def _run_v05_diagnostic(
    )


+def _merge_task_transcripts_from_runs(task_runs: dict[str, list]):
+    """Combine each task's per-run transcripts for the v0.5 utilization audit.
+
+    Messages from every run of a task are concatenated, in run order, into a
+    single ``Transcript``. Tasks that end up with no messages are left out,
+    and ``None`` is returned when nothing at all was collected.
+    """
+    if not task_runs:
+        return None
+    from clawbench.schemas import Transcript
+
+    by_task: dict[str, Transcript] = {}
+    for task_id, runs in task_runs.items():
+        combined = Transcript()
+        for single_run in runs:
+            # A run whose transcript has no ``messages`` attribute adds nothing.
+            run_messages = getattr(single_run.transcript, "messages", [])
+            combined.messages.extend(run_messages)
+        if not combined.messages:
+            continue
+        by_task[task_id] = combined
+    return by_task if by_task else None
+
+
@cli.command()
@click.argument("profile", type=click.Path(exists=True, path_type=Path))
@click.option(
--- a/clawbench/diagnose_cli.py
+++ b/clawbench/diagnose_cli.py
@ -37,7 +37,8 @@ from clawbench.diagnostic import build_diagnostic, submit_run
 from clawbench.insights import publish_insights
 from clawbench.prediction import HistoricalDatabase
 from clawbench.profile import PluginManifest, PluginProfile, RegistrationTrace
-from clawbench.schemas import Transcript
+from clawbench.schemas import ToolCall, Transcript
+from clawbench.trajectory import classify_tool_call


 DEFAULT_CLAWBENCH_ROOT = Path(".clawbench")
@ -80,6 +81,39 @@ def load_transcripts(path: Path) -> dict[str, Transcript]:
    return out


+def infer_registration_traces_from_manifests(
+    profile: PluginProfile,
+    manifests: dict[str, PluginManifest],
+) -> dict[str, RegistrationTrace]:
+    """Build best-effort registration traces from manifest-declared tools.
+
+    Full runtime registration traces are better because they include hooks,
+    gateway methods, routes, and services. This fallback still gives the
+    diagnostic layer exact manifest-declared tool names, which is enough to
+    attribute many transcript tool calls instead of dropping all utilization
+    into the unassigned bucket.
+
+    Args:
+        profile: Plugin profile whose ``plugins`` entries are traced, in order.
+        manifests: Loaded manifests keyed by plugin id.
+
+    Returns:
+        One trace per profile plugin that has a manifest; plugins without a
+        manifest are skipped. Each trace carries the declared tool names and
+        the sorted, de-duplicated tool families those names classify into.
+    """
+    traces: dict[str, RegistrationTrace] = {}
+    for entry in profile.plugins:
+        manifest = manifests.get(entry.id)
+        if manifest is None:
+            continue
+        # ``or []`` covers a ``tools`` key that is present but null, which
+        # ``.get("tools", [])`` would hand back as None. Blank names are
+        # dropped so the tool list and the families derived from it describe
+        # the same set of tools.
+        tools = [tool for tool in (manifest.contracts.get("tools") or []) if tool]
+        families = sorted(
+            {classify_tool_call(ToolCall(name=tool))[0] for tool in tools}
+        )
+        traces[entry.id] = RegistrationTrace(
+            plugin_id=entry.id,
+            tools=tools,
+            tool_families_seen=families,
+        )
+    return traces
+
+
 def write_submission_record(
    submissions_dir: Path, fingerprint_hash: str, report_dict: dict
 ) -> Path:
@ -162,6 +196,7 @@ def main() -> None:
    profile = PluginProfile.from_yaml_file(args.profile)
    plugin_ids = [e.id for e in profile.plugins]
    manifests = load_manifests(args.manifests, plugin_ids)
+    traces = infer_registration_traces_from_manifests(profile, manifests)
    db = HistoricalDatabase(path=args.db)

    actual_overall: float | None = None
@ -172,9 +207,16 @@ def main() -> None:
            sys.exit(2)
        results_data = json.loads(args.results.read_text(encoding="utf-8"))
        actual_overall = float(results_data.get("overall_score", 0.0))
-        actual_per_task = {
-            k: float(v) for k, v in results_data.get("per_task_score", {}).items()
-        }
+        if "per_task_score" in results_data:
+            actual_per_task = {
+                k: float(v) for k, v in results_data.get("per_task_score", {}).items()
+            }
+        else:
+            actual_per_task = {
+                str(item.get("task_id")): float(item.get("mean_task_score", 0.0))
+                for item in results_data.get("task_results", [])
+                if item.get("task_id")
+            }

    transcripts: dict[str, Transcript] | None = None
    if args.transcripts:
@ -208,6 +250,7 @@ def main() -> None:
            db=db,
            actual_overall_score=actual_overall,
            actual_per_task_scores=actual_per_task,
+            traces=traces,
            transcripts=transcripts,
            tier_of=tier_of,
        )
@ -223,6 +266,7 @@ def main() -> None:
            db=db,
            actual_overall_score=actual_overall,
            actual_per_task_scores=actual_per_task,
+            traces=traces,
            transcripts=transcripts,
            tier_of=tier_of,
        )
--- a/clawbench/harness.py
+++ b/clawbench/harness.py
@ -40,6 +40,9 @@ from clawbench.tasks import get_assets_dir, load_all_tasks
 logger = logging.getLogger(__name__)
 console = Console()

+KNOWN_ADAPTERS = ("openclaw", "hermes", "codex", "claude-code")
+EXECUTABLE_ADAPTERS = {"openclaw"}
+

 class _NullCtx:
    """A no-op async context manager used to skip the browser semaphore
@ -79,6 +82,7 @@ class BenchmarkHarness:
        quiet: bool = False,
        concurrency: int = 1,
        browser_concurrency: int = 1,
+        adapter: str = "openclaw",
    ) -> None:
        self.gateway_config = gateway_config
        self.model = model
@ -102,10 +106,21 @@ class BenchmarkHarness:
        self.quiet = quiet
        self.concurrency = max(1, int(concurrency))
        self.browser_concurrency = max(1, int(browser_concurrency))
+        self.adapter = adapter
        self.repo_root = Path(__file__).parent.parent
        self.last_task_runs: dict[str, list[TaskRunResult]] = {}

    async def run(self) -> BenchmarkResult:
+        if self.adapter not in KNOWN_ADAPTERS:
+            raise ValueError(
+                f"Unknown adapter '{self.adapter}'. Known adapters: {', '.join(KNOWN_ADAPTERS)}"
+            )
+        if self.adapter not in EXECUTABLE_ADAPTERS:
+            raise ValueError(
+                f"Adapter '{self.adapter}' is registered as a target but is not yet wired "
+                "into the end-to-end scoring harness. Use 'openclaw' for executable runs."
+            )
+
        tasks = load_all_tasks(
            tasks_dir=self.tasks_dir,
            tier=self.tier,
@ -129,6 +144,7 @@ class BenchmarkHarness:
        if not self.quiet:
            console.print(f"\n[bold]ClawBench v{__version__}[/bold] — {len(tasks)} tasks x {self.runs_per_task} runs")
            console.print(f"Model: [cyan]{self.model}[/cyan]")
+            console.print(f"Adapter: [cyan]{self.adapter}[/cyan]")
            if self.judge_model:
                console.print(f"Advisory judge: [magenta]{self.judge_model}[/magenta]")
            mode = "serial" if self.concurrency == 1 else f"parallel(concurrency={self.concurrency}, browser={self.browser_concurrency})"
@ -726,6 +742,9 @@ class BenchmarkHarness:
                "artifact_type": self.artifact_type or "all",
                "prompt_variant": self.prompt_variant,
                "judge_model": self.judge_model,
+                "adapter": self.adapter,
+                "known_adapters": list(KNOWN_ADAPTERS),
+                "executable_adapters": sorted(EXECUTABLE_ADAPTERS),
                "subsets": self.subsets,
                "capabilities": self.capabilities,
                "official_only": self.official_only,
--- a/tasks-domain/MANIFEST.yaml
+++ b/tasks-domain/MANIFEST.yaml
@ -0,0 +1,163 @@
+manifest_version: 1
+release: clawbench-domain-v0
+status: scaffold
+purpose: |
+  Domain coverage scaffold for proving that model + general harness + plugins
+  covers the jobs served by most agent SaaS products. This is not the small
+  public Core v1 benchmark. It is the planned expansion corpus.
+
+relationship_to_core_v1: |
+  tasks-public/Core v1 is the public, signal-curated reproducibility set.
+  tasks-domain is the domain coverage and ablation suite. Core v1 can stay
+  small; domain coverage should grow through templates and private variants.
+
+domains:
+  - id: crm
+    label: CRM
+    representative_jobs:
+      - lead enrichment
+      - account update from meeting notes
+      - opportunity risk summary
+      - duplicate contact cleanup
+      - follow-up task creation
+    plugin_requirements: [browser, crm_api, docs, search, memory]
+    verifier_contracts: [api_state, structured_artifact, cited_evidence]
+
+  - id: support
+    label: Support
+    representative_jobs:
+      - ticket triage
+      - macro draft with policy evidence
+      - escalation routing
+      - refund eligibility lookup
+      - customer timeline summary
+    plugin_requirements: [browser, support_api, knowledge_base, email]
+    verifier_contracts: [api_state, policy_match, cited_evidence]
+
+  - id: email_calendar
+    label: Email and calendar
+    representative_jobs:
+      - thread summarization
+      - meeting scheduling
+      - follow-up drafting
+      - conflict detection
+      - contact-aware prioritization
+    plugin_requirements: [email, calendar, contacts, memory]
+    verifier_contracts: [calendar_state, draft_content, no_duplicate_state]
+
+  - id: docs_sheets_slides
+    label: Docs, sheets, slides
+    representative_jobs:
+      - spreadsheet cleanup
+      - deck update
+      - document redaction
+      - chart generation
+      - report formatting
+    plugin_requirements: [filesystem, spreadsheet, document, slides, charting]
+    verifier_contracts: [file_structure, rendered_diff, formula_check]
+
+  - id: project_management
+    label: Project management
+    representative_jobs:
+      - issue grooming
+      - sprint status update
+      - dependency tracking
+      - stale task cleanup
+      - launch checklist synthesis
+    plugin_requirements: [pm_api, repo, docs, notifications]
+    verifier_contracts: [api_state, link_integrity, dependency_state]
+
+  - id: finance_ops
+    label: Finance ops
+    representative_jobs:
+      - invoice reconciliation
+      - expense categorization
+      - budget variance report
+      - payment exception triage
+      - tax document checklist
+    plugin_requirements: [spreadsheet, accounting_api, document, ocr]
+    verifier_contracts: [numeric_tolerance, ledger_delta, audit_trail]
+
+  - id: data_analytics
+    label: Data analytics
+    representative_jobs:
+      - SQL answer
+      - dashboard explanation
+      - ETL patch
+      - anomaly investigation
+      - chart specification
+    plugin_requirements: [database, notebook, filesystem, bi_api]
+    verifier_contracts: [query_result, execution_check, chart_spec]
+
+  - id: security_admin
+    label: Security admin
+    representative_jobs:
+      - access review
+      - incident timeline
+      - secret rotation plan
+      - policy exception review
+      - audit log evidence packet
+    plugin_requirements: [identity_api, logs, repo, policy_docs]
+    verifier_contracts: [policy_state, cited_logs, refusal_gate]
+
+  - id: ecommerce_ops
+    label: Ecommerce ops
+    representative_jobs:
+      - catalog update
+      - order exception handling
+      - promo QA
+      - inventory reconciliation
+      - returns policy response
+    plugin_requirements: [storefront_api, spreadsheet, browser, email]
+    verifier_contracts: [api_state, price_check, order_state]
+
+  - id: devtools
+    label: Devtools
+    representative_jobs:
+      - repo migration
+      - CI failure repair
+      - release note generation
+      - dependency update
+      - multi-repo contract change
+    plugin_requirements: [shell, git, filesystem, package_registry]
+    verifier_contracts: [test_pass, diff_assertion, changelog_check]
+
+  - id: research
+    label: Research
+    representative_jobs:
+      - evidence memo
+      - citation synthesis
+      - source contradiction handling
+      - market scan
+      - literature extraction
+    plugin_requirements: [browser, web_search, web_fetch, document]
+    verifier_contracts: [citation_check, no_fabrication, source_coverage]
+
+  - id: personal_ops
+    label: Personal ops
+    representative_jobs:
+      - travel planning
+      - household planning
+      - health admin summary
+      - personal finance checklist
+      - recurring reminder setup
+    plugin_requirements: [calendar, browser, memory, document]
+    verifier_contracts: [constraint_satisfaction, state_transition, refusal_gate]
+
+release_targets:
+  domain_count: 12
+  templates_per_domain: 5
+  private_variants_per_template: 3
+  runs_per_configuration: 3
+  public_templates_total: 60
+  private_variants_total: 180
+
+ablation_classes:
+  - id: model_only
+    description: Model with minimal shell/filesystem access.
+  - id: model_plus_harness
+    description: Model plus general OpenClaw-style harness, no domain plugins.
+  - id: core_plugins
+    description: Harness plus common browser, memory, filesystem, and execution plugins.
+  - id: domain_plugins
+    description: Harness plus the plugins needed for each domain state surface.
--- a/tasks-domain/README.md
+++ b/tasks-domain/README.md
@ -0,0 +1,59 @@
+# ClawBench Domain Suite
+
+`tasks-public/` is the small public Core v1 set. `tasks-domain/` is the
+coverage scaffold for the larger proof corpus: the domains served by most
+agent SaaS products, expressed as deterministically verifiable benchmark tasks.
+
+The claim this suite is meant to support is:
+
+> A capable model plus a general agent harness plus the right plugins can
+> cover the task domains that most agent SaaS products sell.
+
+This is intentionally not a clone of vendor products. It is a taxonomy of
+jobs, state transitions, and verifier contracts.
+
+## Domains
+
+| Domain | Representative jobs | Required plugin surface | Verification style |
+|---|---|---|---|
+| CRM | lead enrichment, account updates, meeting notes to opportunities | browser, CRM API, docs, search, memory | API state assertions, fixture diffs |
+| Support | ticket triage, macro draft, escalation, refund lookup | browser/API, knowledge base, email | ticket state, cited evidence, policy checks |
+| Email and calendar | thread summarization, scheduling, follow-ups | mail, calendar, contacts, memory | event state, draft content, no-duplicate checks |
+| Docs, sheets, slides | spreadsheet cleanup, deck edits, document redaction | file, office docs, charting | structural file assertions, rendered diffs |
+| Project management | issue grooming, sprint updates, dependency tracking | PM API, repo, docs, notifications | issue state, links, blocked/unblocked status |
+| Finance ops | invoice reconciliation, expense coding, budget variance | spreadsheets, accounting API, documents, OCR | ledger deltas, numeric tolerances, audit trail |
+| Data analytics | SQL, dashboard explanation, ETL patch, anomaly report | database, notebooks, filesystem, BI API | query results, chart spec, report content |
+| Security admin | access review, incident timeline, secret rotation plan | identity, logs, repo, policy docs | policy state, log-derived evidence, refusal gates |
+| Ecommerce ops | catalog updates, order exception handling, promo QA | storefront API, spreadsheet, browser, email | product state, order workflow, price checks |
+| Devtools | repo migration, CI fix, release note, dependency update | shell, git, code, package registry | test pass, diff assertions, changelog checks |
+| Research | web evidence, citation synthesis, source contradiction | browser, web search, web fetch, docs | citation verifier, no-fabrication checks |
+| Personal ops | travel, household planning, health/wellness admin | calendar, browser, memory, docs | constraint satisfaction, state updates |
+
+## Proof Standard
+
+Each domain task should declare:
+
+- `domain`: one of the domains above
+- `job`: the user-facing job being covered
+- `saas_equivalents`: examples of products whose core workflow overlaps
+- `plugin_requirements`: tool families and state surfaces needed
+- `deterministic_floor`: the verifier that must pass before any judge score
+- `holdout_variant_policy`: how private variants are generated
+- `ablation_axis`: which plugins or harness capabilities the task tests
+
+## Minimum Bar
+
+For a credible first domain release:
+
+- 12 domains
+- 5 task templates per domain
+- 3 private variants per template
+- 3 runs per configuration
+- at least 4 configuration classes:
+  - model only
+  - model plus harness
+  - model plus harness plus core plugins
+  - model plus harness plus domain plugins
+
+That yields 60 public templates and 180 private variants before repetitions.
+The public templates explain coverage; the private variants carry the proof.
--- a/tasks-public/MANIFEST.yaml
+++ b/tasks-public/MANIFEST.yaml
@ -43,12 +43,12 @@ selection_basis:
 coverage:
  tiers:
    tier1: 2
-    tier2: 7
+    tier2: 6
    tier3: 5
-    tier4: 4
+    tier4: 5
    tier5: 1
  families:
-    tools: 7
+    tools: 8
    coding: 2
    repo: 3
    browser: 2
--- a/tasks-public/README.md
+++ b/tasks-public/README.md
@ -33,9 +33,9 @@ against your own configuration.

 | Dimension | Breakdown |
 |---|---|
-| Tiers | T1=2, T2=7, T3=5, T4=4, T5=1 |
-| Families | tools=7, coding=2, repo=3, browser=2, multi_tool=3, adversarial=1 |
-| Capabilities | bugfix, refactor, test_authoring, multifile_reasoning, browser_debugging, structured_output, graceful_refusal, delegation, tool_composition, research_synthesis, cross_repo_change, memory_continuation |
+| Tiers | T1=2, T2=6, T3=5, T4=5, T5=1 |
+| Families | tools=8, coding=2, repo=3, browser=2, multi_tool=3, adversarial=1 |
+| Capabilities | bugfix, test_authoring, multifile_reasoning, browser_debugging, structured_output, graceful_refusal, delegation, tool_composition, research_synthesis, cross_repo_change, memory_continuation |

 ## Directory layout

@ -44,9 +44,9 @@ tasks-public/
 ├── MANIFEST.yaml          # Machine-readable task list + metadata
 ├── README.md              # This file
 ├── tier1/                 # 2 task YAMLs
-├── tier2/                 # 7 task YAMLs
+├── tier2/                 # 6 task YAMLs
 ├── tier3/                 # 5 task YAMLs
-├── tier4/                 # 4 task YAMLs
+├── tier4/                 # 5 task YAMLs
 ├── tier5/                 # 1 task YAML
 └── assets/                # 19 asset packs (verifier scripts + fixtures)
 ```
--- a/tests/test_harness.py
+++ b/tests/test_harness.py
@ -163,3 +163,55 @@ def test_compose_result_from_task_stats_supports_parallel_environment_metadata()
    assert merged_result.environment["parallel_lanes"] == 2
    assert merged_result.environment["requested_parallel_lanes"] == 3
    assert merged_result.environment["browser_tasks_serialized"] is False
+
+
+@pytest.mark.asyncio
+async def test_run_records_adapter_surface(monkeypatch):
+    """A run on the ``openclaw`` adapter reports the adapter in its environment."""
+    discount_task = next(t for t in load_all_tasks() if t.id == "t1-bugfix-discount")
+
+    async def fake_run_single(self, current_task, run_index: int):
+        # Stand-in for a real task run: one assertion, passed, perfect score.
+        perfect = CompletionResult(total_assertions=1, passed_assertions=1, score=1.0)
+        return TaskRunResult(
+            task_id=current_task.id,
+            tier=current_task.tier.value,
+            family=current_task.family.value,
+            run_index=run_index,
+            run_score=1.0,
+            completion_result=perfect,
+        )
+
+    # Restrict the harness to the single task and bypass real execution.
+    monkeypatch.setattr("clawbench.harness.load_all_tasks", lambda **_: [discount_task])
+    monkeypatch.setattr(BenchmarkHarness, "_run_single", fake_run_single)
+
+    harness = BenchmarkHarness(
+        gateway_config=GatewayConfig(),
+        model="test-model",
+        adapter="openclaw",
+        runs_per_task=1,
+        randomize_order=False,
+        print_report=False,
+        quiet=True,
+    )
+
+    outcome = await harness.run()
+    environment = outcome.environment
+
+    assert environment["adapter"] == "openclaw"
+    assert "hermes" in environment["known_adapters"]
+
+
+@pytest.mark.asyncio
+async def test_run_rejects_registered_but_unwired_adapter(monkeypatch):
+    """``run()`` raises ValueError for the ``hermes`` adapter, which is not yet wired."""
+    discount_task = next(t for t in load_all_tasks() if t.id == "t1-bugfix-discount")
+    monkeypatch.setattr("clawbench.harness.load_all_tasks", lambda **_: [discount_task])
+
+    unwired_harness = BenchmarkHarness(
+        gateway_config=GatewayConfig(),
+        model="test-model",
+        adapter="hermes",
+        runs_per_task=1,
+        randomize_order=False,
+        print_report=False,
+        quiet=True,
+    )
+
+    # The error message must say the adapter is "not yet wired".
+    with pytest.raises(ValueError, match="not yet wired"):
+        await unwired_harness.run()
--- a/tests/test_v05_extensions.py
+++ b/tests/test_v05_extensions.py
@ -23,6 +23,7 @@ from pathlib import Path
 sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

 from clawbench.diagnostic import build_diagnostic, submit_run
+from clawbench.diagnose_cli import infer_registration_traces_from_manifests
 from clawbench.factor_analysis import analyze
 from clawbench.insights import (
    compute_capability_gaps,
@ -139,6 +140,22 @@ def test_taguchi_sn_handles_zero_score_without_crashing():
 # ---------------------------------------------------------------------------


+def test_infer_registration_traces_from_manifests_uses_declared_tools():
+    """Traces mirror manifest-declared tools; plugins without a manifest are skipped."""
+    profile = _make_profile("p", ["alpha", "missing"])
+    alpha_manifest = _make_manifest(
+        "alpha",
+        tools=["read_file", "browser_click", "memory_write"],
+    )
+
+    inferred = infer_registration_traces_from_manifests(
+        profile, {"alpha": alpha_manifest}
+    )
+
+    # "missing" has no manifest, so only "alpha" gets a trace.
+    assert set(inferred) == {"alpha"}
+    alpha_trace = inferred["alpha"]
+    assert alpha_trace.tools == ["read_file", "browser_click", "memory_write"]
+    assert alpha_trace.tool_families_seen == ["browser", "memory", "read"]
+
+
 def test_audit_flags_dead_weight_plugin():
    profile = _make_profile("p", ["alpha", "beta"])
    manifests = {