From 595cdc910cf3288e78e2bebe1d409ec0ca1c8ea0 Mon Sep 17 00:00:00 2001 From: scoootscooob Date: Thu, 23 Apr 2026 12:40:23 -0700 Subject: [PATCH] Add public domain scaffold and adapter diagnostics --- README.md | 4 +- clawbench/cli.py | 35 +++++++- clawbench/diagnose_cli.py | 52 ++++++++++- clawbench/harness.py | 19 ++++ tasks-domain/MANIFEST.yaml | 163 +++++++++++++++++++++++++++++++++++ tasks-domain/README.md | 59 +++++++++++++ tasks-public/MANIFEST.yaml | 6 +- tasks-public/README.md | 10 +-- tests/test_harness.py | 52 +++++++++++ tests/test_v05_extensions.py | 17 ++++ 10 files changed, 402 insertions(+), 15 deletions(-) create mode 100644 tasks-domain/MANIFEST.yaml create mode 100644 tasks-domain/README.md diff --git a/README.md b/README.md index 17c6598..44c377f 100644 --- a/README.md +++ b/README.md @@ -245,9 +245,9 @@ Core v1 is a signal-curated public release of 19 tasks from the internal 40-task | Tier | Core v1 count | What it tests | Examples | |------|:---:|---|---| | **Tier 1** | 2 | Single-tool basics | Bugfix discount calc, quick file note | -| **Tier 2** | 7 | Multi-step, 2-3 tools | Config loader repair, browser form fix, priv redaction | +| **Tier 2** | 6 | Multi-step, 2-3 tools | Config loader repair, browser form fix, priv redaction | | **Tier 3** | 5 | Complex orchestration | SQL query analysis, inbox triage, data pipeline report | -| **Tier 4** | 4 | Cross-system reasoning | Cross-repo migration, delegation repair, browser research+code | +| **Tier 4** | 5 | Cross-system reasoning | Cross-repo migration, delegation repair, memory continuation, browser research+code | | **Tier 5** | 1 | Adversarial | Hallucination-resistant evidence | Full manifest: [`tasks-public/MANIFEST.yaml`](tasks-public/MANIFEST.yaml). diff --git a/clawbench/cli.py b/clawbench/cli.py index 523e42d..616e963 100644 --- a/clawbench/cli.py +++ b/clawbench/cli.py @@ -10,7 +10,7 @@ from pathlib import Path import click from clawbench.client import GatewayConfig -from clawbench.harness import BenchmarkHarness +from clawbench.harness import BenchmarkHarness, KNOWN_ADAPTERS SCENARIO_CHOICES = [ "file_system_ops", @@ -41,6 +41,13 @@ def cli(verbose: bool) -> None: @cli.command() @click.option("--model", "-m", required=True, help="Model to benchmark") +@click.option( + "--adapter", + type=click.Choice(KNOWN_ADAPTERS), + default="openclaw", + show_default=True, + help="Agent harness adapter. OpenClaw is executable today; other adapters are tracked targets.", +) @click.option("--gateway-token", envvar="OPENCLAW_GATEWAY_TOKEN", default="", help="Gateway auth token") @click.option( "--judge-model", @@ -123,6 +130,7 @@ def cli(verbose: bool) -> None: ) def run( model: str, + adapter: str, gateway_token: str, judge_model: str, runs: int, @@ -148,6 +156,7 @@ def run( harness = BenchmarkHarness( gateway_config=gateway_config, model=model, + adapter=adapter, judge_model=judge_model, runs_per_task=runs, tier=tier, @@ -178,6 +187,7 @@ def run( _run_v05_diagnostic( profile_path=profile, result=result, + task_runs=harness.last_task_runs, runs_per_task=runs, insights_dir=insights_dir, ) @@ -269,6 +279,7 @@ def _run_v05_diagnostic( *, profile_path: Path, result, + task_runs: dict[str, list] | None, runs_per_task: int, insights_dir: Path, ) -> None: @@ -278,6 +289,7 @@ def _run_v05_diagnostic( DEFAULT_MANIFEST_DIR, DEFAULT_SUBMISSIONS_DIR, ensure_data_dirs, + infer_registration_traces_from_manifests, load_manifests, write_submission_record, ) @@ -291,6 +303,7 @@ def _run_v05_diagnostic( plugin_profile = PluginProfile.from_yaml_file(profile_path) plugin_ids = [e.id for e in plugin_profile.plugins] manifests = load_manifests(DEFAULT_MANIFEST_DIR, plugin_ids) + traces = infer_registration_traces_from_manifests(plugin_profile, manifests) db = HistoricalDatabase(path=DEFAULT_DB_PATH) # Extract per-task scores + tier map from the BenchmarkResult @@ -301,12 +314,16 @@ def _run_v05_diagnostic( if getattr(task_stats, "tier", ""): tier_of[task_stats.task_id] = task_stats.tier + transcripts = _merge_task_transcripts_from_runs(task_runs or {}) + diagnostic = submit_run( profile=plugin_profile, manifests=manifests, db=db, actual_overall_score=float(result.overall_score), actual_per_task_scores=actual_per_task, + traces=traces, + transcripts=transcripts, tier_of=tier_of or None, n_runs_contributing=runs_per_task, ) @@ -329,6 +346,22 @@ def _run_v05_diagnostic( ) +def _merge_task_transcripts_from_runs(task_runs: dict[str, list]): + """Merge all run transcripts per task for the v0.5 utilization audit.""" + if not task_runs: + return None + from clawbench.schemas import Transcript + + merged: dict[str, Transcript] = {} + for task_id, runs in task_runs.items(): + transcript = Transcript() + for run in runs: + transcript.messages.extend(getattr(run.transcript, "messages", [])) + if transcript.messages: + merged[task_id] = transcript + return merged or None + + @cli.command() @click.argument("profile", type=click.Path(exists=True, path_type=Path)) @click.option( diff --git a/clawbench/diagnose_cli.py b/clawbench/diagnose_cli.py index b13f6ce..548233a 100644 --- a/clawbench/diagnose_cli.py +++ b/clawbench/diagnose_cli.py @@ -37,7 +37,8 @@ from clawbench.diagnostic import build_diagnostic, submit_run from clawbench.insights import publish_insights from clawbench.prediction import HistoricalDatabase from clawbench.profile import PluginManifest, PluginProfile, RegistrationTrace -from clawbench.schemas import Transcript +from clawbench.schemas import ToolCall, Transcript +from clawbench.trajectory import classify_tool_call DEFAULT_CLAWBENCH_ROOT = Path(".clawbench") @@ -80,6 +81,39 @@ def load_transcripts(path: Path) -> dict[str, Transcript]: return out +def infer_registration_traces_from_manifests( + profile: PluginProfile, + manifests: dict[str, PluginManifest], +) -> dict[str, RegistrationTrace]: + """Build best-effort registration traces from manifest-declared tools. + + Full runtime registration traces are better because they include hooks, + gateway methods, routes, and services. This fallback still gives the + diagnostic layer exact manifest-declared tool names, which is enough to + attribute many transcript tool calls instead of dropping all utilization + into the unassigned bucket. + """ + traces: dict[str, RegistrationTrace] = {} + for entry in profile.plugins: + manifest = manifests.get(entry.id) + if manifest is None: + continue + tools = list(manifest.contracts.get("tools", [])) + families = sorted( + { + classify_tool_call(ToolCall(name=tool))[0] + for tool in tools + if tool + } + ) + traces[entry.id] = RegistrationTrace( + plugin_id=entry.id, + tools=tools, + tool_families_seen=families, + ) + return traces + + def write_submission_record( submissions_dir: Path, fingerprint_hash: str, report_dict: dict ) -> Path: @@ -162,6 +196,7 @@ def main() -> None: profile = PluginProfile.from_yaml_file(args.profile) plugin_ids = [e.id for e in profile.plugins] manifests = load_manifests(args.manifests, plugin_ids) + traces = infer_registration_traces_from_manifests(profile, manifests) db = HistoricalDatabase(path=args.db) actual_overall: float | None = None @@ -172,9 +207,16 @@ def main() -> None: sys.exit(2) results_data = json.loads(args.results.read_text(encoding="utf-8")) actual_overall = float(results_data.get("overall_score", 0.0)) - actual_per_task = { - k: float(v) for k, v in results_data.get("per_task_score", {}).items() - } + if "per_task_score" in results_data: + actual_per_task = { + k: float(v) for k, v in results_data.get("per_task_score", {}).items() + } + else: + actual_per_task = { + str(item.get("task_id")): float(item.get("mean_task_score", 0.0)) + for item in results_data.get("task_results", []) + if item.get("task_id") + } transcripts: dict[str, Transcript] | None = None if args.transcripts: @@ -208,6 +250,7 @@ def main() -> None: db=db, actual_overall_score=actual_overall, actual_per_task_scores=actual_per_task, + traces=traces, transcripts=transcripts, tier_of=tier_of, ) @@ -223,6 +266,7 @@ def main() -> None: db=db, actual_overall_score=actual_overall, actual_per_task_scores=actual_per_task, + traces=traces, transcripts=transcripts, tier_of=tier_of, ) diff --git a/clawbench/harness.py b/clawbench/harness.py index b955aa6..cb416df 100644 --- a/clawbench/harness.py +++ b/clawbench/harness.py @@ -40,6 +40,9 @@ from clawbench.tasks import get_assets_dir, load_all_tasks logger = logging.getLogger(__name__) console = Console() +KNOWN_ADAPTERS = ("openclaw", "hermes", "codex", "claude-code") +EXECUTABLE_ADAPTERS = {"openclaw"} + class _NullCtx: """A no-op async context manager used to skip the browser semaphore @@ -79,6 +82,7 @@ class BenchmarkHarness: quiet: bool = False, concurrency: int = 1, browser_concurrency: int = 1, + adapter: str = "openclaw", ) -> None: self.gateway_config = gateway_config self.model = model @@ -102,10 +106,21 @@ class BenchmarkHarness: self.quiet = quiet self.concurrency = max(1, int(concurrency)) self.browser_concurrency = max(1, int(browser_concurrency)) + self.adapter = adapter self.repo_root = Path(__file__).parent.parent self.last_task_runs: dict[str, list[TaskRunResult]] = {} async def run(self) -> BenchmarkResult: + if self.adapter not in KNOWN_ADAPTERS: + raise ValueError( + f"Unknown adapter '{self.adapter}'. Known adapters: {', '.join(KNOWN_ADAPTERS)}" + ) + if self.adapter not in EXECUTABLE_ADAPTERS: + raise ValueError( + f"Adapter '{self.adapter}' is registered as a target but is not yet wired " + "into the end-to-end scoring harness. Use 'openclaw' for executable runs." + ) + tasks = load_all_tasks( tasks_dir=self.tasks_dir, tier=self.tier, @@ -129,6 +144,7 @@ class BenchmarkHarness: if not self.quiet: console.print(f"\n[bold]ClawBench v{__version__}[/bold] — {len(tasks)} tasks x {self.runs_per_task} runs") console.print(f"Model: [cyan]{self.model}[/cyan]") + console.print(f"Adapter: [cyan]{self.adapter}[/cyan]") if self.judge_model: console.print(f"Advisory judge: [magenta]{self.judge_model}[/magenta]") mode = "serial" if self.concurrency == 1 else f"parallel(concurrency={self.concurrency}, browser={self.browser_concurrency})" @@ -726,6 +742,9 @@ class BenchmarkHarness: "artifact_type": self.artifact_type or "all", "prompt_variant": self.prompt_variant, "judge_model": self.judge_model, + "adapter": self.adapter, + "known_adapters": list(KNOWN_ADAPTERS), + "executable_adapters": sorted(EXECUTABLE_ADAPTERS), "subsets": self.subsets, "capabilities": self.capabilities, "official_only": self.official_only, diff --git a/tasks-domain/MANIFEST.yaml b/tasks-domain/MANIFEST.yaml new file mode 100644 index 0000000..bcd1499 --- /dev/null +++ b/tasks-domain/MANIFEST.yaml @@ -0,0 +1,163 @@ +manifest_version: 1 +release: clawbench-domain-v0 +status: scaffold +purpose: | + Domain coverage scaffold for proving that model + general harness + plugins + covers the jobs served by most agent SaaS products. This is not the small + public Core v1 benchmark. It is the planned expansion corpus. + +relationship_to_core_v1: | + tasks-public/Core v1 is the public, signal-curated reproducibility set. + tasks-domain is the domain coverage and ablation suite. Core v1 can stay + small; domain coverage should grow through templates and private variants. + +domains: + - id: crm + label: CRM + representative_jobs: + - lead enrichment + - account update from meeting notes + - opportunity risk summary + - duplicate contact cleanup + - follow-up task creation + plugin_requirements: [browser, crm_api, docs, search, memory] + verifier_contracts: [api_state, structured_artifact, cited_evidence] + + - id: support + label: Support + representative_jobs: + - ticket triage + - macro draft with policy evidence + - escalation routing + - refund eligibility lookup + - customer timeline summary + plugin_requirements: [browser, support_api, knowledge_base, email] + verifier_contracts: [api_state, policy_match, cited_evidence] + + - id: email_calendar + label: Email and calendar + representative_jobs: + - thread summarization + - meeting scheduling + - follow-up drafting + - conflict detection + - contact-aware prioritization + plugin_requirements: [email, calendar, contacts, memory] + verifier_contracts: [calendar_state, draft_content, no_duplicate_state] + + - id: docs_sheets_slides + label: Docs, sheets, slides + representative_jobs: + - spreadsheet cleanup + - deck update + - document redaction + - chart generation + - report formatting + plugin_requirements: [filesystem, spreadsheet, document, slides, charting] + verifier_contracts: [file_structure, rendered_diff, formula_check] + + - id: project_management + label: Project management + representative_jobs: + - issue grooming + - sprint status update + - dependency tracking + - stale task cleanup + - launch checklist synthesis + plugin_requirements: [pm_api, repo, docs, notifications] + verifier_contracts: [api_state, link_integrity, dependency_state] + + - id: finance_ops + label: Finance ops + representative_jobs: + - invoice reconciliation + - expense categorization + - budget variance report + - payment exception triage + - tax document checklist + plugin_requirements: [spreadsheet, accounting_api, document, ocr] + verifier_contracts: [numeric_tolerance, ledger_delta, audit_trail] + + - id: data_analytics + label: Data analytics + representative_jobs: + - SQL answer + - dashboard explanation + - ETL patch + - anomaly investigation + - chart specification + plugin_requirements: [database, notebook, filesystem, bi_api] + verifier_contracts: [query_result, execution_check, chart_spec] + + - id: security_admin + label: Security admin + representative_jobs: + - access review + - incident timeline + - secret rotation plan + - policy exception review + - audit log evidence packet + plugin_requirements: [identity_api, logs, repo, policy_docs] + verifier_contracts: [policy_state, cited_logs, refusal_gate] + + - id: ecommerce_ops + label: Ecommerce ops + representative_jobs: + - catalog update + - order exception handling + - promo QA + - inventory reconciliation + - returns policy response + plugin_requirements: [storefront_api, spreadsheet, browser, email] + verifier_contracts: [api_state, price_check, order_state] + + - id: devtools + label: Devtools + representative_jobs: + - repo migration + - CI failure repair + - release note generation + - dependency update + - multi-repo contract change + plugin_requirements: [shell, git, filesystem, package_registry] + verifier_contracts: [test_pass, diff_assertion, changelog_check] + + - id: research + label: Research + representative_jobs: + - evidence memo + - citation synthesis + - source contradiction handling + - market scan + - literature extraction + plugin_requirements: [browser, web_search, web_fetch, document] + verifier_contracts: [citation_check, no_fabrication, source_coverage] + + - id: personal_ops + label: Personal ops + representative_jobs: + - travel planning + - household planning + - health admin summary + - personal finance checklist + - recurring reminder setup + plugin_requirements: [calendar, browser, memory, document] + verifier_contracts: [constraint_satisfaction, state_transition, refusal_gate] + +release_targets: + domain_count: 12 + templates_per_domain: 5 + private_variants_per_template: 3 + runs_per_configuration: 3 + public_templates_total: 60 + private_variants_total: 180 + +ablation_classes: + - id: model_only + description: Model with minimal shell/filesystem access. + - id: model_plus_harness + description: Model plus general OpenClaw-style harness, no domain plugins. + - id: core_plugins + description: Harness plus common browser, memory, filesystem, and execution plugins. + - id: domain_plugins + description: Harness plus the plugins needed for each domain state surface. diff --git a/tasks-domain/README.md b/tasks-domain/README.md new file mode 100644 index 0000000..3112a98 --- /dev/null +++ b/tasks-domain/README.md @@ -0,0 +1,59 @@ +# ClawBench Domain Suite + +`tasks-public/` is the small public Core v1 set. `tasks-domain/` is the +coverage scaffold for the larger proof corpus: the domains served by most +agent SaaS products, expressed as deterministic benchmark work. + +The claim this suite is meant to support is: + +> A capable model plus a general agent harness plus the right plugins can +> cover the task domains that most agent SaaS products sell. + +This is intentionally not a clone of vendor products. It is a taxonomy of +jobs, state transitions, and verifier contracts. + +## Domains + +| Domain | Representative jobs | Required plugin surface | Verification style | +|---|---|---|---| +| CRM | lead enrichment, account updates, meeting notes to opportunities | browser, CRM API, docs, search | API state assertions, fixture diffs | +| Support | ticket triage, macro draft, escalation, refund lookup | browser/API, knowledge base, email | ticket state, cited evidence, policy checks | +| Email and calendar | thread summarization, scheduling, follow-ups | mail, calendar, contacts, memory | event state, draft content, no-duplicate checks | +| Docs, sheets, slides | spreadsheet cleanup, deck edits, document redaction | file, office docs, charting | structural file assertions, rendered diffs | +| Project management | issue grooming, sprint updates, dependency tracking | PM API, repo, docs, notifications | issue state, links, blocked/unblocked status | +| Finance ops | invoice reconciliation, expense coding, budget variance | spreadsheets, accounting API, OCR | ledger deltas, numeric tolerances, audit trail | +| Data analytics | SQL, dashboard explanation, ETL patch, anomaly report | database, notebooks, BI API | query results, chart spec, report content | +| Security admin | access review, incident timeline, secret rotation plan | identity, logs, repo, policy docs | policy state, log-derived evidence, refusal gates | +| Ecommerce ops | catalog updates, order exception handling, promo QA | storefront API, spreadsheet, browser | product state, order workflow, price checks | +| Devtools | repo migration, CI fix, release note, dependency update | shell, git, code, package registry | test pass, diff assertions, changelog checks | +| Research | web evidence, citation synthesis, source contradiction | browser, web search, docs | citation verifier, no-fabrication checks | +| Personal ops | travel, household planning, health/wellness admin | calendar, browser, memory, docs | constraint satisfaction, state updates | + +## Proof Standard + +Each domain task should declare: + +- `domain`: one of the domains above +- `job`: the user-facing job being covered +- `saas_equivalents`: examples of products whose core workflow overlaps +- `plugin_requirements`: tool families and state surfaces needed +- `deterministic_floor`: the verifier that must pass before any judge score +- `holdout_variant_policy`: how private variants are generated +- `ablation_axis`: which plugins or harness capabilities the task tests + +## Minimum Bar + +For a credible first domain release: + +- 12 domains +- 5 task templates per domain +- 3 private variants per template +- 3 runs per configuration +- at least 4 configuration classes: + - model only + - model plus harness + - model plus harness plus core plugins + - model plus harness plus domain plugins + +That yields 60 public templates and 180 private variants before repetitions. +The public templates explain coverage; the private variants carry the proof. diff --git a/tasks-public/MANIFEST.yaml b/tasks-public/MANIFEST.yaml index 72569cf..eadedfc 100644 --- a/tasks-public/MANIFEST.yaml +++ b/tasks-public/MANIFEST.yaml @@ -43,12 +43,12 @@ selection_basis: coverage: tiers: tier1: 2 - tier2: 7 + tier2: 6 tier3: 5 - tier4: 4 + tier4: 5 tier5: 1 families: - tools: 7 + tools: 8 coding: 2 repo: 3 browser: 2 diff --git a/tasks-public/README.md b/tasks-public/README.md index 80ddd3b..ffa35d8 100644 --- a/tasks-public/README.md +++ b/tasks-public/README.md @@ -33,9 +33,9 @@ against your own configuration. | Dimension | Breakdown | |---|---| -| Tiers | T1=2, T2=7, T3=5, T4=4, T5=1 | -| Families | tools=7, coding=2, repo=3, browser=2, multi_tool=3, adversarial=1 | -| Capabilities | bugfix, refactor, test_authoring, multifile_reasoning, browser_debugging, structured_output, graceful_refusal, delegation, tool_composition, research_synthesis, cross_repo_change, memory_continuation | +| Tiers | T1=2, T2=6, T3=5, T4=5, T5=1 | +| Families | tools=8, coding=2, repo=3, browser=2, multi_tool=3, adversarial=1 | +| Capabilities | bugfix, test_authoring, multifile_reasoning, browser_debugging, structured_output, graceful_refusal, delegation, tool_composition, research_synthesis, cross_repo_change, memory_continuation | ## Directory layout @@ -44,9 +44,9 @@ tasks-public/ ├── MANIFEST.yaml # Machine-readable task list + metadata ├── README.md # This file ├── tier1/ # 2 task YAMLs -├── tier2/ # 7 task YAMLs +├── tier2/ # 6 task YAMLs ├── tier3/ # 5 task YAMLs -├── tier4/ # 4 task YAMLs +├── tier4/ # 5 task YAMLs ├── tier5/ # 1 task YAML └── assets/ # 19 asset packs (verifier scripts + fixtures) ``` diff --git a/tests/test_harness.py b/tests/test_harness.py index 93ab66b..85227a4 100644 --- a/tests/test_harness.py +++ b/tests/test_harness.py @@ -163,3 +163,55 @@ def test_compose_result_from_task_stats_supports_parallel_environment_metadata() assert merged_result.environment["parallel_lanes"] == 2 assert merged_result.environment["requested_parallel_lanes"] == 3 assert merged_result.environment["browser_tasks_serialized"] is False + + +@pytest.mark.asyncio +async def test_run_records_adapter_surface(monkeypatch): + task = next(task for task in load_all_tasks() if task.id == "t1-bugfix-discount") + + async def fake_run_single(self, current_task, run_index: int): + return TaskRunResult( + task_id=current_task.id, + tier=current_task.tier.value, + family=current_task.family.value, + run_index=run_index, + run_score=1.0, + completion_result=CompletionResult(total_assertions=1, passed_assertions=1, score=1.0), + ) + + monkeypatch.setattr("clawbench.harness.load_all_tasks", lambda **_: [task]) + monkeypatch.setattr(BenchmarkHarness, "_run_single", fake_run_single) + + harness = BenchmarkHarness( + gateway_config=GatewayConfig(), + model="test-model", + adapter="openclaw", + runs_per_task=1, + randomize_order=False, + print_report=False, + quiet=True, + ) + + result = await harness.run() + + assert result.environment["adapter"] == "openclaw" + assert "hermes" in result.environment["known_adapters"] + + +@pytest.mark.asyncio +async def test_run_rejects_registered_but_unwired_adapter(monkeypatch): + task = next(task for task in load_all_tasks() if task.id == "t1-bugfix-discount") + monkeypatch.setattr("clawbench.harness.load_all_tasks", lambda **_: [task]) + + harness = BenchmarkHarness( + gateway_config=GatewayConfig(), + model="test-model", + adapter="hermes", + runs_per_task=1, + randomize_order=False, + print_report=False, + quiet=True, + ) + + with pytest.raises(ValueError, match="not yet wired"): + await harness.run() diff --git a/tests/test_v05_extensions.py b/tests/test_v05_extensions.py index 1b9a290..e3cff45 100644 --- a/tests/test_v05_extensions.py +++ b/tests/test_v05_extensions.py @@ -23,6 +23,7 @@ from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parents[1])) from clawbench.diagnostic import build_diagnostic, submit_run +from clawbench.diagnose_cli import infer_registration_traces_from_manifests from clawbench.factor_analysis import analyze from clawbench.insights import ( compute_capability_gaps, @@ -139,6 +140,22 @@ def test_taguchi_sn_handles_zero_score_without_crashing(): # --------------------------------------------------------------------------- +def test_infer_registration_traces_from_manifests_uses_declared_tools(): + profile = _make_profile("p", ["alpha", "missing"]) + manifests = { + "alpha": _make_manifest( + "alpha", + tools=["read_file", "browser_click", "memory_write"], + ), + } + + traces = infer_registration_traces_from_manifests(profile, manifests) + + assert set(traces) == {"alpha"} + assert traces["alpha"].tools == ["read_file", "browser_click", "memory_write"] + assert traces["alpha"].tool_families_seen == ["browser", "memory", "read"] + + def test_audit_flags_dead_weight_plugin(): profile = _make_profile("p", ["alpha", "beta"]) manifests = {