Add public domain scaffold and adapter diagnostics

This commit is contained in:
scoootscooob 2026-04-23 12:40:23 -07:00
parent df32a5f073
commit 595cdc910c
10 changed files with 402 additions and 15 deletions

View File

@ -245,9 +245,9 @@ Core v1 is a signal-curated public release of 19 tasks from the internal 40-task
| Tier | Core v1 count | What it tests | Examples |
|------|:---:|---|---|
| **Tier 1** | 2 | Single-tool basics | Bugfix discount calc, quick file note |
| **Tier 2** | 7 | Multi-step, 2-3 tools | Config loader repair, browser form fix, priv redaction |
| **Tier 2** | 6 | Multi-step, 2-3 tools | Config loader repair, browser form fix, priv redaction |
| **Tier 3** | 5 | Complex orchestration | SQL query analysis, inbox triage, data pipeline report |
| **Tier 4** | 4 | Cross-system reasoning | Cross-repo migration, delegation repair, browser research+code |
| **Tier 4** | 5 | Cross-system reasoning | Cross-repo migration, delegation repair, memory continuation, browser research+code |
| **Tier 5** | 1 | Adversarial | Hallucination-resistant evidence |
Full manifest: [`tasks-public/MANIFEST.yaml`](tasks-public/MANIFEST.yaml).

View File

@ -10,7 +10,7 @@ from pathlib import Path
import click
from clawbench.client import GatewayConfig
from clawbench.harness import BenchmarkHarness
from clawbench.harness import BenchmarkHarness, KNOWN_ADAPTERS
SCENARIO_CHOICES = [
"file_system_ops",
@ -41,6 +41,13 @@ def cli(verbose: bool) -> None:
@cli.command()
@click.option("--model", "-m", required=True, help="Model to benchmark")
@click.option(
"--adapter",
type=click.Choice(KNOWN_ADAPTERS),
default="openclaw",
show_default=True,
help="Agent harness adapter. OpenClaw is executable today; other adapters are tracked targets.",
)
@click.option("--gateway-token", envvar="OPENCLAW_GATEWAY_TOKEN", default="", help="Gateway auth token")
@click.option(
"--judge-model",
@ -123,6 +130,7 @@ def cli(verbose: bool) -> None:
)
def run(
model: str,
adapter: str,
gateway_token: str,
judge_model: str,
runs: int,
@ -148,6 +156,7 @@ def run(
harness = BenchmarkHarness(
gateway_config=gateway_config,
model=model,
adapter=adapter,
judge_model=judge_model,
runs_per_task=runs,
tier=tier,
@ -178,6 +187,7 @@ def run(
_run_v05_diagnostic(
profile_path=profile,
result=result,
task_runs=harness.last_task_runs,
runs_per_task=runs,
insights_dir=insights_dir,
)
@ -269,6 +279,7 @@ def _run_v05_diagnostic(
*,
profile_path: Path,
result,
task_runs: dict[str, list] | None,
runs_per_task: int,
insights_dir: Path,
) -> None:
@ -278,6 +289,7 @@ def _run_v05_diagnostic(
DEFAULT_MANIFEST_DIR,
DEFAULT_SUBMISSIONS_DIR,
ensure_data_dirs,
infer_registration_traces_from_manifests,
load_manifests,
write_submission_record,
)
@ -291,6 +303,7 @@ def _run_v05_diagnostic(
plugin_profile = PluginProfile.from_yaml_file(profile_path)
plugin_ids = [e.id for e in plugin_profile.plugins]
manifests = load_manifests(DEFAULT_MANIFEST_DIR, plugin_ids)
traces = infer_registration_traces_from_manifests(plugin_profile, manifests)
db = HistoricalDatabase(path=DEFAULT_DB_PATH)
# Extract per-task scores + tier map from the BenchmarkResult
@ -301,12 +314,16 @@ def _run_v05_diagnostic(
if getattr(task_stats, "tier", ""):
tier_of[task_stats.task_id] = task_stats.tier
transcripts = _merge_task_transcripts_from_runs(task_runs or {})
diagnostic = submit_run(
profile=plugin_profile,
manifests=manifests,
db=db,
actual_overall_score=float(result.overall_score),
actual_per_task_scores=actual_per_task,
traces=traces,
transcripts=transcripts,
tier_of=tier_of or None,
n_runs_contributing=runs_per_task,
)
@ -329,6 +346,22 @@ def _run_v05_diagnostic(
)
def _merge_task_transcripts_from_runs(task_runs: dict[str, list]):
"""Merge all run transcripts per task for the v0.5 utilization audit."""
if not task_runs:
return None
from clawbench.schemas import Transcript
merged: dict[str, Transcript] = {}
for task_id, runs in task_runs.items():
transcript = Transcript()
for run in runs:
transcript.messages.extend(getattr(run.transcript, "messages", []))
if transcript.messages:
merged[task_id] = transcript
return merged or None
@cli.command()
@click.argument("profile", type=click.Path(exists=True, path_type=Path))
@click.option(

View File

@ -37,7 +37,8 @@ from clawbench.diagnostic import build_diagnostic, submit_run
from clawbench.insights import publish_insights
from clawbench.prediction import HistoricalDatabase
from clawbench.profile import PluginManifest, PluginProfile, RegistrationTrace
from clawbench.schemas import Transcript
from clawbench.schemas import ToolCall, Transcript
from clawbench.trajectory import classify_tool_call
DEFAULT_CLAWBENCH_ROOT = Path(".clawbench")
@ -80,6 +81,39 @@ def load_transcripts(path: Path) -> dict[str, Transcript]:
return out
def infer_registration_traces_from_manifests(
    profile: PluginProfile,
    manifests: dict[str, PluginManifest],
) -> dict[str, RegistrationTrace]:
    """Derive fallback registration traces from manifest-declared tools.

    A real runtime registration trace is preferable, since it also carries
    hooks, gateway methods, routes, and services. When none is available,
    the tool names declared under a manifest's ``contracts["tools"]`` are
    still exact, which lets the diagnostic layer attribute many transcript
    tool calls to a plugin instead of leaving all utilization unassigned.

    Args:
        profile: Plugin profile; one trace is attempted per entry in
            ``profile.plugins``.
        manifests: Loaded manifests keyed by plugin id.

    Returns:
        Mapping of plugin id to its inferred trace. Plugins that have no
        entry in ``manifests`` are omitted.
    """
    inferred: dict[str, RegistrationTrace] = {}
    for plugin in profile.plugins:
        plugin_id = plugin.id
        declared = manifests.get(plugin_id)
        if declared is None:
            # No manifest loaded for this plugin: nothing to infer from.
            continue

        tool_names = list(declared.contracts.get("tools", []))

        # Falsy tool names are skipped for classification only; the trace
        # still receives the declared list unchanged.
        family_names = set()
        for tool_name in tool_names:
            if not tool_name:
                continue
            family_names.add(classify_tool_call(ToolCall(name=tool_name))[0])

        inferred[plugin_id] = RegistrationTrace(
            plugin_id=plugin_id,
            tools=tool_names,
            tool_families_seen=sorted(family_names),
        )
    return inferred
def write_submission_record(
submissions_dir: Path, fingerprint_hash: str, report_dict: dict
) -> Path:
@ -162,6 +196,7 @@ def main() -> None:
profile = PluginProfile.from_yaml_file(args.profile)
plugin_ids = [e.id for e in profile.plugins]
manifests = load_manifests(args.manifests, plugin_ids)
traces = infer_registration_traces_from_manifests(profile, manifests)
db = HistoricalDatabase(path=args.db)
actual_overall: float | None = None
@ -172,9 +207,16 @@ def main() -> None:
sys.exit(2)
results_data = json.loads(args.results.read_text(encoding="utf-8"))
actual_overall = float(results_data.get("overall_score", 0.0))
actual_per_task = {
k: float(v) for k, v in results_data.get("per_task_score", {}).items()
}
if "per_task_score" in results_data:
actual_per_task = {
k: float(v) for k, v in results_data.get("per_task_score", {}).items()
}
else:
actual_per_task = {
str(item.get("task_id")): float(item.get("mean_task_score", 0.0))
for item in results_data.get("task_results", [])
if item.get("task_id")
}
transcripts: dict[str, Transcript] | None = None
if args.transcripts:
@ -208,6 +250,7 @@ def main() -> None:
db=db,
actual_overall_score=actual_overall,
actual_per_task_scores=actual_per_task,
traces=traces,
transcripts=transcripts,
tier_of=tier_of,
)
@ -223,6 +266,7 @@ def main() -> None:
db=db,
actual_overall_score=actual_overall,
actual_per_task_scores=actual_per_task,
traces=traces,
transcripts=transcripts,
tier_of=tier_of,
)

View File

@ -40,6 +40,9 @@ from clawbench.tasks import get_assets_dir, load_all_tasks
logger = logging.getLogger(__name__)
console = Console()
KNOWN_ADAPTERS = ("openclaw", "hermes", "codex", "claude-code")
EXECUTABLE_ADAPTERS = {"openclaw"}
class _NullCtx:
"""A no-op async context manager used to skip the browser semaphore
@ -79,6 +82,7 @@ class BenchmarkHarness:
quiet: bool = False,
concurrency: int = 1,
browser_concurrency: int = 1,
adapter: str = "openclaw",
) -> None:
self.gateway_config = gateway_config
self.model = model
@ -102,10 +106,21 @@ class BenchmarkHarness:
self.quiet = quiet
self.concurrency = max(1, int(concurrency))
self.browser_concurrency = max(1, int(browser_concurrency))
self.adapter = adapter
self.repo_root = Path(__file__).parent.parent
self.last_task_runs: dict[str, list[TaskRunResult]] = {}
async def run(self) -> BenchmarkResult:
if self.adapter not in KNOWN_ADAPTERS:
raise ValueError(
f"Unknown adapter '{self.adapter}'. Known adapters: {', '.join(KNOWN_ADAPTERS)}"
)
if self.adapter not in EXECUTABLE_ADAPTERS:
raise ValueError(
f"Adapter '{self.adapter}' is registered as a target but is not yet wired "
"into the end-to-end scoring harness. Use 'openclaw' for executable runs."
)
tasks = load_all_tasks(
tasks_dir=self.tasks_dir,
tier=self.tier,
@ -129,6 +144,7 @@ class BenchmarkHarness:
if not self.quiet:
console.print(f"\n[bold]ClawBench v{__version__}[/bold] — {len(tasks)} tasks x {self.runs_per_task} runs")
console.print(f"Model: [cyan]{self.model}[/cyan]")
console.print(f"Adapter: [cyan]{self.adapter}[/cyan]")
if self.judge_model:
console.print(f"Advisory judge: [magenta]{self.judge_model}[/magenta]")
mode = "serial" if self.concurrency == 1 else f"parallel(concurrency={self.concurrency}, browser={self.browser_concurrency})"
@ -726,6 +742,9 @@ class BenchmarkHarness:
"artifact_type": self.artifact_type or "all",
"prompt_variant": self.prompt_variant,
"judge_model": self.judge_model,
"adapter": self.adapter,
"known_adapters": list(KNOWN_ADAPTERS),
"executable_adapters": sorted(EXECUTABLE_ADAPTERS),
"subsets": self.subsets,
"capabilities": self.capabilities,
"official_only": self.official_only,

163
tasks-domain/MANIFEST.yaml Normal file
View File

@ -0,0 +1,163 @@
manifest_version: 1
release: clawbench-domain-v0
status: scaffold
purpose: |
Domain coverage scaffold for proving that model + general harness + plugins
covers the jobs served by most agent SaaS products. This is not the small
public Core v1 benchmark. It is the planned expansion corpus.
relationship_to_core_v1: |
tasks-public/Core v1 is the public, signal-curated reproducibility set.
tasks-domain is the domain coverage and ablation suite. Core v1 can stay
small; domain coverage should grow through templates and private variants.
domains:
- id: crm
label: CRM
representative_jobs:
- lead enrichment
- account update from meeting notes
- opportunity risk summary
- duplicate contact cleanup
- follow-up task creation
plugin_requirements: [browser, crm_api, docs, search, memory]
verifier_contracts: [api_state, structured_artifact, cited_evidence]
- id: support
label: Support
representative_jobs:
- ticket triage
- macro draft with policy evidence
- escalation routing
- refund eligibility lookup
- customer timeline summary
plugin_requirements: [browser, support_api, knowledge_base, email]
verifier_contracts: [api_state, policy_match, cited_evidence]
- id: email_calendar
label: Email and calendar
representative_jobs:
- thread summarization
- meeting scheduling
- follow-up drafting
- conflict detection
- contact-aware prioritization
plugin_requirements: [email, calendar, contacts, memory]
verifier_contracts: [calendar_state, draft_content, no_duplicate_state]
- id: docs_sheets_slides
label: Docs, sheets, slides
representative_jobs:
- spreadsheet cleanup
- deck update
- document redaction
- chart generation
- report formatting
plugin_requirements: [filesystem, spreadsheet, document, slides, charting]
verifier_contracts: [file_structure, rendered_diff, formula_check]
- id: project_management
label: Project management
representative_jobs:
- issue grooming
- sprint status update
- dependency tracking
- stale task cleanup
- launch checklist synthesis
plugin_requirements: [pm_api, repo, docs, notifications]
verifier_contracts: [api_state, link_integrity, dependency_state]
- id: finance_ops
label: Finance ops
representative_jobs:
- invoice reconciliation
- expense categorization
- budget variance report
- payment exception triage
- tax document checklist
plugin_requirements: [spreadsheet, accounting_api, document, ocr]
verifier_contracts: [numeric_tolerance, ledger_delta, audit_trail]
- id: data_analytics
label: Data analytics
representative_jobs:
- SQL answer
- dashboard explanation
- ETL patch
- anomaly investigation
- chart specification
plugin_requirements: [database, notebook, filesystem, bi_api]
verifier_contracts: [query_result, execution_check, chart_spec]
- id: security_admin
label: Security admin
representative_jobs:
- access review
- incident timeline
- secret rotation plan
- policy exception review
- audit log evidence packet
plugin_requirements: [identity_api, logs, repo, policy_docs]
verifier_contracts: [policy_state, cited_logs, refusal_gate]
- id: ecommerce_ops
label: Ecommerce ops
representative_jobs:
- catalog update
- order exception handling
- promo QA
- inventory reconciliation
- returns policy response
plugin_requirements: [storefront_api, spreadsheet, browser, email]
verifier_contracts: [api_state, price_check, order_state]
- id: devtools
label: Devtools
representative_jobs:
- repo migration
- CI failure repair
- release note generation
- dependency update
- multi-repo contract change
plugin_requirements: [shell, git, filesystem, package_registry]
verifier_contracts: [test_pass, diff_assertion, changelog_check]
- id: research
label: Research
representative_jobs:
- evidence memo
- citation synthesis
- source contradiction handling
- market scan
- literature extraction
plugin_requirements: [browser, web_search, web_fetch, document]
verifier_contracts: [citation_check, no_fabrication, source_coverage]
- id: personal_ops
label: Personal ops
representative_jobs:
- travel planning
- household planning
- health admin summary
- personal finance checklist
- recurring reminder setup
plugin_requirements: [calendar, browser, memory, document]
verifier_contracts: [constraint_satisfaction, state_transition, refusal_gate]
release_targets:
domain_count: 12
templates_per_domain: 5
private_variants_per_template: 3
runs_per_configuration: 3
public_templates_total: 60
private_variants_total: 180
ablation_classes:
- id: model_only
description: Model with minimal shell/filesystem access.
- id: model_plus_harness
description: Model plus general OpenClaw-style harness, no domain plugins.
- id: core_plugins
description: Harness plus common browser, memory, filesystem, and execution plugins.
- id: domain_plugins
description: Harness plus the plugins needed for each domain state surface.

59
tasks-domain/README.md Normal file
View File

@ -0,0 +1,59 @@
# ClawBench Domain Suite
`tasks-public/` is the small public Core v1 set. `tasks-domain/` is the
coverage scaffold for the larger proof corpus: the domains served by most
agent SaaS products, expressed as deterministically verifiable benchmark tasks.
The claim this suite is meant to support is:
> A capable model plus a general agent harness plus the right plugins can
> cover the task domains that most agent SaaS products sell.
This is intentionally not a clone of vendor products. It is a taxonomy of
jobs, state transitions, and verifier contracts.
## Domains
| Domain | Representative jobs | Required plugin surface | Verification style |
|---|---|---|---|
| CRM | lead enrichment, account updates, meeting notes to opportunities | browser, CRM API, docs, search | API state assertions, fixture diffs |
| Support | ticket triage, macro draft, escalation, refund lookup | browser/API, knowledge base, email | ticket state, cited evidence, policy checks |
| Email and calendar | thread summarization, scheduling, follow-ups | mail, calendar, contacts, memory | event state, draft content, no-duplicate checks |
| Docs, sheets, slides | spreadsheet cleanup, deck edits, document redaction | file, office docs, charting | structural file assertions, rendered diffs |
| Project management | issue grooming, sprint updates, dependency tracking | PM API, repo, docs, notifications | issue state, links, blocked/unblocked status |
| Finance ops | invoice reconciliation, expense coding, budget variance | spreadsheets, accounting API, OCR | ledger deltas, numeric tolerances, audit trail |
| Data analytics | SQL, dashboard explanation, ETL patch, anomaly report | database, notebooks, BI API | query results, chart spec, report content |
| Security admin | access review, incident timeline, secret rotation plan | identity, logs, repo, policy docs | policy state, log-derived evidence, refusal gates |
| Ecommerce ops | catalog updates, order exception handling, promo QA | storefront API, spreadsheet, browser | product state, order workflow, price checks |
| Devtools | repo migration, CI fix, release note, dependency update | shell, git, code, package registry | test pass, diff assertions, changelog checks |
| Research | web evidence, citation synthesis, source contradiction | browser, web search, docs | citation verifier, no-fabrication checks |
| Personal ops | travel, household planning, health/wellness admin | calendar, browser, memory, docs | constraint satisfaction, state updates |
## Proof Standard
Each domain task should declare:
- `domain`: one of the domains above
- `job`: the user-facing job being covered
- `saas_equivalents`: examples of products whose core workflow overlaps
- `plugin_requirements`: tool families and state surfaces needed
- `deterministic_floor`: the verifier that must pass before any judge score
- `holdout_variant_policy`: how private variants are generated
- `ablation_axis`: which plugins or harness capabilities the task tests
## Minimum Bar
For a credible first domain release:
- 12 domains
- 5 task templates per domain
- 3 private variants per template
- 3 runs per configuration
- at least 4 configuration classes:
  - model only
  - model plus harness
  - model plus harness plus core plugins
  - model plus harness plus domain plugins
That yields 60 public templates and 180 private variants before repetitions.
The public templates explain coverage; the private variants carry the proof.

View File

@ -43,12 +43,12 @@ selection_basis:
coverage:
tiers:
tier1: 2
tier2: 7
tier2: 6
tier3: 5
tier4: 4
tier4: 5
tier5: 1
families:
tools: 7
tools: 8
coding: 2
repo: 3
browser: 2

View File

@ -33,9 +33,9 @@ against your own configuration.
| Dimension | Breakdown |
|---|---|
| Tiers | T1=2, T2=7, T3=5, T4=4, T5=1 |
| Families | tools=7, coding=2, repo=3, browser=2, multi_tool=3, adversarial=1 |
| Capabilities | bugfix, refactor, test_authoring, multifile_reasoning, browser_debugging, structured_output, graceful_refusal, delegation, tool_composition, research_synthesis, cross_repo_change, memory_continuation |
| Tiers | T1=2, T2=6, T3=5, T4=5, T5=1 |
| Families | tools=8, coding=2, repo=3, browser=2, multi_tool=3, adversarial=1 |
| Capabilities | bugfix, test_authoring, multifile_reasoning, browser_debugging, structured_output, graceful_refusal, delegation, tool_composition, research_synthesis, cross_repo_change, memory_continuation |
## Directory layout
@ -44,9 +44,9 @@ tasks-public/
├── MANIFEST.yaml # Machine-readable task list + metadata
├── README.md # This file
├── tier1/ # 2 task YAMLs
├── tier2/ # 7 task YAMLs
├── tier2/ # 6 task YAMLs
├── tier3/ # 5 task YAMLs
├── tier4/ # 4 task YAMLs
├── tier4/ # 5 task YAMLs
├── tier5/ # 1 task YAML
└── assets/ # 19 asset packs (verifier scripts + fixtures)
```

View File

@ -163,3 +163,55 @@ def test_compose_result_from_task_stats_supports_parallel_environment_metadata()
assert merged_result.environment["parallel_lanes"] == 2
assert merged_result.environment["requested_parallel_lanes"] == 3
assert merged_result.environment["browser_tasks_serialized"] is False
@pytest.mark.asyncio
async def test_run_records_adapter_surface(monkeypatch):
    """A run with the ``openclaw`` adapter reports the adapter in ``environment``."""
    discount_task = next(
        candidate
        for candidate in load_all_tasks()
        if candidate.id == "t1-bugfix-discount"
    )

    async def fake_run_single(self, current_task, run_index: int):
        # Replaces the real per-run execution with a perfect one-assertion result.
        completion = CompletionResult(
            total_assertions=1,
            passed_assertions=1,
            score=1.0,
        )
        return TaskRunResult(
            task_id=current_task.id,
            tier=current_task.tier.value,
            family=current_task.family.value,
            run_index=run_index,
            run_score=1.0,
            completion_result=completion,
        )

    # Restrict the harness to the single selected task and stub out execution.
    monkeypatch.setattr(
        "clawbench.harness.load_all_tasks", lambda **_: [discount_task]
    )
    monkeypatch.setattr(BenchmarkHarness, "_run_single", fake_run_single)

    harness = BenchmarkHarness(
        gateway_config=GatewayConfig(),
        model="test-model",
        adapter="openclaw",
        runs_per_task=1,
        randomize_order=False,
        print_report=False,
        quiet=True,
    )

    outcome = await harness.run()

    assert outcome.environment["adapter"] == "openclaw"
    assert "hermes" in outcome.environment["known_adapters"]
@pytest.mark.asyncio
async def test_run_rejects_registered_but_unwired_adapter(monkeypatch):
    """``run()`` raises ``ValueError`` ("not yet wired") for the ``hermes`` adapter."""
    discount_task = next(
        candidate
        for candidate in load_all_tasks()
        if candidate.id == "t1-bugfix-discount"
    )
    monkeypatch.setattr(
        "clawbench.harness.load_all_tasks", lambda **_: [discount_task]
    )

    unwired_harness = BenchmarkHarness(
        gateway_config=GatewayConfig(),
        model="test-model",
        adapter="hermes",
        runs_per_task=1,
        randomize_order=False,
        print_report=False,
        quiet=True,
    )

    with pytest.raises(ValueError, match="not yet wired"):
        await unwired_harness.run()

View File

@ -23,6 +23,7 @@ from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
from clawbench.diagnostic import build_diagnostic, submit_run
from clawbench.diagnose_cli import infer_registration_traces_from_manifests
from clawbench.factor_analysis import analyze
from clawbench.insights import (
compute_capability_gaps,
@ -139,6 +140,22 @@ def test_taguchi_sn_handles_zero_score_without_crashing():
# ---------------------------------------------------------------------------
def test_infer_registration_traces_from_manifests_uses_declared_tools():
    """Only plugins with a manifest get a trace, built from their declared tools."""
    declared_tools = ["read_file", "browser_click", "memory_write"]
    # "missing" is in the profile but deliberately has no manifest.
    profile = _make_profile("p", ["alpha", "missing"])
    manifests = {"alpha": _make_manifest("alpha", tools=list(declared_tools))}

    inferred = infer_registration_traces_from_manifests(profile, manifests)

    assert set(inferred) == {"alpha"}
    alpha_trace = inferred["alpha"]
    assert alpha_trace.tools == ["read_file", "browser_click", "memory_write"]
    assert alpha_trace.tool_families_seen == ["browser", "memory", "read"]
def test_audit_flags_dead_weight_plugin():
profile = _make_profile("p", ["alpha", "beta"])
manifests = {