Add public domain scaffold and adapter diagnostics
This commit is contained in:
parent
df32a5f073
commit
595cdc910c
@ -245,9 +245,9 @@ Core v1 is a signal-curated public release of 19 tasks from the internal 40-task
|
||||
| Tier | Core v1 count | What it tests | Examples |
|
||||
|------|:---:|---|---|
|
||||
| **Tier 1** | 2 | Single-tool basics | Bugfix discount calc, quick file note |
|
||||
| **Tier 2** | 7 | Multi-step, 2-3 tools | Config loader repair, browser form fix, priv redaction |
|
||||
| **Tier 2** | 6 | Multi-step, 2-3 tools | Config loader repair, browser form fix, priv redaction |
|
||||
| **Tier 3** | 5 | Complex orchestration | SQL query analysis, inbox triage, data pipeline report |
|
||||
| **Tier 4** | 4 | Cross-system reasoning | Cross-repo migration, delegation repair, browser research+code |
|
||||
| **Tier 4** | 5 | Cross-system reasoning | Cross-repo migration, delegation repair, memory continuation, browser research+code |
|
||||
| **Tier 5** | 1 | Adversarial | Hallucination-resistant evidence |
|
||||
|
||||
Full manifest: [`tasks-public/MANIFEST.yaml`](tasks-public/MANIFEST.yaml).
|
||||
|
||||
@ -10,7 +10,7 @@ from pathlib import Path
|
||||
import click
|
||||
|
||||
from clawbench.client import GatewayConfig
|
||||
from clawbench.harness import BenchmarkHarness
|
||||
from clawbench.harness import BenchmarkHarness, KNOWN_ADAPTERS
|
||||
|
||||
SCENARIO_CHOICES = [
|
||||
"file_system_ops",
|
||||
@ -41,6 +41,13 @@ def cli(verbose: bool) -> None:
|
||||
|
||||
@cli.command()
|
||||
@click.option("--model", "-m", required=True, help="Model to benchmark")
|
||||
@click.option(
|
||||
"--adapter",
|
||||
type=click.Choice(KNOWN_ADAPTERS),
|
||||
default="openclaw",
|
||||
show_default=True,
|
||||
help="Agent harness adapter. OpenClaw is executable today; other adapters are tracked targets.",
|
||||
)
|
||||
@click.option("--gateway-token", envvar="OPENCLAW_GATEWAY_TOKEN", default="", help="Gateway auth token")
|
||||
@click.option(
|
||||
"--judge-model",
|
||||
@ -123,6 +130,7 @@ def cli(verbose: bool) -> None:
|
||||
)
|
||||
def run(
|
||||
model: str,
|
||||
adapter: str,
|
||||
gateway_token: str,
|
||||
judge_model: str,
|
||||
runs: int,
|
||||
@ -148,6 +156,7 @@ def run(
|
||||
harness = BenchmarkHarness(
|
||||
gateway_config=gateway_config,
|
||||
model=model,
|
||||
adapter=adapter,
|
||||
judge_model=judge_model,
|
||||
runs_per_task=runs,
|
||||
tier=tier,
|
||||
@ -178,6 +187,7 @@ def run(
|
||||
_run_v05_diagnostic(
|
||||
profile_path=profile,
|
||||
result=result,
|
||||
task_runs=harness.last_task_runs,
|
||||
runs_per_task=runs,
|
||||
insights_dir=insights_dir,
|
||||
)
|
||||
@ -269,6 +279,7 @@ def _run_v05_diagnostic(
|
||||
*,
|
||||
profile_path: Path,
|
||||
result,
|
||||
task_runs: dict[str, list] | None,
|
||||
runs_per_task: int,
|
||||
insights_dir: Path,
|
||||
) -> None:
|
||||
@ -278,6 +289,7 @@ def _run_v05_diagnostic(
|
||||
DEFAULT_MANIFEST_DIR,
|
||||
DEFAULT_SUBMISSIONS_DIR,
|
||||
ensure_data_dirs,
|
||||
infer_registration_traces_from_manifests,
|
||||
load_manifests,
|
||||
write_submission_record,
|
||||
)
|
||||
@ -291,6 +303,7 @@ def _run_v05_diagnostic(
|
||||
plugin_profile = PluginProfile.from_yaml_file(profile_path)
|
||||
plugin_ids = [e.id for e in plugin_profile.plugins]
|
||||
manifests = load_manifests(DEFAULT_MANIFEST_DIR, plugin_ids)
|
||||
traces = infer_registration_traces_from_manifests(plugin_profile, manifests)
|
||||
db = HistoricalDatabase(path=DEFAULT_DB_PATH)
|
||||
|
||||
# Extract per-task scores + tier map from the BenchmarkResult
|
||||
@ -301,12 +314,16 @@ def _run_v05_diagnostic(
|
||||
if getattr(task_stats, "tier", ""):
|
||||
tier_of[task_stats.task_id] = task_stats.tier
|
||||
|
||||
transcripts = _merge_task_transcripts_from_runs(task_runs or {})
|
||||
|
||||
diagnostic = submit_run(
|
||||
profile=plugin_profile,
|
||||
manifests=manifests,
|
||||
db=db,
|
||||
actual_overall_score=float(result.overall_score),
|
||||
actual_per_task_scores=actual_per_task,
|
||||
traces=traces,
|
||||
transcripts=transcripts,
|
||||
tier_of=tier_of or None,
|
||||
n_runs_contributing=runs_per_task,
|
||||
)
|
||||
@ -329,6 +346,22 @@ def _run_v05_diagnostic(
|
||||
)
|
||||
|
||||
|
||||
def _merge_task_transcripts_from_runs(task_runs: dict[str, list]):
|
||||
"""Merge all run transcripts per task for the v0.5 utilization audit."""
|
||||
if not task_runs:
|
||||
return None
|
||||
from clawbench.schemas import Transcript
|
||||
|
||||
merged: dict[str, Transcript] = {}
|
||||
for task_id, runs in task_runs.items():
|
||||
transcript = Transcript()
|
||||
for run in runs:
|
||||
transcript.messages.extend(getattr(run.transcript, "messages", []))
|
||||
if transcript.messages:
|
||||
merged[task_id] = transcript
|
||||
return merged or None
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.argument("profile", type=click.Path(exists=True, path_type=Path))
|
||||
@click.option(
|
||||
|
||||
@ -37,7 +37,8 @@ from clawbench.diagnostic import build_diagnostic, submit_run
|
||||
from clawbench.insights import publish_insights
|
||||
from clawbench.prediction import HistoricalDatabase
|
||||
from clawbench.profile import PluginManifest, PluginProfile, RegistrationTrace
|
||||
from clawbench.schemas import Transcript
|
||||
from clawbench.schemas import ToolCall, Transcript
|
||||
from clawbench.trajectory import classify_tool_call
|
||||
|
||||
|
||||
DEFAULT_CLAWBENCH_ROOT = Path(".clawbench")
|
||||
@ -80,6 +81,39 @@ def load_transcripts(path: Path) -> dict[str, Transcript]:
|
||||
return out
|
||||
|
||||
|
||||
def infer_registration_traces_from_manifests(
    profile: PluginProfile,
    manifests: dict[str, PluginManifest],
) -> dict[str, RegistrationTrace]:
    """Derive fallback registration traces from manifest-declared tools.

    A trace captured at runtime is preferable because it also records hooks,
    gateway methods, routes, and services. When only manifests are available,
    this still hands the diagnostic layer the exact tool names each plugin
    declares, so transcript tool calls can be attributed to a plugin rather
    than all landing in the unassigned bucket.

    Args:
        profile: Plugin profile whose ``plugins`` entries are walked in order.
        manifests: Loaded manifests keyed by plugin id.

    Returns:
        One ``RegistrationTrace`` per profile plugin that has a manifest,
        keyed by plugin id. Plugins without a manifest are skipped.
    """
    inferred: dict[str, RegistrationTrace] = {}
    for plugin in profile.plugins:
        plugin_id = plugin.id
        declared = manifests.get(plugin_id)
        if declared is None:
            continue

        tool_names = list(declared.contracts.get("tools", []))

        # Classify each non-empty tool name and keep the distinct families.
        family_set = set()
        for tool_name in tool_names:
            if tool_name:
                family_set.add(classify_tool_call(ToolCall(name=tool_name))[0])

        inferred[plugin_id] = RegistrationTrace(
            plugin_id=plugin_id,
            tools=tool_names,
            tool_families_seen=sorted(family_set),
        )
    return inferred
|
||||
|
||||
|
||||
def write_submission_record(
|
||||
submissions_dir: Path, fingerprint_hash: str, report_dict: dict
|
||||
) -> Path:
|
||||
@ -162,6 +196,7 @@ def main() -> None:
|
||||
profile = PluginProfile.from_yaml_file(args.profile)
|
||||
plugin_ids = [e.id for e in profile.plugins]
|
||||
manifests = load_manifests(args.manifests, plugin_ids)
|
||||
traces = infer_registration_traces_from_manifests(profile, manifests)
|
||||
db = HistoricalDatabase(path=args.db)
|
||||
|
||||
actual_overall: float | None = None
|
||||
@ -172,9 +207,16 @@ def main() -> None:
|
||||
sys.exit(2)
|
||||
results_data = json.loads(args.results.read_text(encoding="utf-8"))
|
||||
actual_overall = float(results_data.get("overall_score", 0.0))
|
||||
actual_per_task = {
|
||||
k: float(v) for k, v in results_data.get("per_task_score", {}).items()
|
||||
}
|
||||
if "per_task_score" in results_data:
|
||||
actual_per_task = {
|
||||
k: float(v) for k, v in results_data.get("per_task_score", {}).items()
|
||||
}
|
||||
else:
|
||||
actual_per_task = {
|
||||
str(item.get("task_id")): float(item.get("mean_task_score", 0.0))
|
||||
for item in results_data.get("task_results", [])
|
||||
if item.get("task_id")
|
||||
}
|
||||
|
||||
transcripts: dict[str, Transcript] | None = None
|
||||
if args.transcripts:
|
||||
@ -208,6 +250,7 @@ def main() -> None:
|
||||
db=db,
|
||||
actual_overall_score=actual_overall,
|
||||
actual_per_task_scores=actual_per_task,
|
||||
traces=traces,
|
||||
transcripts=transcripts,
|
||||
tier_of=tier_of,
|
||||
)
|
||||
@ -223,6 +266,7 @@ def main() -> None:
|
||||
db=db,
|
||||
actual_overall_score=actual_overall,
|
||||
actual_per_task_scores=actual_per_task,
|
||||
traces=traces,
|
||||
transcripts=transcripts,
|
||||
tier_of=tier_of,
|
||||
)
|
||||
|
||||
@ -40,6 +40,9 @@ from clawbench.tasks import get_assets_dir, load_all_tasks
|
||||
logger = logging.getLogger(__name__)
|
||||
console = Console()
|
||||
|
||||
KNOWN_ADAPTERS = ("openclaw", "hermes", "codex", "claude-code")
|
||||
EXECUTABLE_ADAPTERS = {"openclaw"}
|
||||
|
||||
|
||||
class _NullCtx:
|
||||
"""A no-op async context manager used to skip the browser semaphore
|
||||
@ -79,6 +82,7 @@ class BenchmarkHarness:
|
||||
quiet: bool = False,
|
||||
concurrency: int = 1,
|
||||
browser_concurrency: int = 1,
|
||||
adapter: str = "openclaw",
|
||||
) -> None:
|
||||
self.gateway_config = gateway_config
|
||||
self.model = model
|
||||
@ -102,10 +106,21 @@ class BenchmarkHarness:
|
||||
self.quiet = quiet
|
||||
self.concurrency = max(1, int(concurrency))
|
||||
self.browser_concurrency = max(1, int(browser_concurrency))
|
||||
self.adapter = adapter
|
||||
self.repo_root = Path(__file__).parent.parent
|
||||
self.last_task_runs: dict[str, list[TaskRunResult]] = {}
|
||||
|
||||
async def run(self) -> BenchmarkResult:
|
||||
if self.adapter not in KNOWN_ADAPTERS:
|
||||
raise ValueError(
|
||||
f"Unknown adapter '{self.adapter}'. Known adapters: {', '.join(KNOWN_ADAPTERS)}"
|
||||
)
|
||||
if self.adapter not in EXECUTABLE_ADAPTERS:
|
||||
raise ValueError(
|
||||
f"Adapter '{self.adapter}' is registered as a target but is not yet wired "
|
||||
"into the end-to-end scoring harness. Use 'openclaw' for executable runs."
|
||||
)
|
||||
|
||||
tasks = load_all_tasks(
|
||||
tasks_dir=self.tasks_dir,
|
||||
tier=self.tier,
|
||||
@ -129,6 +144,7 @@ class BenchmarkHarness:
|
||||
if not self.quiet:
|
||||
console.print(f"\n[bold]ClawBench v{__version__}[/bold] — {len(tasks)} tasks x {self.runs_per_task} runs")
|
||||
console.print(f"Model: [cyan]{self.model}[/cyan]")
|
||||
console.print(f"Adapter: [cyan]{self.adapter}[/cyan]")
|
||||
if self.judge_model:
|
||||
console.print(f"Advisory judge: [magenta]{self.judge_model}[/magenta]")
|
||||
mode = "serial" if self.concurrency == 1 else f"parallel(concurrency={self.concurrency}, browser={self.browser_concurrency})"
|
||||
@ -726,6 +742,9 @@ class BenchmarkHarness:
|
||||
"artifact_type": self.artifact_type or "all",
|
||||
"prompt_variant": self.prompt_variant,
|
||||
"judge_model": self.judge_model,
|
||||
"adapter": self.adapter,
|
||||
"known_adapters": list(KNOWN_ADAPTERS),
|
||||
"executable_adapters": sorted(EXECUTABLE_ADAPTERS),
|
||||
"subsets": self.subsets,
|
||||
"capabilities": self.capabilities,
|
||||
"official_only": self.official_only,
|
||||
|
||||
163
tasks-domain/MANIFEST.yaml
Normal file
163
tasks-domain/MANIFEST.yaml
Normal file
@ -0,0 +1,163 @@
|
||||
manifest_version: 1
|
||||
release: clawbench-domain-v0
|
||||
status: scaffold
|
||||
purpose: |
|
||||
Domain coverage scaffold for proving that model + general harness + plugins
|
||||
covers the jobs served by most agent SaaS products. This is not the small
|
||||
public Core v1 benchmark. It is the planned expansion corpus.
|
||||
|
||||
relationship_to_core_v1: |
|
||||
tasks-public/Core v1 is the public, signal-curated reproducibility set.
|
||||
tasks-domain is the domain coverage and ablation suite. Core v1 can stay
|
||||
small; domain coverage should grow through templates and private variants.
|
||||
|
||||
domains:
|
||||
- id: crm
|
||||
label: CRM
|
||||
representative_jobs:
|
||||
- lead enrichment
|
||||
- account update from meeting notes
|
||||
- opportunity risk summary
|
||||
- duplicate contact cleanup
|
||||
- follow-up task creation
|
||||
plugin_requirements: [browser, crm_api, docs, search, memory]
|
||||
verifier_contracts: [api_state, structured_artifact, cited_evidence]
|
||||
|
||||
- id: support
|
||||
label: Support
|
||||
representative_jobs:
|
||||
- ticket triage
|
||||
- macro draft with policy evidence
|
||||
- escalation routing
|
||||
- refund eligibility lookup
|
||||
- customer timeline summary
|
||||
plugin_requirements: [browser, support_api, knowledge_base, email]
|
||||
verifier_contracts: [api_state, policy_match, cited_evidence]
|
||||
|
||||
- id: email_calendar
|
||||
label: Email and calendar
|
||||
representative_jobs:
|
||||
- thread summarization
|
||||
- meeting scheduling
|
||||
- follow-up drafting
|
||||
- conflict detection
|
||||
- contact-aware prioritization
|
||||
plugin_requirements: [email, calendar, contacts, memory]
|
||||
verifier_contracts: [calendar_state, draft_content, no_duplicate_state]
|
||||
|
||||
- id: docs_sheets_slides
|
||||
label: Docs, sheets, slides
|
||||
representative_jobs:
|
||||
- spreadsheet cleanup
|
||||
- deck update
|
||||
- document redaction
|
||||
- chart generation
|
||||
- report formatting
|
||||
plugin_requirements: [filesystem, spreadsheet, document, slides, charting]
|
||||
verifier_contracts: [file_structure, rendered_diff, formula_check]
|
||||
|
||||
- id: project_management
|
||||
label: Project management
|
||||
representative_jobs:
|
||||
- issue grooming
|
||||
- sprint status update
|
||||
- dependency tracking
|
||||
- stale task cleanup
|
||||
- launch checklist synthesis
|
||||
plugin_requirements: [pm_api, repo, docs, notifications]
|
||||
verifier_contracts: [api_state, link_integrity, dependency_state]
|
||||
|
||||
- id: finance_ops
|
||||
label: Finance ops
|
||||
representative_jobs:
|
||||
- invoice reconciliation
|
||||
- expense categorization
|
||||
- budget variance report
|
||||
- payment exception triage
|
||||
- tax document checklist
|
||||
plugin_requirements: [spreadsheet, accounting_api, document, ocr]
|
||||
verifier_contracts: [numeric_tolerance, ledger_delta, audit_trail]
|
||||
|
||||
- id: data_analytics
|
||||
label: Data analytics
|
||||
representative_jobs:
|
||||
- SQL answer
|
||||
- dashboard explanation
|
||||
- ETL patch
|
||||
- anomaly investigation
|
||||
- chart specification
|
||||
plugin_requirements: [database, notebook, filesystem, bi_api]
|
||||
verifier_contracts: [query_result, execution_check, chart_spec]
|
||||
|
||||
- id: security_admin
|
||||
label: Security admin
|
||||
representative_jobs:
|
||||
- access review
|
||||
- incident timeline
|
||||
- secret rotation plan
|
||||
- policy exception review
|
||||
- audit log evidence packet
|
||||
plugin_requirements: [identity_api, logs, repo, policy_docs]
|
||||
verifier_contracts: [policy_state, cited_logs, refusal_gate]
|
||||
|
||||
- id: ecommerce_ops
|
||||
label: Ecommerce ops
|
||||
representative_jobs:
|
||||
- catalog update
|
||||
- order exception handling
|
||||
- promo QA
|
||||
- inventory reconciliation
|
||||
- returns policy response
|
||||
plugin_requirements: [storefront_api, spreadsheet, browser, email]
|
||||
verifier_contracts: [api_state, price_check, order_state]
|
||||
|
||||
- id: devtools
|
||||
label: Devtools
|
||||
representative_jobs:
|
||||
- repo migration
|
||||
- CI failure repair
|
||||
- release note generation
|
||||
- dependency update
|
||||
- multi-repo contract change
|
||||
plugin_requirements: [shell, git, filesystem, package_registry]
|
||||
verifier_contracts: [test_pass, diff_assertion, changelog_check]
|
||||
|
||||
- id: research
|
||||
label: Research
|
||||
representative_jobs:
|
||||
- evidence memo
|
||||
- citation synthesis
|
||||
- source contradiction handling
|
||||
- market scan
|
||||
- literature extraction
|
||||
plugin_requirements: [browser, web_search, web_fetch, document]
|
||||
verifier_contracts: [citation_check, no_fabrication, source_coverage]
|
||||
|
||||
- id: personal_ops
|
||||
label: Personal ops
|
||||
representative_jobs:
|
||||
- travel planning
|
||||
- household planning
|
||||
- health admin summary
|
||||
- personal finance checklist
|
||||
- recurring reminder setup
|
||||
plugin_requirements: [calendar, browser, memory, document]
|
||||
verifier_contracts: [constraint_satisfaction, state_transition, refusal_gate]
|
||||
|
||||
release_targets:
|
||||
domain_count: 12
|
||||
templates_per_domain: 5
|
||||
private_variants_per_template: 3
|
||||
runs_per_configuration: 3
|
||||
public_templates_total: 60
|
||||
private_variants_total: 180
|
||||
|
||||
ablation_classes:
|
||||
- id: model_only
|
||||
description: Model with minimal shell/filesystem access.
|
||||
- id: model_plus_harness
|
||||
description: Model plus general OpenClaw-style harness, no domain plugins.
|
||||
- id: core_plugins
|
||||
description: Harness plus common browser, memory, filesystem, and execution plugins.
|
||||
- id: domain_plugins
|
||||
description: Harness plus the plugins needed for each domain state surface.
|
||||
59
tasks-domain/README.md
Normal file
59
tasks-domain/README.md
Normal file
@ -0,0 +1,59 @@
|
||||
# ClawBench Domain Suite
|
||||
|
||||
`tasks-public/` is the small public Core v1 set. `tasks-domain/` is the
|
||||
coverage scaffold for the larger proof corpus: the domains served by most
|
||||
agent SaaS products, expressed as deterministic benchmark work.
|
||||
|
||||
The claim this suite is meant to support is:
|
||||
|
||||
> A capable model plus a general agent harness plus the right plugins can
|
||||
> cover the task domains that most agent SaaS products sell.
|
||||
|
||||
This is intentionally not a clone of vendor products. It is a taxonomy of
|
||||
jobs, state transitions, and verifier contracts.
|
||||
|
||||
## Domains
|
||||
|
||||
| Domain | Representative jobs | Required plugin surface | Verification style |
|
||||
|---|---|---|---|
|
||||
| CRM | lead enrichment, account updates, meeting notes to opportunities | browser, CRM API, docs, search, memory | API state assertions, fixture diffs |
|
||||
| Support | ticket triage, macro draft, escalation, refund lookup | browser, support API, knowledge base, email | ticket state, cited evidence, policy checks |
|
||||
| Email and calendar | thread summarization, scheduling, follow-ups | mail, calendar, contacts, memory | event state, draft content, no-duplicate checks |
|
||||
| Docs, sheets, slides | spreadsheet cleanup, deck edits, document redaction | file, office docs, charting | structural file assertions, rendered diffs |
|
||||
| Project management | issue grooming, sprint updates, dependency tracking | PM API, repo, docs, notifications | issue state, links, blocked/unblocked status |
|
||||
| Finance ops | invoice reconciliation, expense coding, budget variance | spreadsheets, accounting API, documents, OCR | ledger deltas, numeric tolerances, audit trail |
|
||||
| Data analytics | SQL, dashboard explanation, ETL patch, anomaly report | database, notebooks, BI API | query results, chart spec, report content |
|
||||
| Security admin | access review, incident timeline, secret rotation plan | identity, logs, repo, policy docs | policy state, log-derived evidence, refusal gates |
|
||||
| Ecommerce ops | catalog updates, order exception handling, promo QA | storefront API, spreadsheet, browser | product state, order workflow, price checks |
|
||||
| Devtools | repo migration, CI fix, release note, dependency update | shell, git, code, package registry | test pass, diff assertions, changelog checks |
|
||||
| Research | web evidence, citation synthesis, source contradiction | browser, web search, web fetch, docs | citation verifier, no-fabrication checks |
|
||||
| Personal ops | travel, household planning, health/wellness admin | calendar, browser, memory, docs | constraint satisfaction, state updates |
|
||||
|
||||
## Proof Standard
|
||||
|
||||
Each domain task should declare:
|
||||
|
||||
- `domain`: one of the domains above
|
||||
- `job`: the user-facing job being covered
|
||||
- `saas_equivalents`: examples of products whose core workflow overlaps
|
||||
- `plugin_requirements`: tool families and state surfaces needed
|
||||
- `deterministic_floor`: the verifier that must pass before any judge score
|
||||
- `holdout_variant_policy`: how private variants are generated
|
||||
- `ablation_axis`: which plugins or harness capabilities the task tests
|
||||
|
||||
## Minimum Bar
|
||||
|
||||
For a credible first domain release:
|
||||
|
||||
- 12 domains
|
||||
- 5 task templates per domain
|
||||
- 3 private variants per template
|
||||
- 3 runs per configuration
|
||||
- at least 4 configuration classes:
|
||||
- model only
|
||||
- model plus harness
|
||||
- model plus harness plus core plugins
|
||||
- model plus harness plus domain plugins
|
||||
|
||||
That yields 60 public templates and 180 private variants before repetitions.
|
||||
The public templates explain coverage; the private variants carry the proof.
|
||||
@ -43,12 +43,12 @@ selection_basis:
|
||||
coverage:
|
||||
tiers:
|
||||
tier1: 2
|
||||
tier2: 7
|
||||
tier2: 6
|
||||
tier3: 5
|
||||
tier4: 4
|
||||
tier4: 5
|
||||
tier5: 1
|
||||
families:
|
||||
tools: 7
|
||||
tools: 8
|
||||
coding: 2
|
||||
repo: 3
|
||||
browser: 2
|
||||
|
||||
@ -33,9 +33,9 @@ against your own configuration.
|
||||
|
||||
| Dimension | Breakdown |
|
||||
|---|---|
|
||||
| Tiers | T1=2, T2=7, T3=5, T4=4, T5=1 |
|
||||
| Families | tools=7, coding=2, repo=3, browser=2, multi_tool=3, adversarial=1 |
|
||||
| Capabilities | bugfix, refactor, test_authoring, multifile_reasoning, browser_debugging, structured_output, graceful_refusal, delegation, tool_composition, research_synthesis, cross_repo_change, memory_continuation |
|
||||
| Tiers | T1=2, T2=6, T3=5, T4=5, T5=1 |
|
||||
| Families | tools=8, coding=2, repo=3, browser=2, multi_tool=3, adversarial=1 |
|
||||
| Capabilities | bugfix, test_authoring, multifile_reasoning, browser_debugging, structured_output, graceful_refusal, delegation, tool_composition, research_synthesis, cross_repo_change, memory_continuation |
|
||||
|
||||
## Directory layout
|
||||
|
||||
@ -44,9 +44,9 @@ tasks-public/
|
||||
├── MANIFEST.yaml # Machine-readable task list + metadata
|
||||
├── README.md # This file
|
||||
├── tier1/ # 2 task YAMLs
|
||||
├── tier2/ # 7 task YAMLs
|
||||
├── tier2/ # 6 task YAMLs
|
||||
├── tier3/ # 5 task YAMLs
|
||||
├── tier4/ # 4 task YAMLs
|
||||
├── tier4/ # 5 task YAMLs
|
||||
├── tier5/ # 1 task YAML
|
||||
└── assets/ # 19 asset packs (verifier scripts + fixtures)
|
||||
```
|
||||
|
||||
@ -163,3 +163,55 @@ def test_compose_result_from_task_stats_supports_parallel_environment_metadata()
|
||||
assert merged_result.environment["parallel_lanes"] == 2
|
||||
assert merged_result.environment["requested_parallel_lanes"] == 3
|
||||
assert merged_result.environment["browser_tasks_serialized"] is False
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_run_records_adapter_surface(monkeypatch):
    """An executable-adapter run reports the adapter surface in its environment."""
    discount_task = next(
        candidate
        for candidate in load_all_tasks()
        if candidate.id == "t1-bugfix-discount"
    )

    async def fake_run_single(self, current_task, run_index: int):
        # Stand-in for a real run: always a single passing assertion.
        passing = CompletionResult(total_assertions=1, passed_assertions=1, score=1.0)
        return TaskRunResult(
            task_id=current_task.id,
            tier=current_task.tier.value,
            family=current_task.family.value,
            run_index=run_index,
            run_score=1.0,
            completion_result=passing,
        )

    monkeypatch.setattr("clawbench.harness.load_all_tasks", lambda **_: [discount_task])
    monkeypatch.setattr(BenchmarkHarness, "_run_single", fake_run_single)

    harness_kwargs = dict(
        gateway_config=GatewayConfig(),
        model="test-model",
        adapter="openclaw",
        runs_per_task=1,
        randomize_order=False,
        print_report=False,
        quiet=True,
    )
    outcome = await BenchmarkHarness(**harness_kwargs).run()

    environment = outcome.environment
    assert environment["adapter"] == "openclaw"
    assert "hermes" in environment["known_adapters"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_run_rejects_registered_but_unwired_adapter(monkeypatch):
    """A known adapter that is not executable makes ``run`` raise ``ValueError``."""
    discount_task = next(
        candidate
        for candidate in load_all_tasks()
        if candidate.id == "t1-bugfix-discount"
    )
    monkeypatch.setattr("clawbench.harness.load_all_tasks", lambda **_: [discount_task])

    harness_kwargs = dict(
        gateway_config=GatewayConfig(),
        model="test-model",
        adapter="hermes",
        runs_per_task=1,
        randomize_order=False,
        print_report=False,
        quiet=True,
    )
    unwired_harness = BenchmarkHarness(**harness_kwargs)

    with pytest.raises(ValueError, match="not yet wired"):
        await unwired_harness.run()
|
||||
|
||||
@ -23,6 +23,7 @@ from pathlib import Path
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
|
||||
|
||||
from clawbench.diagnostic import build_diagnostic, submit_run
|
||||
from clawbench.diagnose_cli import infer_registration_traces_from_manifests
|
||||
from clawbench.factor_analysis import analyze
|
||||
from clawbench.insights import (
|
||||
compute_capability_gaps,
|
||||
@ -139,6 +140,22 @@ def test_taguchi_sn_handles_zero_score_without_crashing():
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_infer_registration_traces_from_manifests_uses_declared_tools():
    """Traces exist only for plugins with a manifest and mirror its declared tools."""
    declared_tools = ["read_file", "browser_click", "memory_write"]
    # "missing" is in the profile but has no manifest, so it must be skipped.
    profile = _make_profile("p", ["alpha", "missing"])
    manifests = {"alpha": _make_manifest("alpha", tools=declared_tools)}

    traces = infer_registration_traces_from_manifests(profile, manifests)

    assert set(traces) == {"alpha"}
    alpha_trace = traces["alpha"]
    assert alpha_trace.tools == ["read_file", "browser_click", "memory_write"]
    assert alpha_trace.tool_families_seen == ["browser", "memory", "read"]
|
||||
|
||||
|
||||
def test_audit_flags_dead_weight_plugin():
|
||||
profile = _make_profile("p", ["alpha", "beta"])
|
||||
manifests = {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user