835 lines
29 KiB
Python
835 lines
29 KiB
Python
"""CLI entry point for ClawBench."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
import click
|
|
|
|
from clawbench.client import GatewayConfig
|
|
from clawbench.harness import BenchmarkHarness, KNOWN_ADAPTERS
|
|
|
|
# Query-scenario identifiers accepted by every ``--scenario`` option in this
# module (each option wraps this list in ``click.Choice``).
SCENARIO_CHOICES = [
    "file_system_ops",
    "web_info_ops",
    "calendar_reminders",
    "communication_messaging",
    "data_processing_analysis",
    "coding_dev_assist",
    "personal_life_assistant",
    "multi_step_compound",
    "context_continuation",
    "error_boundary_cases",
    "skill_calling",
    "system_capabilities",
]
|
|
|
|
|
|
@click.group()
@click.option("--verbose", "-v", is_flag=True, help="Enable debug logging")
def cli(verbose: bool) -> None:
    """ClawBench command-line interface."""
    # The docstring above is what Click prints as the description for
    # ``--help`` on the group, so it is kept short.
    #
    # Root logging is configured once here, before any sub-command runs:
    # DEBUG when --verbose/-v is given, INFO otherwise.
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
        datefmt="%H:%M:%S",
    )
|
|
|
|
|
|
@cli.command()
@click.option("--model", "-m", required=True, help="Model to benchmark")
@click.option(
    "--adapter",
    type=click.Choice(KNOWN_ADAPTERS),
    default="openclaw",
    show_default=True,
    help="Agent harness adapter. OpenClaw is executable today; other adapters are tracked targets.",
)
@click.option("--gateway-token", envvar="OPENCLAW_GATEWAY_TOKEN", default="", help="Gateway auth token")
@click.option(
    "--judge-model",
    envvar="CLAWBENCH_JUDGE_MODEL",
    default="",
    help="Optional advisory LLM judge model (does not affect official score)",
)
@click.option(
    "--runs",
    "-n",
    default=5,
    # A run count below 1 is meaningless; reject it at the CLI boundary.
    type=click.IntRange(min=1),
    help="Runs per task (reliability uses all runs)",
)
@click.option("--tier", type=click.Choice(["tier1", "tier2", "tier3", "tier4", "tier5"]), help="Filter tier")
@click.option("--scenario", type=click.Choice(SCENARIO_CHOICES), help="Filter query scenario")
@click.option(
    "--artifact-type",
    type=click.Choice(
        ["file", "information", "operation", "code", "external_action", "memory", "automation", "mixed"]
    ),
    help="Filter expected artifact type",
)
@click.option(
    "--prompt-variant",
    type=click.Choice(["clear", "ambiguous"]),
    default="clear",
    show_default=True,
    help="Prompt variant to run",
)
@click.option("--pool", type=click.Choice(["public_dev", "official_hidden"]), help="Filter task pool")
@click.option("--subset", multiple=True, type=click.Choice(["consensus", "hard"]), help="Filter task subset")
@click.option(
    "--capability",
    multiple=True,
    type=click.Choice(
        [
            "bugfix",
            "refactor",
            "test_authoring",
            "multifile_reasoning",
            "browser_debugging",
            "structured_output",
            "memory_continuation",
            "delegation",
            "tool_composition",
            "research_synthesis",
            "graceful_refusal",
            "spec_revision",
            "cross_repo_change",
            "automation",
        ]
    ),
    help="Filter by capability tag",
)
@click.option("--official-only", is_flag=True, help="Only run tasks marked official")
@click.option("--task", "-t", multiple=True, help="Specific task IDs to run")
@click.option("--output", "-o", type=click.Path(), help="Output JSON file path")
@click.option("--no-randomize", is_flag=True, help="Run tasks in definition order")
@click.option("--upload", is_flag=True, help="Upload results to HF Dataset")
@click.option(
    "--concurrency",
    "-c",
    default=1,
    show_default=True,
    # Parallelism must be at least 1; zero or negative values are rejected
    # with a usage error instead of being handed to the harness.
    type=click.IntRange(min=1),
    envvar="CLAWBENCH_CONCURRENCY",
    help="Number of (task, run) work items to execute in parallel against the gateway. "
    "Set to 4-8 for dramatic speedup. Browser tasks are still serialized.",
)
@click.option(
    "--browser-concurrency",
    default=1,
    show_default=True,
    type=click.IntRange(min=1),
    help="Maximum browser tasks to run concurrently. Should normally stay 1 — "
    "Chromium uses a fixed port that does not parallelize.",
)
@click.option(
    "--profile",
    type=click.Path(exists=True, path_type=Path),
    default=None,
    help="Optional Plugin Profile YAML. When provided, after the benchmark run "
    "completes the v0.5 Configuration Diagnostic Report is generated and "
    "the run is recorded in the historical profile database.",
)
@click.option(
    "--insights-dir",
    type=click.Path(path_type=Path),
    default=Path(".clawbench/insights"),
    show_default=True,
    help="Where to write ecosystem insight files after a --profile run.",
)
@click.option(
    "--dynamics",
    is_flag=True,
    help="Run quick post-benchmark dynamics analysis. Prefer dynamics-report for offline cache/archive analysis.",
)
def run(
    model: str,
    adapter: str,
    gateway_token: str,
    judge_model: str,
    runs: int,
    tier: str | None,
    scenario: str | None,
    artifact_type: str | None,
    prompt_variant: str,
    pool: str | None,
    subset: tuple[str, ...],
    capability: tuple[str, ...],
    official_only: bool,
    task: tuple[str, ...],
    output: str | None,
    no_randomize: bool,
    upload: bool,
    concurrency: int,
    browser_concurrency: int,
    profile: Path | None,
    insights_dir: Path,
    dynamics: bool,
) -> None:
    """Benchmark a model and save the results as JSON.

    Results are written to --output, or to results/<submission_id>.json
    when no output path is given.
    """
    harness = BenchmarkHarness(
        gateway_config=GatewayConfig(token=gateway_token),
        model=model,
        adapter=adapter,
        judge_model=judge_model,
        runs_per_task=runs,
        tier=tier,
        scenario=scenario,
        artifact_type=artifact_type,
        prompt_variant=prompt_variant,
        pool=pool,
        subsets=list(subset),
        capabilities=list(capability),
        official_only=official_only,
        # An empty --task selection means "no explicit ID filter".
        task_ids=list(task) if task else None,
        randomize_order=not no_randomize,
        concurrency=concurrency,
        browser_concurrency=browser_concurrency,
    )

    result = asyncio.run(harness.run())

    # Persist the result first so it survives a failure in any of the
    # optional post-processing steps below.
    out_path = output or f"results/{result.submission_id}.json"
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as handle:
        json.dump(result.model_dump(), handle, indent=2)
    click.echo(f"\nResults saved to {out_path}")

    if dynamics:
        _run_dynamics_analysis(harness.last_task_runs, out_path)

    if profile is not None:
        _run_v05_diagnostic(
            profile_path=profile,
            result=result,
            task_runs=harness.last_task_runs,
            runs_per_task=runs,
            insights_dir=insights_dir,
        )

    if upload:
        # Imported lazily so the upload module is only needed with --upload.
        from clawbench.upload import upload_result

        asyncio.run(upload_result(result))
|
|
|
|
|
|
@cli.command("dynamics-report")
@click.option(
    "--archive-dir",
    type=click.Path(exists=True, file_okay=False, path_type=Path),
    required=True,
    help="Path to a run cache/archive root or a single model cache directory.",
)
@click.option(
    "--model",
    default=None,
    help="Model id to select when the archive root contains multiple model directories.",
)
@click.option(
    "--tier",
    type=click.Choice(["tier1", "tier2", "tier3", "tier4", "tier5"]),
    help="Filter tier",
)
@click.option("--task", "task_ids", multiple=True, help="Specific task IDs to include from the archive.")
@click.option(
    "--output-dir",
    type=click.Path(path_type=Path),
    default=Path("results/offline_dynamics"),
    show_default=True,
    help="Directory where dynamics.json and plots will be written.",
)
@click.option(
    "--no-plots",
    is_flag=True,
    help="Write only dynamics.json and skip plot rendering.",
)
def dynamics_report(
    archive_dir: Path,
    model: str | None,
    tier: str | None,
    task_ids: tuple[str, ...],
    output_dir: Path,
    no_plots: bool,
) -> None:
    """Generate dynamics plots and a JSON report from cached TaskRunResult archives."""
    from clawbench.dynamics_archive import load_task_runs_archive

    # A ValueError from the loader is surfaced as a regular CLI error
    # message rather than a traceback.
    try:
        task_runs = load_task_runs_archive(
            archive_dir=archive_dir,
            model=model,
            task_ids=task_ids,
            tier=tier,
        )
    except ValueError as exc:
        raise click.ClickException(str(exc)) from exc

    if not task_runs:
        raise click.ClickException(f"No cached runs found under {archive_dir}")

    report_path, plots, n_runs = _write_dynamics_report(
        task_runs,
        output_dir,
        generate_plots=not no_plots,
    )
    click.echo(f"Loaded {n_runs} cached runs across {len(task_runs)} tasks")
    click.echo(f"Dynamics report saved to {report_path}")
    # With --no-plots nothing is rendered, so do not report a plot count.
    if not no_plots:
        click.echo(f"Saved {len(plots)} plots to {output_dir}/")
|
|
|
|
|
|
def _write_dynamics_report(
    task_runs: dict[str, list],
    output_dir: Path,
    *,
    generate_plots: bool = True,
) -> tuple[Path, list[Path], int]:
    """Write the dynamics report and count the runs that fed into it.

    Delegates the actual writing to
    ``clawbench.dynamics_archive.write_dynamics_report``.

    Returns:
        ``(report_path, plots, n_runs)`` where the first two values come
        straight from the delegate and ``n_runs`` is the total number of
        run entries across all tasks in ``task_runs``.
    """
    from clawbench.dynamics_archive import write_dynamics_report

    written = write_dynamics_report(task_runs, output_dir, generate_plots=generate_plots)
    report_path, plots = written

    # Counted after the report is written, keeping the original ordering.
    n_runs = 0
    for runs in task_runs.values():
        n_runs += len(runs)
    return report_path, plots, n_runs
|
|
|
|
|
|
def _run_v05_diagnostic(
    *,
    profile_path: Path,
    result,
    task_runs: dict[str, list] | None,
    runs_per_task: int,
    insights_dir: Path,
) -> None:
    """Produce, record, and print the v0.5 diagnostic for a finished run.

    Loads the plugin profile and its manifests, submits the run's overall
    and per-task scores through ``submit_run``, writes the submission
    record, publishes insights into ``insights_dir``, and echoes the
    rendered report.

    Args:
        profile_path: Plugin Profile YAML file to load.
        result: Benchmark result; ``overall_score`` and ``task_results``
            are read from it.
        task_runs: Raw runs per task id, used only to collect transcripts;
            ``None`` is treated as "no runs".
        runs_per_task: Passed to ``submit_run`` as ``n_runs_contributing``.
        insights_dir: Destination directory for published insights.
    """
    from clawbench.diagnose_cli import (
        DEFAULT_DB_PATH,
        DEFAULT_MANIFEST_DIR,
        DEFAULT_SUBMISSIONS_DIR,
        ensure_data_dirs,
        infer_registration_traces_from_manifests,
        load_manifests,
        write_submission_record,
    )
    from clawbench.diagnostic import submit_run
    from clawbench.insights import publish_insights
    from clawbench.prediction import HistoricalDatabase
    from clawbench.profile import PluginProfile

    ensure_data_dirs()

    plugin_profile = PluginProfile.from_yaml_file(profile_path)
    manifests = load_manifests(
        DEFAULT_MANIFEST_DIR,
        [entry.id for entry in plugin_profile.plugins],
    )
    traces = infer_registration_traces_from_manifests(plugin_profile, manifests)
    db = HistoricalDatabase(path=DEFAULT_DB_PATH)

    # Per-task mean scores, plus a tier map limited to tasks that actually
    # carry a non-empty tier.
    score_by_task: dict[str, float] = {
        stats.task_id: float(stats.mean_task_score) for stats in result.task_results
    }
    tier_by_task: dict[str, str] = {
        stats.task_id: stats.tier
        for stats in result.task_results
        if getattr(stats, "tier", "")
    }

    diagnostic = submit_run(
        profile=plugin_profile,
        manifests=manifests,
        db=db,
        actual_overall_score=float(result.overall_score),
        actual_per_task_scores=score_by_task,
        traces=traces,
        transcripts=_merge_task_transcripts_from_runs(task_runs or {}),
        # An empty tier map is passed as None rather than {}.
        tier_of=tier_by_task or None,
        n_runs_contributing=runs_per_task,
    )

    write_submission_record(
        DEFAULT_SUBMISSIONS_DIR,
        diagnostic.fingerprint_hash,
        diagnostic.to_dict(),
    )
    publish_insights(db, insights_dir, factor_report=diagnostic.factor_analysis)

    summary = (
        f"\nv0.5 diagnostic recorded for profile '{plugin_profile.name}' "
        f"(fingerprint {diagnostic.fingerprint_hash}). "
        f"Insights published to {insights_dir}."
    )
    click.echo("")
    click.echo(diagnostic.render_text())
    click.echo(summary)
|
|
|
|
|
|
def _merge_task_transcripts_from_runs(task_runs: dict[str, list]):
|
|
"""Merge all run transcripts per task for the v0.5 utilization audit."""
|
|
if not task_runs:
|
|
return None
|
|
from clawbench.schemas import Transcript
|
|
|
|
merged: dict[str, Transcript] = {}
|
|
for task_id, runs in task_runs.items():
|
|
transcript = Transcript()
|
|
for run in runs:
|
|
transcript.messages.extend(getattr(run.transcript, "messages", []))
|
|
if transcript.messages:
|
|
merged[task_id] = transcript
|
|
return merged or None
|
|
|
|
|
|
@cli.command()
@click.argument("profile", type=click.Path(exists=True, path_type=Path))
@click.option(
    "--results",
    # Validate up front so a wrong path yields a Click usage error instead
    # of a FileNotFoundError traceback from open().
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
    default=None,
    help="Optional v0.4 BenchmarkResult JSON; enables post-run analysis.",
)
@click.option(
    "--manifests",
    type=click.Path(path_type=Path),
    default=Path(".clawbench/manifests"),
    show_default=True,
    help="Directory of plugin manifest JSON files.",
)
@click.option(
    "--db",
    type=click.Path(path_type=Path),
    default=Path(".clawbench/historical/profile_runs.json"),
    show_default=True,
    help="Path to the historical profile database.",
)
@click.option(
    "--insights-dir",
    type=click.Path(path_type=Path),
    default=Path(".clawbench/insights"),
    show_default=True,
    help="Where to write ecosystem insight files when --results is provided.",
)
@click.option("--json-out", is_flag=True, help="Print diagnostic as JSON")
def diagnose(
    profile: Path,
    results: Path | None,
    manifests: Path,
    db: Path,
    insights_dir: Path,
    json_out: bool,
) -> None:
    """Run the ClawBench v0.5 Configuration Diagnostic for a plugin profile."""
    from clawbench.diagnose_cli import (
        DEFAULT_SUBMISSIONS_DIR,
        ensure_data_dirs,
        load_manifests,
        write_submission_record,
    )
    from clawbench.diagnostic import build_diagnostic, submit_run
    from clawbench.insights import publish_insights
    from clawbench.prediction import HistoricalDatabase
    from clawbench.profile import PluginProfile
    from clawbench.schemas import BenchmarkResult

    ensure_data_dirs()

    plugin_profile = PluginProfile.from_yaml_file(profile)
    plugin_ids = [e.id for e in plugin_profile.plugins]
    manifest_map = load_manifests(manifests, plugin_ids)
    database = HistoricalDatabase(path=db)

    if results is not None:
        # Post-run mode: submit the actual scores and publish insights.
        with open(results, encoding="utf-8") as handle:
            br = BenchmarkResult(**json.load(handle))

        actual_per_task = {
            ts.task_id: float(ts.mean_task_score) for ts in br.task_results
        }
        tier_of = {
            ts.task_id: ts.tier for ts in br.task_results if getattr(ts, "tier", "")
        }
        report = submit_run(
            profile=plugin_profile,
            manifests=manifest_map,
            db=database,
            actual_overall_score=float(br.overall_score),
            actual_per_task_scores=actual_per_task,
            # Same normalisation as the post-benchmark path in ``run``:
            # an empty tier map is passed as None, not {}.
            tier_of=tier_of or None,
        )
        publish_insights(database, insights_dir, factor_report=report.factor_analysis)
    else:
        # No result file: build the diagnostic without actual scores.
        report = build_diagnostic(
            profile=plugin_profile,
            manifests=manifest_map,
            db=database,
            actual_overall_score=None,
            actual_per_task_scores=None,
            tier_of=None,
        )

    # The submission record is written in both modes.
    write_submission_record(
        DEFAULT_SUBMISSIONS_DIR, report.fingerprint_hash, report.to_dict()
    )

    if json_out:
        click.echo(json.dumps(report.to_dict(), indent=2, default=str))
    else:
        click.echo(report.render_text())
|
|
|
|
|
|
@cli.command()
@click.option("--release-id", required=True, help="Identifier for the hidden release snapshot")
@click.option("--tasks-dir", type=click.Path(exists=True), help="Optional source tasks directory")
@click.option("--tier", type=click.Choice(["tier1", "tier2", "tier3", "tier4", "tier5"]), help="Filter tier")
@click.option("--scenario", type=click.Choice(SCENARIO_CHOICES), help="Filter query scenario")
@click.option(
    "--artifact-type",
    type=click.Choice(
        ["file", "information", "operation", "code", "external_action", "memory", "automation", "mixed"]
    ),
    help="Filter expected artifact type",
)
@click.option(
    "--prompt-variant",
    type=click.Choice(["clear", "ambiguous"]),
    default="clear",
    show_default=True,
    help="Filter prompt variant support",
)
@click.option("--subset", multiple=True, type=click.Choice(["consensus", "hard"]), help="Filter task subset")
@click.option(
    "--capability",
    multiple=True,
    type=click.Choice(
        [
            "bugfix",
            "refactor",
            "test_authoring",
            "multifile_reasoning",
            "browser_debugging",
            "structured_output",
            "memory_continuation",
            "delegation",
            "tool_composition",
            "research_synthesis",
            "graceful_refusal",
            "spec_revision",
            "cross_repo_change",
            "automation",
        ]
    ),
    help="Filter by capability tag",
)
@click.option("--task", "-t", multiple=True, help="Specific source task IDs to include")
@click.option(
    "--max-tasks",
    type=int,
    default=0,
    show_default=True,
    help="Limit the snapshot to the first N matching tasks",
)
@click.option(
    "--private-tasks-dir",
    type=click.Path(path_type=Path),
    default=None,
    help="Override the private release root directory",
)
@click.option(
    "--active-release-path",
    type=click.Path(path_type=Path),
    default=None,
    help="Override where the active hidden-release manifest is written",
)
@click.option(
    "--activate/--no-activate",
    default=True,
    show_default=True,
    help="Set the new hidden release as active",
)
def build_release(
    release_id: str,
    tasks_dir: str | None,
    tier: str | None,
    scenario: str | None,
    artifact_type: str | None,
    prompt_variant: str,
    subset: tuple[str, ...],
    capability: tuple[str, ...],
    task: tuple[str, ...],
    max_tasks: int,
    private_tasks_dir: Path | None,
    active_release_path: Path | None,
    activate: bool,
) -> None:
    """Build a hidden release snapshot from the matching public tasks."""
    from clawbench.releases import build_hidden_release
    from clawbench.tasks import load_all_tasks

    # Source tasks always come from the public_dev pool; the remaining
    # filters narrow that set.
    tasks = load_all_tasks(
        tasks_dir=Path(tasks_dir) if tasks_dir else None,
        tier=tier,
        task_ids=list(task) if task else None,
        scenario=scenario,
        artifact_type=artifact_type,
        prompt_variant=prompt_variant,
        pool="public_dev",
        subsets=list(subset),
        capabilities=list(capability),
    )
    if not tasks:
        raise click.ClickException("No public tasks matched the requested filters.")
    # 0 (the default) or a negative value means "no limit".
    if max_tasks > 0:
        tasks = tasks[:max_tasks]

    manifest = build_hidden_release(
        tasks=tasks,
        release_id=release_id,
        private_tasks_root=private_tasks_dir,
        activate=activate,
        active_release_path=active_release_path,
    )
    click.echo(
        f"Built hidden release '{manifest.release_id}' with {len(manifest.task_ids)} task(s) at "
        f"{manifest.hidden_tasks_dir}"
    )
    click.echo(f"Snapshot fingerprint: {manifest.task_snapshot_fingerprint}")
    if activate:
        click.echo("Active hidden release manifest updated.")
|
|
|
|
|
|
@cli.command()
@click.option(
    "--input",
    "input_path",
    required=True,
    type=click.Path(exists=True, path_type=Path),
    help="JSON or JSONL file of raw trace records",
)
@click.option(
    "--source-kind",
    required=True,
    type=click.Choice(["hf_open_trace", "partner_trace", "internal_run", "synthetic"]),
    help="Origin of the traces being ingested",
)
@click.option(
    "--privacy-tier",
    default="public",
    show_default=True,
    type=click.Choice(["public", "private", "partner_restricted"]),
    help="Privacy level for the ingested traces",
)
@click.option("--partner-name", default="", help="Optional partner/source label")
@click.option(
    "--factory-root",
    type=click.Path(path_type=Path),
    default=None,
    help="Override the local task-factory registry root",
)
@click.option(
    "--emit-templates/--no-emit-templates",
    default=True,
    show_default=True,
    help="Also derive reusable task templates from the normalized seeds",
)
def ingest_traces(
    input_path: Path,
    source_kind: str,
    privacy_tier: str,
    partner_name: str,
    factory_root: Path | None,
    emit_templates: bool,
) -> None:
    """Ingest raw trace records into seeds and, optionally, task templates."""
    from clawbench.task_factory import ingest_trace_file

    traces, seeds, templates = ingest_trace_file(
        input_path=input_path,
        source_kind=source_kind,
        privacy_tier=privacy_tier,
        partner_name=partner_name,
        factory_root=factory_root,
        emit_templates=emit_templates,
    )

    # The template count is only reported when templates were requested;
    # ``templates`` is not inspected otherwise.
    summary = f"Ingested {len(traces)} trace(s) -> {len(seeds)} seed(s)"
    if emit_templates:
        summary += f" -> {len(templates)} template(s)"
    click.echo(summary)

    if seeds:
        first = seeds[0]
        click.echo(f"First seed: {first.seed_id} family={first.family} scenario={first.scenario}")
|
|
|
|
|
|
@cli.command()
@click.option(
    "--kind",
    default="seeds",
    show_default=True,
    type=click.Choice(["traces", "seeds", "templates"]),
    help="Registry slice to inspect",
)
@click.option(
    "--factory-root",
    type=click.Path(path_type=Path),
    default=None,
    help="Override the local task-factory registry root",
)
def list_factory(kind: str, factory_root: Path | None) -> None:
    """List the JSON files stored in one slice of the task-factory registry."""
    from clawbench.task_factory import ensure_task_factory_dirs

    dirs = ensure_task_factory_dirs(factory_root)
    files = sorted(dirs[kind].glob("*.json"))
    click.echo(f"{kind}: {len(files)} file(s)")

    # Only the first entries are printed; say so explicitly instead of
    # truncating silently.
    shown_limit = 50
    for path in files[:shown_limit]:
        click.echo(f" {path.name}")
    hidden = len(files) - shown_limit
    if hidden > 0:
        click.echo(f" ... {hidden} more not shown")
|
|
|
|
|
|
@cli.command()
@click.option(
    "--threshold",
    type=float,
    default=0.72,
    show_default=True,
    help="Similarity threshold for reporting findings",
)
@click.option(
    "--factory-root",
    type=click.Path(path_type=Path),
    default=None,
    help="Override the local task-factory registry root",
)
@click.option(
    "--include-public/--no-include-public",
    default=True,
    show_default=True,
    help="Compare templates against public tasks",
)
@click.option(
    "--include-hidden/--no-include-hidden",
    default=True,
    show_default=True,
    help="Compare templates against the active hidden release",
)
def audit_contamination(
    threshold: float,
    factory_root: Path | None,
    include_public: bool,
    include_hidden: bool,
) -> None:
    """Audit task-factory templates for similarity to existing tasks."""
    # Aliased because this command shares its name with the library function.
    from clawbench.task_factory import audit_contamination as run_audit

    report = run_audit(
        threshold=threshold,
        factory_root=factory_root,
        include_public_tasks=include_public,
        include_hidden_tasks=include_hidden,
    )
    click.echo(
        f"Audit complete: {len(report.findings)} finding(s) at threshold >= {report.threshold:.2f} "
        f"(templates={report.template_count}, public={report.public_task_count}, hidden={report.hidden_task_count})"
    )
    click.echo(f"Report: {report.report_path}")
    # Console output is a preview of the first ten findings only.
    for finding in report.findings[:10]:
        click.echo(
            f" {finding.score:.2f} {finding.left_kind}:{finding.left_id} ~ "
            f"{finding.right_kind}:{finding.right_id}"
        )
|
|
|
|
|
|
@cli.command()
@click.option("--release-id", required=True, help="Identifier for the hidden release built from templates")
@click.option("--template-id", multiple=True, help="Specific template IDs to promote")
@click.option(
    "--max-templates",
    type=int,
    default=0,
    show_default=True,
    help="Limit promotion to the first N matching templates",
)
@click.option(
    "--factory-root",
    type=click.Path(path_type=Path),
    default=None,
    help="Override the local task-factory registry root",
)
@click.option(
    "--private-tasks-dir",
    type=click.Path(path_type=Path),
    default=None,
    help="Override the private release root directory",
)
@click.option(
    "--active-release-path",
    type=click.Path(path_type=Path),
    default=None,
    help="Override where the active hidden-release manifest is written",
)
@click.option(
    "--activate/--no-activate",
    default=True,
    show_default=True,
    help="Set the new hidden release as active",
)
def promote_templates(
    release_id: str,
    template_id: tuple[str, ...],
    max_templates: int,
    factory_root: Path | None,
    private_tasks_dir: Path | None,
    active_release_path: Path | None,
    activate: bool,
) -> None:
    """Promote task-factory templates into a hidden release."""
    from clawbench.task_factory import build_hidden_release_from_templates

    manifest, tasks = build_hidden_release_from_templates(
        release_id=release_id,
        # No --template-id given means "no explicit ID filter".
        template_ids=list(template_id) if template_id else None,
        max_templates=max_templates,
        factory_root=factory_root,
        private_tasks_root=private_tasks_dir,
        active_release_path=active_release_path,
        activate=activate,
    )
    click.echo(
        f"Promoted {len(tasks)} template-derived task(s) into hidden release '{manifest.release_id}' at "
        f"{manifest.hidden_tasks_dir}"
    )
    click.echo(f"Snapshot fingerprint: {manifest.task_snapshot_fingerprint}")
    if tasks:
        first = tasks[0]
        click.echo(f"First promoted task: {first.id} template={first.template_id}")
    if activate:
        click.echo("Active hidden release manifest updated.")
|
|
|
|
|
|
@cli.command()
@click.option("--tasks-dir", type=click.Path(exists=True), help="Custom tasks directory")
@click.option("--scenario", type=click.Choice(SCENARIO_CHOICES), help="Filter query scenario")
@click.option("--prompt-variant", type=click.Choice(["clear", "ambiguous"]), help="Filter prompt variant support")
@click.option("--pool", type=click.Choice(["public_dev", "official_hidden"]), help="Filter task pool")
@click.option("--subset", multiple=True, type=click.Choice(["consensus", "hard"]), help="Filter task subset")
def list_tasks(
    tasks_dir: str | None,
    scenario: str | None,
    prompt_variant: str | None,
    pool: str | None,
    subset: tuple[str, ...],
) -> None:
    """List tasks with their tier, scenario, prompt variants, pool, and family."""
    from clawbench.tasks import load_all_tasks

    tasks = load_all_tasks(
        tasks_dir=Path(tasks_dir) if tasks_dir else None,
        scenario=scenario,
        prompt_variant=prompt_variant,
        pool=pool,
        subsets=list(subset),
    )
    click.echo(f"\n{'ID':<34} {'Tier':<7} {'Scene':<24} {'Prompt':<10} {'Pool':<15} {'Family':<12}")
    click.echo("-" * 116)
    for task in tasks:
        scene = task.scenario.value if task.scenario else "-"
        variants = "/".join(variant.value for variant in task.prompt_variants)
        # Two-space indent + 32-wide id == the 34-wide "ID" header column,
        # so every following column lines up with its header.
        click.echo(
            f"  {task.id:<32} {task.tier.value:<7} "
            f"{scene:<24} "
            f"{variants:<10} "
            f"{task.pool.value:<15} {task.family.value:<12}"
        )
|
|
|
|
|
|
@cli.command()
@click.argument("result_file", type=click.Path(exists=True))
def show(result_file: str) -> None:
    """Print a readable summary of a saved benchmark result JSON file."""
    from rich.console import Console
    from clawbench.schemas import BenchmarkResult

    with open(result_file, encoding="utf-8") as handle:
        data = json.load(handle)
    result = BenchmarkResult(**data)

    console = Console()

    # --- Overall summary -------------------------------------------------
    console.print(f"\n[bold]Model:[/] {result.model}")
    console.print(
        f"[bold]Score:[/] {result.overall_score:.3f} "
        f"(CI: {result.overall_ci_lower:.3f}-{result.overall_ci_upper:.3f})"
    )
    console.print(
        f" [green]Completion: {result.overall_completion:.3f}[/] "
        f"[blue]Trajectory: {result.overall_trajectory:.3f}[/] "
        f"[yellow]Behavior: {result.overall_behavior:.3f}[/] "
        f"[magenta]Reliability: {result.overall_reliability:.3f}[/]"
    )
    # Judge metrics are only shown when a judge model was recorded.
    if result.judge_model:
        console.print(
            f" [magenta]Judge: {result.overall_judge_score:.3f}[/] "
            f"Confidence: {result.overall_judge_confidence:.3f} "
            f"Pass rate: {result.overall_judge_pass_rate:.0%} "
            f"Coverage: {result.judge_task_coverage:.0%}"
        )
    console.print(
        f" Weighted query: {result.overall_weighted_query_score:.3f} "
        f"Clear prompt: {result.clear_prompt_score:.3f} "
        f"Ambiguous prompt: {result.ambiguous_prompt_score:.3f}"
    )
    console.print(
        f" Latency p50={result.overall_median_latency_ms:.0f}ms "
        f"p95={result.overall_p95_latency_ms:.0f}ms "
        f"Tokens/pass={result.overall_tokens_per_pass:.0f} "
        f"Cost/pass=${result.overall_cost_per_pass:.4f}"
    )
    console.print(
        f" Hard subset: {result.hard_subset_score:.3f} "
        f"Consensus subset: {result.consensus_subset_score:.3f}"
    )
    console.print(f" [bold]pass^k reliability: {result.overall_pass_hat_k:.0%}[/]\n")

    # --- One line per task ------------------------------------------------
    for task in result.task_results:
        # Colour bands: >= 0.7 green, >= 0.4 yellow, otherwise red.
        if task.mean_task_score >= 0.7:
            color = "green"
        elif task.mean_task_score >= 0.4:
            color = "yellow"
        else:
            color = "red"

        # Most frequent failure mode, or "-" when none were recorded.
        failures = task.failure_mode_counts
        top_failure = max(failures, key=failures.get) if failures else "-"
        judge_value = f"{task.mean_judge_score:.2f}" if task.judged_runs > 0 else "-"

        console.print(
            f" [{color}]{task.mean_task_score:.3f}[/] {task.task_id} "
            f"scene={task.scenario or '-'} prompt={task.prompt_variant} "
            f"run={task.mean_run_score:.2f} comp={task.mean_completion_score:.2f} "
            f"traj={task.mean_trajectory_score:.2f} beh={task.mean_behavior_score:.2f} "
            f"judge={judge_value} "
            f"rel={task.reliability_score:.2f} delivery={task.delivery_outcome_counts} "
            f"tok/pass={task.tokens_per_pass:.0f} p50={task.median_duration_ms:.0f}ms fail={top_failure}"
        )
|
|
|
|
|
|
def _run_dynamics_analysis(
    task_runs: dict[str, list] | None,
    result_path: str,
) -> None:
    """Compute stratified dynamics from raw TaskRunResult objects.

    The report and plots go to a ``<result stem>_dynamics`` directory next
    to the result file. A ``ValueError`` from the report writer is printed
    instead of raised, so it does not abort the surrounding ``run`` command.

    Args:
        task_runs: Raw runs keyed by task id; ``None`` skips the analysis.
        result_path: Path of the saved benchmark result JSON.
    """
    # The same harness attribute is treated as optional by the v0.5
    # diagnostic path, so guard against None here as well instead of
    # failing inside the report writer.
    if task_runs is None:
        click.echo("[dynamics] No task runs were recorded; skipping dynamics analysis.")
        return

    result_file = Path(result_path)
    dyn_dir = result_file.parent / f"{result_file.stem}_dynamics"
    try:
        dyn_path, plots, n_runs = _write_dynamics_report(task_runs, dyn_dir)
    except ValueError as exc:
        click.echo(str(exc))
        return

    click.echo(f"\n[dynamics] Analysed {n_runs} cached runs")
    click.echo(f" Dynamics report saved to {dyn_path}")
    click.echo(f" Saved {len(plots)} plots to {dyn_dir}/")
|
|
|
|
|
|
def main() -> None:
    """Entry point wrapper that invokes the ``cli`` Click group."""
    cli()
|