569 lines
18 KiB
Python
569 lines
18 KiB
Python
"""Unit tests for the v0.5 extensions shipped in this pass:

- Taguchi S/N robustness profile
- Plugin Utilization Audit
- Manifest-vs-Reality Gap
- Calibration tracking in HistoricalDatabase
- Recommendations generator
- Surprise cause attribution
- Insights publishing
- End-to-end diagnostic with all sections populated

These tests run in isolation from the larger harness; they build small
synthetic fixtures and exercise the pure-function paths only.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import math
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
|
|
|
|
from clawbench.diagnostic import build_diagnostic, submit_run
|
|
from clawbench.diagnose_cli import infer_registration_traces_from_manifests
|
|
from clawbench.factor_analysis import analyze
|
|
from clawbench.insights import (
|
|
compute_capability_gaps,
|
|
compute_plugin_leaderboard,
|
|
publish_insights,
|
|
)
|
|
from clawbench.prediction import (
|
|
HistoricalDatabase,
|
|
HistoricalRun,
|
|
attribute_surprise,
|
|
)
|
|
from clawbench.profile import (
|
|
PluginManifest,
|
|
PluginProfile,
|
|
PluginProfileEntry,
|
|
ProfileFingerprint,
|
|
RegistrationTrace,
|
|
)
|
|
from clawbench.recommendations import generate_recommendations
|
|
from clawbench.schemas import ToolCall, Transcript, TranscriptMessage
|
|
from clawbench.stats import (
|
|
compute_robustness_profile,
|
|
taguchi_sn_larger_is_better,
|
|
)
|
|
from clawbench.utilization import (
|
|
audit_plugin_utilization,
|
|
compute_manifest_reality_gap,
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _make_profile(
    name: str, plugin_ids: list[str], *, base_model: str = "claude-sonnet-4"
) -> PluginProfile:
    """Assemble a PluginProfile in which every plugin is a bundled-source entry.

    The ``memory`` slot is mapped to ``"memory-lancedb"`` only when that
    plugin id appears in *plugin_ids*; otherwise the slot mapping is empty.
    """
    bundled_entries = []
    for plugin_id in plugin_ids:
        bundled_entries.append(PluginProfileEntry(id=plugin_id, source="bundled"))

    slot_map = {}
    if "memory-lancedb" in plugin_ids:
        slot_map["memory"] = "memory-lancedb"

    return PluginProfile(
        name=name,
        base_model=base_model,
        plugins=bundled_entries,
        slots=slot_map,
    )
|
|
|
|
|
|
def _make_manifest(
    pid: str,
    *,
    tools: list[str] | None = None,
    kind: list[str] | None = None,
) -> PluginManifest:
    """Create a PluginManifest for *pid* with optional tool and kind lists.

    A missing (or empty) *tools* / *kind* argument is replaced by a fresh
    empty list; the tools are placed under the ``"tools"`` contracts key.
    """
    declared_kinds = kind if kind else []
    declared_tools = tools if tools else []
    return PluginManifest(
        id=pid,
        kind=declared_kinds,
        contracts={"tools": declared_tools},
    )
|
|
|
|
|
|
def _make_transcript(tool_calls: list[tuple[str, str]]) -> Transcript:
    """Wrap ``(tool_name, family)`` pairs in a one-message Transcript.

    Each pair becomes a successful ToolCall whose output is ``"ok"``; all
    calls are attached to a single assistant message.
    """
    recorded_calls = []
    for tool_name, tool_family in tool_calls:
        recorded_calls.append(
            ToolCall(name=tool_name, family=tool_family, output="ok", success=True)
        )
    assistant_turn = TranscriptMessage(role="assistant", tool_calls=recorded_calls)
    return Transcript(messages=[assistant_turn])
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Taguchi S/N + robustness profile
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_taguchi_sn_penalizes_worst_case_harder_than_mean():
    """S/N must rank a steady configuration above a higher-mean one that
    crashes on a single task."""
    steady_scores = [0.68, 0.70, 0.68, 0.72, 0.70]
    crashy_scores = [0.95, 0.97, 0.96, 0.94, 0.05]

    # Precondition: on the plain mean the crashy configuration looks better.
    steady_mean = sum(steady_scores) / len(steady_scores)
    crashy_mean = sum(crashy_scores) / len(crashy_scores)
    assert crashy_mean > steady_mean

    steady_sn = taguchi_sn_larger_is_better(steady_scores)
    crashy_sn = taguchi_sn_larger_is_better(crashy_scores)
    # The ranking should flip under S/N: the single 0.05 crash pulls
    # -10*log10(mean_inv_square) below the steady ~0.70 baseline.
    assert steady_sn > crashy_sn
|
|
|
|
|
|
def test_compute_robustness_profile_fills_tier_means():
    """The robustness profile reports task count, extremes and per-tier means."""
    task_scores = {"t1-a": 0.9, "t1-b": 0.8, "t2-a": 0.5, "t3-a": 0.3}
    task_tiers = {"t1-a": "tier1", "t1-b": "tier1", "t2-a": "tier2", "t3-a": "tier3"}

    robustness = compute_robustness_profile(task_scores, tier_of=task_tiers)

    assert robustness.n_tasks == 4
    assert robustness.worst_of_n == 0.3
    assert robustness.best_of_n == 0.9
    # tier1 averages 0.9 and 0.8; tier2 has a single task.
    for tier_name, expected_mean in (("tier1", 0.85), ("tier2", 0.5)):
        assert abs(robustness.tier_means[tier_name] - expected_mean) < 1e-6
|
|
|
|
|
|
def test_taguchi_sn_handles_zero_score_without_crashing():
    """A zero score must yield a finite S/N value instead of raising."""
    # Without a floor, the 0.0 entry would mean dividing by zero.
    scores_with_zero = [0.0, 0.5, 1.0]
    assert math.isfinite(taguchi_sn_larger_is_better(scores_with_zero))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Plugin Utilization Audit
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_infer_registration_traces_from_manifests_uses_declared_tools():
    """Traces are inferred only for plugins that have a manifest.

    The profile lists "alpha" and "missing", but only "alpha" has a manifest,
    so it is the only plugin expected in the inferred traces.
    """
    declared_tools = ["read_file", "browser_click", "memory_write"]
    plugin_profile = _make_profile("p", ["alpha", "missing"])
    plugin_manifests = {"alpha": _make_manifest("alpha", tools=declared_tools)}

    inferred = infer_registration_traces_from_manifests(plugin_profile, plugin_manifests)

    assert set(inferred) == {"alpha"}
    alpha_trace = inferred["alpha"]
    assert alpha_trace.tools == ["read_file", "browser_click", "memory_write"]
    assert alpha_trace.tool_families_seen == ["browser", "memory", "read"]
|
|
|
|
|
|
def test_audit_flags_dead_weight_plugin():
    """A plugin whose tool never appears in any transcript is dead weight.

    Both plugins register one tool each, but only alpha's tool is called
    (once in each of the two tasks), so beta must be reported as unused.
    """
    plugin_profile = _make_profile("p", ["alpha", "beta"])
    plugin_manifests = {
        "alpha": _make_manifest("alpha", tools=["alpha_tool"]),
        "beta": _make_manifest("beta", tools=["beta_tool"]),
    }
    alpha_trace = RegistrationTrace(
        plugin_id="alpha", tools=["alpha_tool"], tool_families_seen=["read"]
    )
    beta_trace = RegistrationTrace(
        plugin_id="beta", tools=["beta_tool"], tool_families_seen=["edit"]
    )
    registration_traces = {"alpha": alpha_trace, "beta": beta_trace}
    task_transcripts = {
        "task-1": _make_transcript([("alpha_tool", "read")]),
        "task-2": _make_transcript([("alpha_tool", "read")]),
    }

    audit = audit_plugin_utilization(
        profile=plugin_profile,
        transcripts=task_transcripts,
        manifests=plugin_manifests,
        traces=registration_traces,
    )

    assert audit.n_plugins == 2
    assert audit.n_invoked == 1
    assert audit.n_dead_weight == 1

    def _row_for(plugin_id):
        # First per-plugin entry matching the id.
        return next(row for row in audit.per_plugin if row.plugin_id == plugin_id)

    alpha_row = _row_for("alpha")
    beta_row = _row_for("beta")
    assert alpha_row.invocation_count == 2
    assert not alpha_row.dead_weight
    assert beta_row.invocation_count == 0
    assert beta_row.dead_weight
|
|
|
|
|
|
def test_audit_family_fallback_when_trace_missing_tool_name():
    """With no registered tool names, a call still counts via its family.

    The trace lists no tools but has seen the "search" family; the single
    transcript call uses an unknown tool name in that same family.
    """
    plugin_profile = _make_profile("p", ["alpha"])
    plugin_manifests = {"alpha": _make_manifest("alpha")}
    nameless_trace = RegistrationTrace(
        plugin_id="alpha",
        tools=[],  # deliberately empty so only the family can match
        tool_families_seen=["search"],
    )
    task_transcripts = {"t": _make_transcript([("unknown_tool", "search")])}

    audit = audit_plugin_utilization(
        profile=plugin_profile,
        transcripts=task_transcripts,
        manifests=plugin_manifests,
        traces={"alpha": nameless_trace},
    )

    alpha_row = next(row for row in audit.per_plugin if row.plugin_id == "alpha")
    assert alpha_row.invoked
    assert alpha_row.invocation_count == 1
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Manifest-vs-Reality gap
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_manifest_reality_gap_flags_unused_claims():
    """A claimed-but-unobserved capability is unused; the reverse is unclaimed.

    The manifest claims "memory" while the only observed family is "read",
    so claim coverage is expected to be zero.
    """
    plugin_profile = _make_profile("p", ["alpha"])
    plugin_manifests = {
        "alpha": PluginManifest(
            id="alpha",
            kind=["memory"],  # the manifest claims the memory family
            contracts={"tools": ["alpha_tool"]},
        )
    }
    registration_traces = {
        "alpha": RegistrationTrace(
            plugin_id="alpha",
            tools=["alpha_tool"],
            tool_families_seen=["read"],  # observed at runtime, never claimed
        )
    }
    task_transcripts = {"t": _make_transcript([("alpha_tool", "read")])}

    utilization_report = audit_plugin_utilization(
        profile=plugin_profile,
        transcripts=task_transcripts,
        manifests=plugin_manifests,
        traces=registration_traces,
    )
    gap_report = compute_manifest_reality_gap(
        profile=plugin_profile,
        manifests=plugin_manifests,
        utilization=utilization_report,
    )

    assert len(gap_report.per_plugin) == 1
    alpha_gap = gap_report.per_plugin[0]
    assert "memory" in alpha_gap.claimed_capabilities
    assert "read" in alpha_gap.observed_capabilities
    assert "memory" in alpha_gap.unused_capabilities
    assert "read" in alpha_gap.unclaimed_capabilities
    assert alpha_gap.claim_coverage == 0.0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Calibration tracking
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_calibration_metrics_accumulate_across_runs(tmp_path):
    """Two runs added with predictions both feed the calibration metrics."""
    database = HistoricalDatabase(path=tmp_path / "runs.json")
    shared_fingerprint_fields = {
        "base_model": "m",
        "capability_coverage": [],
        "hook_footprint": [],
        "tool_family_surface": [],
        "capability_tags_union": [],
        "memory_slot": "",
        "context_engine_slot": "",
        "n_plugins": 0,
        "n_clawhub_plugins": 0,
        "n_custom_plugins": 0,
        "n_official_plugins": 0,
        "n_tools_total": 0,
        "n_hooks_total": 0,
        "plugin_ids": [],
        "tools_allow": [],
        "fingerprint_hash": "aaa",
    }

    # (profile name, actual score, score predicted at submission)
    submissions = (("A", 0.80, 0.75), ("B", 0.60, 0.65))
    for label, actual, predicted in submissions:
        database.add(
            HistoricalRun(
                profile_name=label,
                fingerprint=ProfileFingerprint(
                    profile_name=label, **shared_fingerprint_fields
                ),
                overall_score=actual,
                predicted_score_at_submission=predicted,
            )
        )

    metrics = database.calibration_metrics()
    assert metrics["n"] == 2
    # |0.80-0.75| + |0.60-0.65| = 0.10, so MAE = 0.05
    assert abs(metrics["mae"] - 0.05) < 1e-4
    # Two runs are too few for the target to count as "met"
    assert metrics["mae_target_met"] is False
|
|
|
|
|
|
def test_calibration_metrics_handle_runs_without_prediction():
    """A run stored without a prediction yields n == 0 and an MAE of 0.0."""
    database = HistoricalDatabase()
    fingerprint_fields = {
        "base_model": "m",
        "capability_coverage": [],
        "hook_footprint": [],
        "tool_family_surface": [],
        "capability_tags_union": [],
        "memory_slot": "",
        "context_engine_slot": "",
        "n_plugins": 0,
        "n_clawhub_plugins": 0,
        "n_custom_plugins": 0,
        "n_official_plugins": 0,
        "n_tools_total": 0,
        "n_hooks_total": 0,
        "plugin_ids": [],
        "tools_allow": [],
        "fingerprint_hash": "aaa",
    }
    legacy_run = HistoricalRun(
        profile_name="legacy",
        fingerprint=ProfileFingerprint(profile_name="legacy", **fingerprint_fields),
        overall_score=0.5,
        predicted_score_at_submission=None,
    )
    # Appended straight onto the run list, exactly as the fixture requires.
    database.runs.append(legacy_run)

    metrics = database.calibration_metrics()
    assert metrics["n"] == 0
    assert metrics["mae"] == 0.0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Recommendations + surprise attribution + insights
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _fp(name: str, **kwargs) -> ProfileFingerprint:
    """Build a ProfileFingerprint from blank defaults plus keyword overrides.

    *name* is used for both ``profile_name`` and ``fingerprint_hash``; any
    field passed in *kwargs* replaces the corresponding default.
    """
    baseline = {
        "profile_name": name,
        "base_model": "claude-sonnet-4",
        "capability_coverage": [],
        "hook_footprint": [],
        "tool_family_surface": [],
        "capability_tags_union": [],
        "memory_slot": "",
        "context_engine_slot": "",
        "n_plugins": 0,
        "n_clawhub_plugins": 0,
        "n_custom_plugins": 0,
        "n_official_plugins": 0,
        "n_tools_total": 0,
        "n_hooks_total": 0,
        "plugin_ids": [],
        "tools_allow": [],
        "fingerprint_hash": name,
    }
    # Later keys win, so caller-supplied values override the baseline.
    return ProfileFingerprint(**{**baseline, **kwargs})
|
|
|
|
|
|
def _seed_database() -> HistoricalDatabase:
    """Return a six-run database in which the 'magic' plugin tracks high scores.

    Runs that include 'magic' score 0.85-0.90; runs without it score
    0.55-0.62. Every run records its score under the single task "task-A".
    """
    database = HistoricalDatabase()
    seeded_runs = (
        (["alpha", "magic"], 0.90),
        (["alpha", "magic"], 0.85),
        (["alpha", "magic", "beta"], 0.88),
        (["alpha"], 0.60),
        (["alpha"], 0.55),
        (["alpha", "beta"], 0.62),
    )
    for index, (plugins, score) in enumerate(seeded_runs):
        label = f"run-{index}"
        fingerprint = _fp(
            label,
            plugin_ids=sorted(plugins),
            capability_coverage=sorted(set(plugins)),
            n_plugins=len(plugins),
        )
        database.runs.append(
            HistoricalRun(
                profile_name=label,
                fingerprint=fingerprint,
                overall_score=score,
                per_task_score={"task-A": score},
            )
        )
    return database
|
|
|
|
|
|
def test_generate_recommendations_suggests_adding_strong_plugin():
    """A candidate with only 'alpha' should be advised to add 'magic'."""
    database = _seed_database()
    candidate = _fp(
        "candidate",
        plugin_ids=["alpha"],
        capability_coverage=["alpha"],
        n_plugins=1,
    )

    result = generate_recommendations(
        fingerprint=candidate,
        db=database,
        factor=analyze(database),
        utilization=None,
    )

    assert result.recommendations, "expected recommendations"
    suggested_additions = set()
    for rec in result.recommendations:
        if rec.kind == "add_plugin":
            suggested_additions.add(rec.target)
    assert "magic" in suggested_additions
|
|
|
|
|
|
def test_generate_recommendations_empty_when_db_too_small():
    """An empty database gives no recommendations, only an explanatory note."""
    empty_database = HistoricalDatabase()
    candidate = _fp("candidate")

    result = generate_recommendations(
        fingerprint=candidate,
        db=empty_database,
        factor=None,
        utilization=None,
    )

    assert result.recommendations == []
    assert "historical database" in result.note
|
|
|
|
|
|
def test_attribute_surprise_names_missing_capability_on_negative_delta():
    """A negative surprise on task-A should be attributed to lacking 'magic'."""
    database = _seed_database()
    candidate = _fp(
        "candidate",
        plugin_ids=["alpha"],
        capability_coverage=["alpha"],
    )

    explanation = attribute_surprise(candidate, "task-A", -0.3, database)

    # Every high-scoring seeded run has 'magic' and the candidate does not,
    # so the explanation is expected to mention it.
    assert "magic" in explanation
|
|
|
|
|
|
def test_plugin_leaderboard_ranks_by_delta():
    """The first leaderboard entry is 'magic', with a positive impact delta."""
    ranking = compute_plugin_leaderboard(_seed_database(), min_sample=2)
    assert ranking

    # 'magic' is expected in first place with the largest positive delta.
    leader = ranking[0]
    assert leader.plugin_id == "magic"
    assert leader.impact_delta > 0
|
|
|
|
|
|
def test_capability_gaps_detects_missing_threshold(tmp_path):
    """With threshold 0.7, 'hard-task' is reported as a gap and 'easy-task' is not.

    Both runs score 'hard-task' below the threshold (0.3, 0.4) and
    'easy-task' above it (0.9, 0.95).
    """
    database = HistoricalDatabase()
    # (profile name, overall score, per-task scores)
    observed_runs = (
        ("A", 0.4, {"hard-task": 0.3, "easy-task": 0.9}),
        ("B", 0.5, {"hard-task": 0.4, "easy-task": 0.95}),
    )
    for label, overall, task_scores in observed_runs:
        database.runs.append(
            HistoricalRun(
                profile_name=label,
                fingerprint=_fp(label),
                overall_score=overall,
                per_task_score=task_scores,
            )
        )

    reported_gaps = compute_capability_gaps(database, threshold=0.7)
    flagged = {gap.capability for gap in reported_gaps}
    assert "hard-task" in flagged
    assert "easy-task" not in flagged
|
|
|
|
|
|
def test_publish_insights_writes_all_files(tmp_path):
    """Every expected artifact is returned and exists; the summary is valid JSON."""
    database = _seed_database()
    written = publish_insights(database, tmp_path)

    expected_artifacts = (
        "plugin_leaderboard",
        "factor_importance",
        "interactions",
        "gaps",
        "calibration",
        "summary",
    )
    for name in expected_artifacts:
        assert name in written, name
        assert written[name].exists()

    # The summary file must parse as JSON and report the seeded run count.
    summary_path = tmp_path / "summary.json"
    summary = json.loads(summary_path.read_text())
    assert summary["n_runs"] == len(database)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# End-to-end diagnostic with every new section
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_build_diagnostic_populates_v05_sections(tmp_path):
    """build_diagnostic fills every v0.5 section and the report renders as text."""
    database = _seed_database()
    # Add a few more runs so the factor analysis activates.
    strong_plugins = sorted(["alpha", "magic"])
    for index in range(3):
        label = f"extra-{index}"
        database.runs.append(
            HistoricalRun(
                profile_name=label,
                fingerprint=_fp(
                    label,
                    plugin_ids=sorted(strong_plugins),
                    capability_coverage=sorted(strong_plugins),
                    n_plugins=2,
                ),
                overall_score=0.9,
                per_task_score={"task-A": 0.9},
            )
        )

    candidate_profile = _make_profile("candidate", ["alpha"])
    plugin_manifests = {"alpha": _make_manifest("alpha", tools=["alpha_tool"])}
    alpha_trace = RegistrationTrace(
        plugin_id="alpha",
        tools=["alpha_tool"],
        tool_families_seen=["read"],
    )
    task_transcripts = {"task-A": _make_transcript([("alpha_tool", "read")])}

    report = build_diagnostic(
        profile=candidate_profile,
        manifests=plugin_manifests,
        db=database,
        actual_overall_score=0.55,
        actual_per_task_scores={"task-A": 0.55},
        transcripts=task_transcripts,
        traces={"alpha": alpha_trace},
        tier_of={"task-A": "tier1"},
    )

    # Every v0.5 section must be populated.
    assert report.calibration_error is not None
    assert report.robustness_profile is not None
    assert report.utilization is not None
    assert report.manifest_reality is not None
    assert report.recommendations is not None
    assert report.fingerprint_hash
    assert report.robustness_profile.n_tasks == 1
    assert report.utilization.n_plugins == 1

    rendered = report.render_text()
    for section_title in ("Robustness Profile", "Plugin Utilization Audit"):
        assert section_title in rendered
    # The recommendations list may be empty, so only check that the report
    # renders and carries its top-level heading.
    assert "Configuration Diagnostic" in rendered
|
|
|
|
|
|
def test_submit_run_records_prediction_for_calibration_tracking(tmp_path):
    """submit_run appends the new run together with its prediction snapshot."""
    database = HistoricalDatabase(path=tmp_path / "runs.json")
    # Five identical seed runs, each already carrying a recorded prediction.
    for index in range(5):
        label = f"seed-{index}"
        seed_fingerprint = _fp(
            label,
            plugin_ids=["alpha"],
            capability_coverage=["alpha"],
            n_plugins=1,
        )
        database.runs.append(
            HistoricalRun(
                profile_name=label,
                fingerprint=seed_fingerprint,
                overall_score=0.7,
                per_task_score={"t": 0.7},
                predicted_score_at_submission=0.65,
            )
        )

    new_profile = _make_profile("new", ["alpha"])
    plugin_manifests = {"alpha": _make_manifest("alpha", tools=["tool"])}

    report = submit_run(
        profile=new_profile,
        manifests=plugin_manifests,
        db=database,
        actual_overall_score=0.72,
        actual_per_task_scores={"t": 0.72},
    )

    # The newest entry must be the submitted run, with its prediction stored.
    newest_run = database.runs[-1]
    assert newest_run.profile_name == "new"
    assert newest_run.predicted_score_at_submission is not None
    assert report.calibration_error is not None
|