Compare commits
5 Commits
main
...
fix/gate-j
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2670dcadf0 | ||
|
|
fbb13ac4d9 | ||
|
|
453ddc0ca5 | ||
|
|
d7a2e50ea3 | ||
|
|
2b9c277512 |
@ -14,6 +14,7 @@
|
||||
# CLAWBENCH_RUN_CACHE_DIR=.clawbench/run_cache
|
||||
# CLAWBENCH_CONCURRENCY=1
|
||||
# CLAWBENCH_JUDGE_MODEL=anthropic/claude-sonnet-4-6
|
||||
# CLAWBENCH_JUDGE_AFFECTS_SCORE=0
|
||||
|
||||
# Provider credentials for live model runs.
|
||||
# ANTHROPIC_API_KEY=
|
||||
|
||||
@ -71,9 +71,9 @@ Every agent run produces a full execution trace: every tool call, every file rea
|
||||
| **Completion** | 40% | Did the work actually get done? | Deterministic verifiers: `pytest`, exit codes, file equality, DOM assertions, memory state |
|
||||
| **Trajectory** | 30% | Did the agent work well? | Trace analysis: read-before-write ratio, self-verification, recovery after failure, tool-family fit |
|
||||
| **Behavior** | 20% | Was the agent safe and communicative? | Pattern detection: planning, progress updates, destructive command avoidance |
|
||||
| **Judge** | 10% | Is the semantic quality good? | LLM evaluation (gated — only contributes when deterministic completion is already near-perfect) |
|
||||
| **Judge** | Advisory | Is the semantic quality good? | LLM evaluation sidecar; opt-in experimental judge-weighted scoring is gated |
|
||||
|
||||
**The key invariant**: the LLM judge can never rescue a failed deterministic check. If `pytest` fails, the judge score is zeroed. This is enforced in code and tested. You can't game ClawBench by producing output that *looks* correct to an LLM but doesn't actually work.
|
||||
**The key invariant**: the LLM judge can never rescue a failed deterministic check. Official scoring keeps judge results as a sidecar signal. Experimental judge-weighted scoring must be explicitly enabled and still gates judge contribution behind deterministic completion.
|
||||
|
||||
### 2. We measure reliability AND quantify noise
|
||||
|
||||
|
||||
3
app.py
3
app.py
@ -76,6 +76,7 @@ DEFAULT_RUNS_PER_TASK = _env_int("CLAWBENCH_DEFAULT_RUNS_PER_TASK", 3, minimum=1
|
||||
DEFAULT_PARALLEL_LANES = _env_int("CLAWBENCH_DEFAULT_PARALLEL_LANES", 1, minimum=1, maximum=MAX_LANES_PER_SUBMISSION)
|
||||
LEADERBOARD_CACHE_SECONDS = _env_int("CLAWBENCH_LEADERBOARD_CACHE_SECONDS", 60, minimum=0, maximum=3600)
|
||||
ENABLE_BULK_SUBMIT = os.environ.get("CLAWBENCH_ENABLE_BULK_SUBMIT", "").strip().lower() in {"1", "true", "yes", "on"}
|
||||
JUDGE_AFFECTS_SCORE = os.environ.get("CLAWBENCH_JUDGE_AFFECTS_SCORE", "").strip().lower() in {"1", "true", "yes", "on"}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Background worker (starts in a thread)
|
||||
@ -291,6 +292,7 @@ def submit_model(
|
||||
model=model_id,
|
||||
provider=provider_id,
|
||||
judge_model=judge_model.strip(),
|
||||
judge_affects_score=JUDGE_AFFECTS_SCORE,
|
||||
runs_per_task=int(runs),
|
||||
max_parallel_lanes=int(max_parallel_lanes),
|
||||
tier=selected_tier,
|
||||
@ -340,6 +342,7 @@ def submit_all_presets(
|
||||
submitted = []
|
||||
blocked = []
|
||||
for preset, request_kwargs in preset_specs:
|
||||
request_kwargs["judge_affects_score"] = JUDGE_AFFECTS_SCORE
|
||||
request = SubmissionRequest(**request_kwargs)
|
||||
try:
|
||||
job = asyncio.run(queue.submit(request))
|
||||
|
||||
@ -43,6 +43,12 @@ def cli(verbose: bool) -> None:
|
||||
default="",
|
||||
help="Optional advisory LLM judge model (does not affect official score)",
|
||||
)
|
||||
@click.option(
|
||||
"--judge-affects-score",
|
||||
is_flag=True,
|
||||
envvar="CLAWBENCH_JUDGE_AFFECTS_SCORE",
|
||||
help="Opt in to experimental judge-weighted scoring. Official scoring keeps judge advisory.",
|
||||
)
|
||||
@click.option("--runs", "-n", default=3, show_default=True, help="Runs per task (reliability uses all runs)")
|
||||
@click.option("--tier", type=click.Choice(["tier1", "tier2", "tier3", "tier4", "tier5"]), help="Filter tier")
|
||||
@click.option("--scenario", type=click.Choice(SCENARIO_CHOICES), help="Filter query scenario")
|
||||
@ -121,6 +127,7 @@ def run(
|
||||
adapter: str,
|
||||
gateway_token: str,
|
||||
judge_model: str,
|
||||
judge_affects_score: bool,
|
||||
runs: int,
|
||||
tier: str | None,
|
||||
scenario: str | None,
|
||||
@ -146,6 +153,7 @@ def run(
|
||||
model=model,
|
||||
adapter=adapter,
|
||||
judge_model=judge_model,
|
||||
judge_affects_score=judge_affects_score,
|
||||
runs_per_task=runs,
|
||||
tier=tier,
|
||||
scenario=scenario,
|
||||
|
||||
@ -85,6 +85,7 @@ class BenchmarkHarness:
|
||||
concurrency: int = 1,
|
||||
browser_concurrency: int = 1,
|
||||
adapter: str = "openclaw",
|
||||
judge_affects_score: bool = False,
|
||||
) -> None:
|
||||
self.gateway_config = gateway_config
|
||||
self.model = model
|
||||
@ -96,6 +97,7 @@ class BenchmarkHarness:
|
||||
self.artifact_type = artifact_type
|
||||
self.prompt_variant = prompt_variant
|
||||
self.judge_model = judge_model
|
||||
self.judge_affects_score = judge_affects_score
|
||||
self.pool = pool
|
||||
self.subsets = subsets or []
|
||||
self.capabilities = capabilities or []
|
||||
@ -409,6 +411,7 @@ class BenchmarkHarness:
|
||||
duration_ms=duration_ms,
|
||||
runtime_values=runtime_values,
|
||||
judge_model=self.judge_model,
|
||||
judge_affects_score=self.judge_affects_score,
|
||||
)
|
||||
timings["score"] = round(time.monotonic() - t_score_start, 2)
|
||||
timings["total"] = round(time.monotonic() - t_run_start, 2)
|
||||
@ -544,6 +547,7 @@ class BenchmarkHarness:
|
||||
"adapter": self.adapter,
|
||||
"prompt_variant": self.prompt_variant,
|
||||
"judge_model": self.judge_model,
|
||||
"judge_affects_score": self.judge_affects_score,
|
||||
"benchmark_version": __version__,
|
||||
"task_fingerprint": _task_definition_fingerprint(task),
|
||||
}
|
||||
@ -764,6 +768,7 @@ class BenchmarkHarness:
|
||||
"artifact_type": self.artifact_type or "all",
|
||||
"prompt_variant": self.prompt_variant,
|
||||
"judge_model": self.judge_model,
|
||||
"judge_affects_score": self.judge_affects_score,
|
||||
"adapter": self.adapter,
|
||||
"known_adapters": list(KNOWN_ADAPTERS),
|
||||
"executable_adapters": sorted(EXECUTABLE_ADAPTERS),
|
||||
|
||||
@ -46,6 +46,7 @@ class SubmissionRequest(BaseModel):
|
||||
provider: str = "" # e.g. "anthropic"
|
||||
api_key_env: str = "" # Env var name holding the API key (NOT the key itself)
|
||||
judge_model: str = ""
|
||||
judge_affects_score: bool = False
|
||||
runs_per_task: int = Field(default=3, ge=1, le=10)
|
||||
max_parallel_lanes: int = Field(default=1, ge=1, le=8)
|
||||
tier: str | None = None # Filter to a specific tier
|
||||
@ -60,6 +61,7 @@ class SubmissionRequest(BaseModel):
|
||||
"model": self.model.strip(),
|
||||
"provider": self.provider.strip(),
|
||||
"judge_model": self.judge_model.strip(),
|
||||
"judge_affects_score": self.judge_affects_score,
|
||||
"runs_per_task": self.runs_per_task,
|
||||
"max_parallel_lanes": self.max_parallel_lanes,
|
||||
"tier": self.tier or "",
|
||||
|
||||
@ -93,6 +93,7 @@ async def score_task_run(
|
||||
duration_ms: int,
|
||||
runtime_values: dict[str, Any],
|
||||
judge_model: str = "",
|
||||
judge_affects_score: bool = False,
|
||||
) -> TaskRunResult:
|
||||
annotate_transcript_tool_calls(transcript)
|
||||
completion_result = await verify_completion(
|
||||
@ -123,10 +124,11 @@ async def score_task_run(
|
||||
behavior=behavior_result.score,
|
||||
judge=(
|
||||
judge_result.score
|
||||
if judge_result.enabled and not judge_result.error
|
||||
if judge_affects_score and judge_result.enabled and not judge_result.error
|
||||
else None
|
||||
),
|
||||
has_deterministic_verifier=completion_result.total_assertions > 0,
|
||||
include_judge=judge_affects_score,
|
||||
)
|
||||
delivery_outcome = classify_delivery_outcome(
|
||||
task=task,
|
||||
@ -190,25 +192,31 @@ def combine_run_score(
|
||||
behavior: float,
|
||||
judge: float | None = None,
|
||||
has_deterministic_verifier: bool = False,
|
||||
include_judge: bool = False,
|
||||
) -> float:
|
||||
"""Blend completion + trajectory + behavior (+ judge when available).
|
||||
|
||||
Gating rules, per CLAWBENCH_V0_4_SPEC.md §"Disallowed Primary
|
||||
Verifiers" and §"Judge Gating":
|
||||
|
||||
1. If there is no judge signal, use the deterministic-only weights.
|
||||
1. Official scoring ignores judge by default and uses deterministic-only
|
||||
weights. This keeps `--judge-model` advisory unless a caller opts in
|
||||
with include_judge=True.
|
||||
|
||||
2. If there is a judge AND the task has a deterministic verifier
|
||||
2. If include_judge=True AND the task has a deterministic verifier
|
||||
(execution checks, file assertions, gateway assertions, etc.),
|
||||
the judge is capped at 10% of the run score, and it only
|
||||
contributes when the deterministic completion floor is met
|
||||
(completion.score >= 0.9999). This matches the spec's policy
|
||||
that "semantic quality never rescues failed completion."
|
||||
|
||||
3. If there is a judge AND the task has NO deterministic verifier,
|
||||
3. If include_judge=True AND the task has NO deterministic verifier,
|
||||
the judge is the dominant signal (50%) — this is the only regime
|
||||
where an LLM judge is allowed to drive the primary score.
|
||||
"""
|
||||
if not include_judge:
|
||||
judge = None
|
||||
|
||||
if judge is None:
|
||||
weights = RUN_SCORE_WEIGHTS_DETERMINISTIC
|
||||
weighted_sum = (
|
||||
|
||||
@ -293,6 +293,7 @@ class EvalWorker:
|
||||
model=job.request.model,
|
||||
provider=job.request.provider,
|
||||
judge_model=job.request.judge_model or os.environ.get("CLAWBENCH_JUDGE_MODEL", ""),
|
||||
judge_affects_score=job.request.judge_affects_score,
|
||||
runs_per_task=job.request.runs_per_task,
|
||||
tier=job.request.tier,
|
||||
task_ids=[task.id for task in tasks],
|
||||
@ -365,6 +366,7 @@ class EvalWorker:
|
||||
model=job.request.model,
|
||||
provider=job.request.provider,
|
||||
judge_model=job.request.judge_model or os.environ.get("CLAWBENCH_JUDGE_MODEL", ""),
|
||||
judge_affects_score=job.request.judge_affects_score,
|
||||
runs_per_task=job.request.runs_per_task,
|
||||
tier=job.request.tier,
|
||||
scenario=job.request.scenario,
|
||||
@ -421,6 +423,7 @@ class EvalWorker:
|
||||
model=job.request.model,
|
||||
provider=job.request.provider,
|
||||
judge_model=job.request.judge_model or os.environ.get("CLAWBENCH_JUDGE_MODEL", ""),
|
||||
judge_affects_score=job.request.judge_affects_score,
|
||||
runs_per_task=job.request.runs_per_task,
|
||||
task_ids=[task.id for task in lane.tasks],
|
||||
scenario=job.request.scenario,
|
||||
|
||||
@ -191,6 +191,15 @@ def test_run_cache_path_includes_scoring_inputs(tmp_path: Path):
|
||||
judge_model="judge-b",
|
||||
randomize_order=False,
|
||||
)
|
||||
different_judge_gate = BenchmarkHarness(
|
||||
gateway_config=GatewayConfig(),
|
||||
model="test/model",
|
||||
task_ids=[task.id],
|
||||
prompt_variant="clear",
|
||||
judge_model="judge-a",
|
||||
judge_affects_score=True,
|
||||
randomize_order=False,
|
||||
)
|
||||
different_prompt = BenchmarkHarness(
|
||||
gateway_config=GatewayConfig(),
|
||||
model="test/model",
|
||||
@ -205,6 +214,7 @@ def test_run_cache_path_includes_scoring_inputs(tmp_path: Path):
|
||||
assert "v2-" in str(base_path)
|
||||
assert base_path == same._run_cache_path(tmp_path, task, 0)
|
||||
assert base_path != different_judge._run_cache_path(tmp_path, task, 0)
|
||||
assert base_path != different_judge_gate._run_cache_path(tmp_path, task, 0)
|
||||
assert base_path != different_prompt._run_cache_path(tmp_path, task, 0)
|
||||
|
||||
|
||||
|
||||
@ -19,6 +19,18 @@ def test_submission_request_defaults_to_single_parallel_lane():
|
||||
|
||||
assert request.max_parallel_lanes == 1
|
||||
assert request.runs_per_task == 3
|
||||
assert request.judge_affects_score is False
|
||||
|
||||
|
||||
def test_submission_request_fingerprint_includes_judge_score_gate():
    """Toggling judge_affects_score alone must change the active fingerprint."""
    shared_fields = {
        "model": "anthropic/claude-sonnet-4-6",
        "judge_model": "judge",
    }

    # Both requests name the same model and judge; only the score gate differs.
    advisory_request = SubmissionRequest(**shared_fields)
    weighted_request = SubmissionRequest(**shared_fields, judge_affects_score=True)

    advisory_fingerprint = advisory_request.active_fingerprint()
    weighted_fingerprint = weighted_request.active_fingerprint()

    assert advisory_fingerprint != weighted_fingerprint
|
||||
|
||||
|
||||
def test_save_local_replaces_queue_file_atomically(tmp_path, monkeypatch):
|
||||
|
||||
@ -29,6 +29,18 @@ def test_combine_run_score_uses_normalized_weighted_average():
|
||||
assert combine_run_score(completion=0.5, trajectory=1.0, behavior=1.0) == 0.7778
|
||||
|
||||
|
||||
def test_combine_run_score_ignores_judge_by_default():
    """A judge score passed without include_judge must not move the run score."""
    # include_judge is deliberately left at its default here.
    score_inputs = {
        "completion": 1.0,
        "trajectory": 1.0,
        "behavior": 1.0,
        "judge": 0.0,
        "has_deterministic_verifier": True,
    }

    assert combine_run_score(**score_inputs) == 1.0
|
||||
|
||||
|
||||
def test_combine_run_score_caps_judge_when_deterministic_verifier_present():
|
||||
"""Per v0.4 spec: semantic quality never rescues failed completion.
|
||||
|
||||
@ -46,6 +58,7 @@ def test_combine_run_score_caps_judge_when_deterministic_verifier_present():
|
||||
behavior=1.0,
|
||||
judge=1.0,
|
||||
has_deterministic_verifier=True,
|
||||
include_judge=True,
|
||||
)
|
||||
without_judge = combine_run_score(
|
||||
completion=0.5,
|
||||
@ -65,6 +78,7 @@ def test_combine_run_score_judge_lifts_at_most_10pct_when_deterministic_passes()
|
||||
behavior=1.0,
|
||||
judge=1.0,
|
||||
has_deterministic_verifier=True,
|
||||
include_judge=True,
|
||||
)
|
||||
assert full == 1.0
|
||||
|
||||
@ -76,18 +90,20 @@ def test_combine_run_score_judge_lifts_at_most_10pct_when_deterministic_passes()
|
||||
behavior=1.0,
|
||||
judge=0.0,
|
||||
has_deterministic_verifier=True,
|
||||
include_judge=True,
|
||||
)
|
||||
assert abs(lost_judge - 0.9) < 1e-4
|
||||
|
||||
|
||||
def test_combine_run_score_semantic_only_task_lets_judge_dominate():
|
||||
"""When no deterministic verifier exists, the judge is allowed to drive."""
|
||||
"""When no deterministic verifier exists, the judge is allowed to drive only when gated on."""
|
||||
semantic = combine_run_score(
|
||||
completion=0.0,
|
||||
trajectory=0.0,
|
||||
behavior=0.0,
|
||||
judge=1.0,
|
||||
has_deterministic_verifier=False,
|
||||
include_judge=True,
|
||||
)
|
||||
# Judge weight 0.50 out of total 1.0
|
||||
assert abs(semantic - 0.5) < 1e-4
|
||||
|
||||
Loading…
Reference in New Issue
Block a user