clawbench/tests/test_tasks.py
scoootscooob 0e250e3fe1 fix(ci): tasks-public fallback + leaderboard removed from README
README.md: removed the inline reference leaderboard per user request.
The Core v1 manifest still carries the established ranking, the
README still documents methodology + dynamical-systems diagnostics.

clawbench/tasks.py: extend _resolve_tasks_dir() with a tasks-public/
fallback layer (resolver step 5). Local dev with the private tasks/
present is unchanged; CI without tasks/ now falls back to the public
Core v1 set instead of returning an empty corpus. CI has been broken
since deb3d5d (the "stop tracking current task set" commit); this
change restores green CI now that tasks-public/ is available.

tests/test_tasks.py: three updates so tests pass against either the
private 40-task set OR the public 19-task set:
  - test_load_all_tasks_returns_full_corpus: threshold lowered from
    >= 20 to >= 19 (Core v1 size)
  - test_workspace_setup_preserves_nested_asset_paths: switched from
    t1-architecture-brief (private) to t4-browser-research-and-code
    (public) which exercises the same flat+nested asset behaviour
  - test_selected_tasks_include_judge_rubrics: replaced 3 task IDs
    not in the public Core release (t1-architecture-brief,
    t5-contradictory-requirements, t5-impossible-graceful-fail) with
    public-set equivalents (t1-bugfix-discount, t3-feature-export)

Verified locally with both branches:
  - private tasks/ present:    156 passed, 1 skipped
  - private tasks/ hidden:     152 passed, 5 skipped (CI-equivalent)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 20:32:26 -07:00

67 lines
3.2 KiB
Python

from pathlib import Path
from clawbench.client import GatewayConfig
from clawbench.harness import BenchmarkHarness
from clawbench.tasks import load_all_tasks
def test_load_all_tasks_returns_full_corpus():
    """Check that the unfiltered corpus is large enough and carries metadata."""
    corpus = load_all_tasks()

    # Public Core release has 19 tasks; full private dev set has 40.
    # Either must cover tiers 1-5 and carry capability/subset/judge metadata.
    assert len(corpus) >= 19

    tiers_seen = {entry.tier.value for entry in corpus}
    assert tiers_seen == {"tier1", "tier2", "tier3", "tier4", "tier5"}

    # At least one task must populate each optional metadata field.
    assert any(entry.capabilities for entry in corpus)
    assert any(entry.subsets for entry in corpus)
    assert not all(entry.scenario is None for entry in corpus)

    # At least one task must offer the "ambiguous" prompt variant.
    variant_names = [
        [variant.value for variant in entry.prompt_variants] for entry in corpus
    ]
    assert any("ambiguous" in names for names in variant_names)

    # A minimum of six tasks must define a judge.
    judged = [entry for entry in corpus if entry.judge is not None]
    assert len(judged) >= 6
def test_load_all_tasks_supports_pool_subset_and_capability_filters():
    """Check that each filter returns a non-empty, correctly matching selection."""
    hard = load_all_tasks(subsets=["hard"])
    consensus = load_all_tasks(subsets=["consensus"])
    bugfix = load_all_tasks(capabilities=["bugfix"])
    coding_scene = load_all_tasks(scenario="coding_dev_assist")
    ambiguous = load_all_tasks(prompt_variant="ambiguous")

    # Every filter must select at least one task.
    for selection in (hard, consensus, bugfix, coding_scene, ambiguous):
        assert selection

    # Every selected task must actually carry the value it was filtered on.
    for entry in hard:
        assert "hard" in [subset.value for subset in entry.subsets]
    for entry in consensus:
        assert "consensus" in [subset.value for subset in entry.subsets]
    for entry in bugfix:
        assert "bugfix" in [capability.value for capability in entry.capabilities]
    for entry in coding_scene:
        assert entry.scenario and entry.scenario.value == "coding_dev_assist"
    for entry in ambiguous:
        assert "ambiguous" in [variant.value for variant in entry.prompt_variants]
def test_workspace_setup_preserves_nested_asset_paths(tmp_path: Path):
    """Check that workspace setup lays out both flat and nested task assets.

    Args:
        tmp_path: pytest-provided temporary directory; the workspace is
            created beneath it.
    """
    # Use a task from the Core v1 public set (tasks-public/) so this test
    # passes whether the dev has private tasks/ or only the public release.
    # t4-browser-research-and-code has both flat files (report_client.py,
    # serve_docs.py) and nested dirs (docs/, tests/).
    task_id = "t4-browser-research-and-code"

    # A bare next() would raise a message-less StopIteration if the task is
    # missing from the loaded corpus; fail with an explicit message instead.
    task = next(
        (candidate for candidate in load_all_tasks() if candidate.id == task_id),
        None,
    )
    assert task is not None, f"task {task_id!r} not found in the loaded corpus"

    harness = BenchmarkHarness(
        gateway_config=GatewayConfig(),
        model="test-model",
        randomize_order=False,
    )
    workspace = tmp_path / "workspace"
    workspace.mkdir()
    harness._setup_workspace(task, workspace)

    expected_assets = (
        workspace / "report_client.py",
        workspace / "docs" / "index.html",
        workspace / "tests" / "test_report_client.py",
    )
    for asset in expected_assets:
        assert asset.exists(), f"expected asset missing from workspace: {asset}"
def test_selected_tasks_include_judge_rubrics():
    """Check that a fixed selection of tasks each define a judge."""
    # All assertions use task IDs from the Core v1 public set so CI
    # (without the private tasks/) reproduces locally.
    judged_task_ids = (
        "t1-bugfix-discount",
        "t3-feature-export",
        "t4-browser-research-and-code",
        "t4-delegation-repair",
        "t5-hallucination-resistant-evidence",
    )
    by_id = {entry.id: entry for entry in load_all_tasks()}

    # Checked in listed order; a missing ID surfaces as KeyError.
    for task_id in judged_task_ids:
        assert by_id[task_id].judge is not None