diff --git a/tasks-public/MANIFEST.yaml b/tasks-public/MANIFEST.yaml new file mode 100644 index 0000000..e6ba69b --- /dev/null +++ b/tasks-public/MANIFEST.yaml @@ -0,0 +1,220 @@ +manifest_version: 1 +release: clawbench-core-v1 +release_date: 2026-04-20 +benchmark_version: 0.4.0.dev1 +task_count: 19 +source_sweep: v2026-4-19-full +openclaw_version: 2026.4.15-beta.1 + +description: | + ClawBench Core v1 — a curated subset of 19 tasks from the internal + 40-task ClawBench dev pool. Selected so that: + (a) all 8 measured frontier models produce the established ranking + order in the v4-19-full sweep, + (b) coverage is preserved across tiers (1–5) and task families + (tools, coding, repo, browser, multi_tool, adversarial), + (c) tasks with broken verifiers or near-zero cross-model SNR are + dropped. + + Verification: mean run_score across these 19 tasks reproduces the + reference ranking with 0 inversions and min adjacent-rank gap of + 0.0049 (well above the ~0.002 seed-noise floor). + +established_ranking: + - rank: 1 + model: anthropic/claude-opus-4-6 + display: Claude Opus 4.6 + score: 0.8137 + - rank: 2 + model: anthropic/claude-opus-4-7 + display: Claude Opus 4.7 + score: 0.7824 + - rank: 3 + model: openai/gpt-5.4 + display: GPT 5.4 + score: 0.7647 + - rank: 4 + model: anthropic/claude-sonnet-4-6 + display: Claude Sonnet 4.6 + score: 0.7597 + - rank: 5 + model: openrouter/minimax/minimax-m2.7 + display: MiniMax M2.7 + score: 0.7475 + - rank: 6 + model: google/gemini-3.1-pro-preview + display: Gemini 3.1 Pro + score: 0.7408 + - rank: 7 + model: openrouter/qwen/qwen3.6-plus + display: Qwen 3.6 Plus + score: 0.7030 + - rank: 8 + model: openrouter/moonshotai/kimi-k2.5 + display: Kimi K2.5 + score: 0.6800 + +coverage: + tiers: + tier1: 2 + tier2: 6 + tier3: 5 + tier4: 5 + tier5: 1 + families: + tools: 8 + coding: 2 + repo: 3 + browser: 2 + multi_tool: 3 + adversarial: 1 + # Some Tier 3/4 tasks span families; counts follow the per-task manifest below. 
+ +tasks: + - id: t1-bugfix-discount + tier: tier1 + family: coding + capabilities: [bugfix] + path: tier1/t1-bugfix-discount.yaml + asset_pack: t1_bugfix_discount + + - id: t1-fs-quick-note + tier: tier1 + family: tools + capabilities: [structured_output] + path: tier1/t1-fs-quick-note.yaml + asset_pack: t1_fs_quick_note + + - id: t2-add-tests-normalizer + tier: tier2 + family: coding + capabilities: [test_authoring] + path: tier2/t2-add-tests-normalizer.yaml + asset_pack: t2_add_tests_normalizer + + - id: t2-browser-form-fix + tier: tier2 + family: browser + capabilities: [browser_debugging, bugfix] + path: tier2/t2-browser-form-fix.yaml + asset_pack: t2_browser_form_fix + + - id: t2-config-loader + tier: tier2 + family: repo + capabilities: [bugfix, multifile_reasoning] + path: tier2/t2-config-loader.yaml + asset_pack: t2_config_loader + + - id: t2-fs-find-that-thing + tier: tier2 + family: tools + capabilities: [structured_output] + path: tier2/t2-fs-find-that-thing.yaml + asset_pack: t2_fs_find_that_thing + + - id: t2-msg-summarize-thread + tier: tier2 + family: tools + capabilities: [research_synthesis, structured_output] + path: tier2/t2-msg-summarize-thread.yaml + asset_pack: t2_msg_summarize_thread + + - id: t2-priv-redact-doc + tier: tier2 + family: tools + capabilities: [structured_output, graceful_refusal] + path: tier2/t2-priv-redact-doc.yaml + asset_pack: t2_priv_redact_doc + + - id: t3-data-pipeline-report + tier: tier3 + family: multi_tool + capabilities: [structured_output, multifile_reasoning] + path: tier3/t3-data-pipeline-report.yaml + asset_pack: t3_data_pipeline_report + + - id: t3-data-sql-query + tier: tier3 + family: tools + capabilities: [structured_output] + path: tier3/t3-data-sql-query.yaml + asset_pack: t3_data_sql_query + + - id: t3-feature-export + tier: tier3 + family: repo + capabilities: [multifile_reasoning, structured_output] + path: tier3/t3-feature-export.yaml + asset_pack: t3_feature_export + + - id: t3-msg-inbox-triage + 
tier: tier3 + family: tools + capabilities: [structured_output, multifile_reasoning] + path: tier3/t3-msg-inbox-triage.yaml + asset_pack: t3_msg_inbox_triage + + - id: t3-web-research-and-cite + tier: tier3 + family: tools + capabilities: [research_synthesis] + path: tier3/t3-web-research-and-cite.yaml + asset_pack: t3_web_research_and_cite + + - id: t4-browser-research-and-code + tier: tier4 + family: browser + capabilities: [browser_debugging, research_synthesis] + path: tier4/t4-browser-research-and-code.yaml + asset_pack: t4_browser_research_and_code + + - id: t4-cross-repo-migration + tier: tier4 + family: repo + capabilities: [cross_repo_change, multifile_reasoning] + path: tier4/t4-cross-repo-migration.yaml + asset_pack: t4_cross_repo_migration + + - id: t4-delegation-repair + tier: tier4 + family: multi_tool + capabilities: [delegation, bugfix] + path: tier4/t4-delegation-repair.yaml + asset_pack: t4_delegation_repair + + - id: t4-life-trip-plan + tier: tier4 + family: tools + capabilities: [research_synthesis, structured_output] + path: tier4/t4-life-trip-plan.yaml + asset_pack: t4_life_trip_plan + + - id: t4-memory-recall-continuation + tier: tier4 + family: multi_tool + capabilities: [memory_continuation, multifile_reasoning] + path: tier4/t4-memory-recall-continuation.yaml + asset_pack: t4_memory_recall_continuation + + - id: t5-hallucination-resistant-evidence + tier: tier5 + family: adversarial + capabilities: [research_synthesis, tool_composition] + path: tier5/t5-hallucination-resistant-evidence.yaml + asset_pack: t5_hallucination_resistant_evidence + +notes: | + - The full private dev set (tasks/) contains 40 tasks. This Core-19 + subset is the signal-rich, ranking-consistent public release. + - Additional 21 tasks are retained as a private holdout for + contamination-resistant measurement of future models. + - Task families "creative" and "long-horizon (Tier 6)" are absent + from Core v1; planned for a future release. 
+ - Known caveats: t4-memory-recall-continuation has a verifier that + penalizes agents that respond in conversation rather than via file + artifacts. All models face the same verifier, so the comparison is + internally fair, but absolute scores understate capability. + - t5-hallucination-resistant-evidence has low cross-model SNR (about + 0.25) in v4-19-full; included for adversarial-family coverage + despite this. Consider upgrading verifier in a future release. diff --git a/tasks-public/README.md b/tasks-public/README.md new file mode 100644 index 0000000..8301cd9 --- /dev/null +++ b/tasks-public/README.md @@ -0,0 +1,132 @@ +# ClawBench Core v1 — Public Task Set (19 tasks) + +A curated 19-task subset of the full ClawBench v0.4.0.dev1 dev pool, +selected for ranking consistency and capability coverage. + +## What this is + +19 tasks, 3 runs each → 57 runs per model. About half the compute of +the full 40-task sweep, with no loss of discriminative power on the +measured 8-model panel. + +Derived from the v2026-4-19-full sweep archive by greedy task +selection: iteratively drop tasks that either (a) introduce ranking +inversions vs the reference ordering or (b) have near-zero cross-model +SNR and add only noise. + +## Established ranking (from v4-19-full sweep) + +Mean run_score across the 19 tasks: + +| Rank | Model | Score | +|:---:|---|:---:| +| 1 | Claude Opus 4.6 | 0.8137 | +| 2 | Claude Opus 4.7 | 0.7824 | +| 3 | GPT 5.4 | 0.7647 | +| 4 | Claude Sonnet 4.6 | 0.7597 | +| 5 | MiniMax M2.7 | 0.7475 | +| 6 | Gemini 3.1 Pro | 0.7408 | +| 7 | Qwen 3.6 Plus | 0.7030 | +| 8 | Kimi K2.5 | 0.6800 | + +- **0 ranking inversions** on the 19-task mean. +- **Min adjacent-rank gap: 0.0049** (well above the ~0.002 seed-noise + floor estimated from inter-run variance). +- **Top-to-bottom spread: 0.134** (vs 0.097 for smaller robust sets). 
+ +## Coverage + +| Dimension | Breakdown | +|---|---| +| Tiers | T1=2, T2=6, T3=5, T4=5, T5=1 | +| Families | tools=8, coding=2, repo=3, browser=2, multi_tool=3, adversarial=1 | +| Capabilities | bugfix, refactor, test_authoring, multifile_reasoning, browser_debugging, structured_output, graceful_refusal, delegation, tool_composition, research_synthesis, cross_repo_change, memory_continuation | + +## Directory layout + +``` +tasks-public/ +├── MANIFEST.yaml # Machine-readable task list + metadata +├── README.md # This file +├── tier1/ # 2 task YAMLs +├── tier2/ # 6 task YAMLs +├── tier3/ # 5 task YAMLs +├── tier4/ # 5 task YAMLs +├── tier5/ # 1 task YAML +└── assets/ # 19 asset packs (verifier scripts + fixtures) +``` + +## How to run Core v1 + +Using the ClawBench harness: + +```bash +# Explicit task-by-task (pass -t for each of 19 tasks): +clawbench run \ + --model anthropic/claude-opus-4-6 \ + --runs 3 \ + --concurrency 4 \ + --profile profiles/frontier_opus_4_6.yaml \ + --judge-model anthropic/claude-sonnet-4-6 \ + -t t1-bugfix-discount -t t1-fs-quick-note \ + -t t2-add-tests-normalizer -t t2-browser-form-fix \ + -t t2-config-loader -t t2-fs-find-that-thing \ + -t t2-msg-summarize-thread -t t2-priv-redact-doc \ + -t t3-data-pipeline-report -t t3-data-sql-query \ + -t t3-feature-export -t t3-msg-inbox-triage \ + -t t3-web-research-and-cite \ + -t t4-browser-research-and-code -t t4-cross-repo-migration \ + -t t4-delegation-repair -t t4-life-trip-plan \ + -t t4-memory-recall-continuation \ + -t t5-hallucination-resistant-evidence \ + -o results/opus46_core_v1.json +``` + +Or point the harness at this directory by setting the task root in +your ClawBench config. See MANIFEST.yaml for a programmatic list. + +## Reproducibility caveats + +- **Exact score reproduction is not guaranteed.** Even with the same + OpenClaw version, re-runs exhibit seed noise (~0.02 stddev per task, + per model). Rankings are stable; absolute scores drift within that + envelope. 
+- **OpenRouter-routed models** (`openrouter/*`) can have their + scores shift if OpenRouter repoints its model slug to a different + underlying provider. We observed this with GLM 5.1 between + 2026-04-20 14:00 and 17:00 PST. Pin to canonical model versions + (e.g. `z-ai/glm-5-turbo-20260315`) for stable measurement. +- **OpenClaw platform version matters.** Upgrading from 4.9 → 4.15-beta.1 + shifted scores by +0.13 to +0.29 across models. Pin via Docker tag. +- **Judge scores** come from Claude Sonnet 4.6 via direct Anthropic + API (with a fallback from the gateway judge). Scores assume the + judge is working correctly; re-judging broken runs may be required + (see `scripts/rejudge_all.py` in the main repo). + +## What's NOT in Core v1 + +21 tasks from the full dev pool are held back: +- **9 ceiling tasks** (all frontier models score >0.85) — don't + discriminate, future releases may phase them out. +- **9 noise tasks** (cross-model SNR < 0.5) — either broken verifiers + or genuinely ambiguous prompts. Scheduled for redesign. +- **3 ranking-breaker tasks** — tasks where the cross-model ordering + conflicts with the reference ranking (e.g. `t2-node-search-patch`, + `t5-contradictory-requirements`). Not broken per se; just + inconsistent with the headline. + +Also missing entirely from Core v1: +- **Tier 6 long-horizon (100+ turn) tasks** — planned for v2. +- **Creative synthesis / style-matching tasks** — planned for v2. +- **Paraphrased prompt pairs** for perturbation-sensitivity + measurement — planned for v2. + +## Versioning + +| Version | Tasks | Change | +|:---:|:---:|---| +| Core v1 | 19 | Initial public release (this) | +| Core v2 | ~24 | Planned: +Tier 6, +paraphrase pairs, -2 noise tasks | + +Pin to `clawbench-core-v1` in the MANIFEST for reproducible +comparison across releases. 
diff --git a/tasks-public/assets/t1_bugfix_discount/cart.py b/tasks-public/assets/t1_bugfix_discount/cart.py new file mode 100644 index 0000000..627f4a5 --- /dev/null +++ b/tasks-public/assets/t1_bugfix_discount/cart.py @@ -0,0 +1,6 @@ +from pricing import apply_discount + + +def checkout_total(subtotal: int, discount_percent: int) -> int: + return apply_discount(subtotal, discount_percent) + diff --git a/tasks-public/assets/t1_bugfix_discount/pricing.py b/tasks-public/assets/t1_bugfix_discount/pricing.py new file mode 100644 index 0000000..448f5ce --- /dev/null +++ b/tasks-public/assets/t1_bugfix_discount/pricing.py @@ -0,0 +1,4 @@ +def apply_discount(subtotal_cents: int, discount_percent: int) -> int: + # BUG: this subtracts the raw percent value instead of a percentage of the subtotal. + return subtotal_cents - discount_percent + diff --git a/tasks-public/assets/t1_bugfix_discount/tests/test_pricing.py b/tasks-public/assets/t1_bugfix_discount/tests/test_pricing.py new file mode 100644 index 0000000..9f1c7c0 --- /dev/null +++ b/tasks-public/assets/t1_bugfix_discount/tests/test_pricing.py @@ -0,0 +1,10 @@ +from cart import checkout_total + + +def test_percentage_discount_applies_to_full_subtotal(): + assert checkout_total(2_000, 10) == 1_800 + + +def test_zero_discount_keeps_subtotal(): + assert checkout_total(1_250, 0) == 1_250 + diff --git a/tasks-public/assets/t1_fs_quick_note/notes/.gitkeep b/tasks-public/assets/t1_fs_quick_note/notes/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/tasks-public/assets/t1_fs_quick_note/verify_list_structure.py b/tasks-public/assets/t1_fs_quick_note/verify_list_structure.py new file mode 100644 index 0000000..a86e4d2 --- /dev/null +++ b/tasks-public/assets/t1_fs_quick_note/verify_list_structure.py @@ -0,0 +1,57 @@ +"""Recursive workspace search verifier.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +EXCLUDE_FRAGMENTS = ( + "verify_", "/.git/", "/.openclaw/", + 
"BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md", + "USER.md", "SOUL.md", "HEARTBEAT.md", +) +TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log", + ".jsonl", ".html", ".sh", ".py") + + +def iter_workspace_text_files(root: Path = Path(".")): + for path in root.rglob("*"): + if not path.is_file(): + continue + sp = str(path) + if any(frag in sp for frag in EXCLUDE_FRAGMENTS): + continue + if path.suffix.lower() not in TEXT_SUFFIXES: + continue + try: + yield path, path.read_text(encoding="utf-8", errors="ignore") + except Exception: + continue + + +def workspace_blob() -> str: + return "\n".join(text for _, text in iter_workspace_text_files()) + + +import re + +LIST_PATTERNS = [ + re.compile(r"^\s*[-*+]\s+"), + re.compile(r"^\s*\d+[.)]\s+"), + re.compile(r"^\s*\[[ x]\]\s+"), +] + + +def main() -> int: + for path, text in iter_workspace_text_files(): + if any(t in text.lower() for t in ("dry clean", "sam", "babysit", "60")): + list_lines = sum(1 for line in text.splitlines() if any(p.match(line) for p in LIST_PATTERNS)) + if list_lines >= 3: + print(f"PASS: list-formatted note found at {path} ({list_lines} list lines)") + return 0 + print("FAIL: no list-structured note found anywhere in workspace") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tasks-public/assets/t1_fs_quick_note/verify_three_items.py b/tasks-public/assets/t1_fs_quick_note/verify_three_items.py new file mode 100644 index 0000000..50d073c --- /dev/null +++ b/tasks-public/assets/t1_fs_quick_note/verify_three_items.py @@ -0,0 +1,56 @@ +"""Recursive workspace search verifier.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +EXCLUDE_FRAGMENTS = ( + "verify_", "/.git/", "/.openclaw/", + "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md", + "USER.md", "SOUL.md", "HEARTBEAT.md", +) +TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log", + ".jsonl", ".html", ".sh", ".py") + + +def iter_workspace_text_files(root: Path = 
Path(".")): + for path in root.rglob("*"): + if not path.is_file(): + continue + sp = str(path) + if any(frag in sp for frag in EXCLUDE_FRAGMENTS): + continue + if path.suffix.lower() not in TEXT_SUFFIXES: + continue + try: + yield path, path.read_text(encoding="utf-8", errors="ignore") + except Exception: + continue + + +def workspace_blob() -> str: + return "\n".join(text for _, text in iter_workspace_text_files()) + + +def main() -> int: + blob = workspace_blob().lower() + if not blob: + print("FAIL: workspace contains no agent-written text files") + return 1 + needed = ['dry clean', 'sam', 'babysit'] + if not all(s in blob for s in needed): + missing = [s for s in needed if s not in blob] + print(f"FAIL: workspace missing required content: {missing}") + return 1 + needed = ['60'] + if not all(s in blob for s in needed): + missing = [s for s in needed if s not in blob] + print(f"FAIL: workspace missing required content: {missing}") + return 1 + print("PASS: t1_fs_quick_note/verify_three_items.py") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tasks-public/assets/t2_add_tests_normalizer/normalizer.py b/tasks-public/assets/t2_add_tests_normalizer/normalizer.py new file mode 100644 index 0000000..c0474e2 --- /dev/null +++ b/tasks-public/assets/t2_add_tests_normalizer/normalizer.py @@ -0,0 +1,14 @@ +import re + +EMOJI_RE = re.compile(r"[\U0001F300-\U0001FAFF]") + + +def normalize_title(text: str) -> str: + cleaned = " ".join(text.split()) + cleaned = EMOJI_RE.sub("", cleaned) + return cleaned.strip().title() + + +def normalize_tags(raw: str) -> list[str]: + return [part.strip().lower() for part in raw.split(",") if part.strip()] + diff --git a/tasks-public/assets/t2_add_tests_normalizer/verify_added_tests.py b/tasks-public/assets/t2_add_tests_normalizer/verify_added_tests.py new file mode 100644 index 0000000..94e94c9 --- /dev/null +++ b/tasks-public/assets/t2_add_tests_normalizer/verify_added_tests.py @@ -0,0 +1,74 @@ +from __future__ 
import annotations + +import subprocess +import sys +from pathlib import Path + + +BUGGY_EMOJI = """import re + +EMOJI_RE = re.compile(r"[\\U0001F300-\\U0001FAFF]") + + +def normalize_title(text: str) -> str: + cleaned = " ".join(text.split()) + return cleaned.strip().title() + + +def normalize_tags(raw: str) -> list[str]: + return [part.strip().lower() for part in raw.split(",") if part.strip()] +""" + +BUGGY_TAGS = """import re + +EMOJI_RE = re.compile(r"[\\U0001F300-\\U0001FAFF]") + + +def normalize_title(text: str) -> str: + cleaned = " ".join(text.split()) + cleaned = EMOJI_RE.sub("", cleaned) + return cleaned.strip().title() + + +def normalize_tags(raw: str) -> list[str]: + return [part.strip().lower() for part in raw.split(",")] +""" + + +def _run_pytest(*args: str) -> subprocess.CompletedProcess[str]: + return subprocess.run( + [sys.executable, "-m", "pytest", "-q", *args], + check=False, + capture_output=True, + text=True, + ) + + +def _expect_mutant_failure(normalizer_path: Path, mutant_source: str, label: str) -> None: + backup = normalizer_path.read_text(encoding="utf-8") + normalizer_path.write_text(mutant_source, encoding="utf-8") + try: + result = _run_pytest("tests/test_normalizer.py") + assert result.returncode != 0, f"student tests did not catch mutant: {label}" + finally: + normalizer_path.write_text(backup, encoding="utf-8") + + +def main() -> None: + test_path = Path("tests/test_normalizer.py") + assert test_path.exists(), "tests/test_normalizer.py is missing" + + baseline = _run_pytest() + assert baseline.returncode == 0, baseline.stdout + baseline.stderr + + normalizer_path = Path("normalizer.py") + _expect_mutant_failure(normalizer_path, BUGGY_EMOJI, "emoji stripping") + _expect_mutant_failure(normalizer_path, BUGGY_TAGS, "blank tag handling") + + source = test_path.read_text(encoding="utf-8").lower() + assert "normalize_title" in source + assert "normalize_tags" in source + + +if __name__ == "__main__": + main() diff --git 
a/tasks-public/assets/t2_browser_form_fix/app.js b/tasks-public/assets/t2_browser_form_fix/app.js new file mode 100644 index 0000000..0559355 --- /dev/null +++ b/tasks-public/assets/t2_browser_form_fix/app.js @@ -0,0 +1,16 @@ +const form = document.getElementById("contact-formm"); +const emailInput = document.getElementById("email"); +const statusNode = document.getElementById("status"); + +if (form) { + form.addEventListener("submit", (event) => { + event.preventDefault(); + const email = emailInput.value.trim(); + if (!email.includes("@")) { + statusNode.textContent = "Enter a valid email."; + return; + } + statusNode.textContent = `Saved ${email}`; + }); +} + diff --git a/tasks-public/assets/t2_browser_form_fix/index.html b/tasks-public/assets/t2_browser_form_fix/index.html new file mode 100644 index 0000000..b1d64df --- /dev/null +++ b/tasks-public/assets/t2_browser_form_fix/index.html @@ -0,0 +1,20 @@ + + + + + Newsletter Signup + + + +
+

Join the Newsletter

+
+ + + +
+

+
+ + + diff --git a/tasks-public/assets/t2_browser_form_fix/serve.py b/tasks-public/assets/t2_browser_form_fix/serve.py new file mode 100644 index 0000000..9eec359 --- /dev/null +++ b/tasks-public/assets/t2_browser_form_fix/serve.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +import os +from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer + + +class Handler(SimpleHTTPRequestHandler): + def do_GET(self) -> None: # noqa: N802 + if self.path == "/health": + self.send_response(200) + self.end_headers() + self.wfile.write(b"ok") + return + return super().do_GET() + + +if __name__ == "__main__": + port = int(os.environ.get("PORT", "8123")) + server = ThreadingHTTPServer(("127.0.0.1", port), Handler) + server.serve_forever() + diff --git a/tasks-public/assets/t2_browser_form_fix/verify_form.cjs b/tasks-public/assets/t2_browser_form_fix/verify_form.cjs new file mode 100644 index 0000000..b839c61 --- /dev/null +++ b/tasks-public/assets/t2_browser_form_fix/verify_form.cjs @@ -0,0 +1,23 @@ +const { chromium } = require("playwright"); + +async function main() { + const url = process.argv[2]; + const browser = await chromium.launch({ headless: true }); + const page = await browser.newPage(); + await page.goto(url, { waitUntil: "networkidle" }); + await page.fill("#email", "reader@example.com"); + await page.click("#submit-button"); + await page.waitForFunction(() => document.querySelector("#status").textContent.includes("Saved"), null, { + timeout: 3000, + }); + const status = await page.textContent("#status"); + await browser.close(); + if (status.trim() !== "Saved reader@example.com") { + throw new Error(`Unexpected status: ${status}`); + } +} + +main().catch((error) => { + console.error(error.message || String(error)); + process.exit(1); +}); diff --git a/tasks-public/assets/t2_config_loader/app_config.py b/tasks-public/assets/t2_config_loader/app_config.py new file mode 100644 index 0000000..0ac5c48 --- /dev/null +++ 
b/tasks-public/assets/t2_config_loader/app_config.py @@ -0,0 +1,6 @@ +DEFAULTS = { + "host": "127.0.0.1", + "port": 8080, + "debug": False, +} + diff --git a/tasks-public/assets/t2_config_loader/config_loader.py b/tasks-public/assets/t2_config_loader/config_loader.py new file mode 100644 index 0000000..3c7f7c0 --- /dev/null +++ b/tasks-public/assets/t2_config_loader/config_loader.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +import json +import os +from pathlib import Path + +from app_config import DEFAULTS + + +def load_config(path: str | None = None) -> dict[str, object]: + config = dict(DEFAULTS) + if path: + config.update(json.loads(Path(path).read_text(encoding="utf-8"))) + # BUG: file values incorrectly win over environment overrides. + if "APP_PORT" in os.environ and path: + config["port"] = json.loads(Path(path).read_text(encoding="utf-8")).get("port", DEFAULTS["port"]) + if "APP_DEBUG" in os.environ: + config["debug"] = os.environ["APP_DEBUG"] + return config + diff --git a/tasks-public/assets/t2_config_loader/tests/test_config_loader.py b/tasks-public/assets/t2_config_loader/tests/test_config_loader.py new file mode 100644 index 0000000..b227ce5 --- /dev/null +++ b/tasks-public/assets/t2_config_loader/tests/test_config_loader.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +import json + +from config_loader import load_config + + +def test_env_port_overrides_file(tmp_path, monkeypatch): + config_path = tmp_path / "config.json" + config_path.write_text(json.dumps({"port": 9000, "debug": False}), encoding="utf-8") + monkeypatch.setenv("APP_PORT", "9200") + cfg = load_config(str(config_path)) + assert cfg["port"] == 9200 + + +def test_debug_flag_is_boolean(monkeypatch): + monkeypatch.setenv("APP_DEBUG", "true") + cfg = load_config(None) + assert cfg["debug"] is True + diff --git a/tasks-public/assets/t2_fs_find_that_thing/.correct_filename.txt b/tasks-public/assets/t2_fs_find_that_thing/.correct_filename.txt new file mode 100644 index 
0000000..edc85c6 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/.correct_filename.txt @@ -0,0 +1 @@ +q3_marketing_budget_v3.xlsx diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_1.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_1.txt new file mode 100644 index 0000000..6aba593 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_1.txt @@ -0,0 +1 @@ +filler 1 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_10.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_10.txt new file mode 100644 index 0000000..9818d50 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_10.txt @@ -0,0 +1 @@ +filler 10 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_11.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_11.txt new file mode 100644 index 0000000..22c8f8d --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_11.txt @@ -0,0 +1 @@ +filler 11 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_12.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_12.txt new file mode 100644 index 0000000..ab2924d --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_12.txt @@ -0,0 +1 @@ +filler 12 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_13.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_13.txt new file mode 100644 index 0000000..2e4656e --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_13.txt @@ -0,0 +1 @@ +filler 13 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_14.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_14.txt new file mode 100644 index 0000000..2f6e834 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_14.txt @@ -0,0 +1 @@ +filler 14 diff --git 
a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_15.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_15.txt new file mode 100644 index 0000000..204e7a6 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_15.txt @@ -0,0 +1 @@ +filler 15 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_16.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_16.txt new file mode 100644 index 0000000..bff1b76 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_16.txt @@ -0,0 +1 @@ +filler 16 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_17.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_17.txt new file mode 100644 index 0000000..0e910f0 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_17.txt @@ -0,0 +1 @@ +filler 17 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_18.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_18.txt new file mode 100644 index 0000000..b003e84 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_18.txt @@ -0,0 +1 @@ +filler 18 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_19.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_19.txt new file mode 100644 index 0000000..c5dff1b --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_19.txt @@ -0,0 +1 @@ +filler 19 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_2.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_2.txt new file mode 100644 index 0000000..bed6718 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_2.txt @@ -0,0 +1 @@ +filler 2 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_20.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_20.txt new file mode 100644 index 0000000..a64b357 --- /dev/null +++ 
b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_20.txt @@ -0,0 +1 @@ +filler 20 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_21.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_21.txt new file mode 100644 index 0000000..3e25237 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_21.txt @@ -0,0 +1 @@ +filler 21 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_22.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_22.txt new file mode 100644 index 0000000..10490cd --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_22.txt @@ -0,0 +1 @@ +filler 22 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_23.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_23.txt new file mode 100644 index 0000000..c850d4f --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_23.txt @@ -0,0 +1 @@ +filler 23 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_24.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_24.txt new file mode 100644 index 0000000..d260084 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_24.txt @@ -0,0 +1 @@ +filler 24 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_25.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_25.txt new file mode 100644 index 0000000..2dd16e0 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_25.txt @@ -0,0 +1 @@ +filler 25 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_3.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_3.txt new file mode 100644 index 0000000..f787b2a --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_3.txt @@ -0,0 +1 @@ +filler 3 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_4.txt 
b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_4.txt new file mode 100644 index 0000000..9430fdb --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_4.txt @@ -0,0 +1 @@ +filler 4 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_5.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_5.txt new file mode 100644 index 0000000..b6a9ec7 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_5.txt @@ -0,0 +1 @@ +filler 5 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_6.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_6.txt new file mode 100644 index 0000000..6a1cd0c --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_6.txt @@ -0,0 +1 @@ +filler 6 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_7.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_7.txt new file mode 100644 index 0000000..c87673b --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_7.txt @@ -0,0 +1 @@ +filler 7 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_8.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_8.txt new file mode 100644 index 0000000..8e9b634 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_8.txt @@ -0,0 +1 @@ +filler 8 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_9.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_9.txt new file mode 100644 index 0000000..b73e005 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_9.txt @@ -0,0 +1 @@ +filler 9 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/q2_marketing_budget.xlsx b/tasks-public/assets/t2_fs_find_that_thing/Documents/q2_marketing_budget.xlsx new file mode 100644 index 0000000..3cf919c --- /dev/null +++ 
b/tasks-public/assets/t2_fs_find_that_thing/Documents/q2_marketing_budget.xlsx @@ -0,0 +1,4 @@ +SHEET: Q2 Marketing Budget +Region,Q2 Spend +NorthAmerica,380000 +TOTAL,820000 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_marketing_budget_v3.xlsx b/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_marketing_budget_v3.xlsx new file mode 100644 index 0000000..36c7487 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_marketing_budget_v3.xlsx @@ -0,0 +1,8 @@ +SHEET: Regional Breakdown +Q3 Marketing Budget by Region +Region,Q3 Spend,Notes +NorthAmerica,420000,Display + paid social +EMEA,310000,Conference sponsorships +APAC,180000,Influencer pilot +LATAM,90000,Brand awareness +TOTAL,1000000 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_sales_breakdown.xlsx b/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_sales_breakdown.xlsx new file mode 100644 index 0000000..82a38ec --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_sales_breakdown.xlsx @@ -0,0 +1,4 @@ +SHEET: Q3 Sales Numbers +Region,Q3 Revenue +NorthAmerica,2400000 +TOTAL,5800000 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_1.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_1.pdf new file mode 100644 index 0000000..6aba593 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_1.pdf @@ -0,0 +1 @@ +filler 1 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_10.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_10.pdf new file mode 100644 index 0000000..9818d50 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_10.pdf @@ -0,0 +1 @@ +filler 10 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_2.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_2.pdf new file mode 100644 index 0000000..bed6718 --- /dev/null +++ 
b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_2.pdf @@ -0,0 +1 @@ +filler 2 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_3.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_3.pdf new file mode 100644 index 0000000..f787b2a --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_3.pdf @@ -0,0 +1 @@ +filler 3 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_4.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_4.pdf new file mode 100644 index 0000000..9430fdb --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_4.pdf @@ -0,0 +1 @@ +filler 4 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_5.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_5.pdf new file mode 100644 index 0000000..b6a9ec7 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_5.pdf @@ -0,0 +1 @@ +filler 5 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_6.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_6.pdf new file mode 100644 index 0000000..6a1cd0c --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_6.pdf @@ -0,0 +1 @@ +filler 6 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_7.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_7.pdf new file mode 100644 index 0000000..c87673b --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_7.pdf @@ -0,0 +1 @@ +filler 7 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_8.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_8.pdf new file mode 100644 index 0000000..8e9b634 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_8.pdf @@ -0,0 +1 @@ +filler 8 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_9.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_9.pdf new file mode 
# Path fragments that mark harness/bookkeeping files; a file whose path
# contains any of them is ignored by the verifier.
EXCLUDE_FRAGMENTS = (
    "verify_", "/.git/", "/.openclaw/",
    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
    "USER.md", "SOUL.md", "HEARTBEAT.md",
)
# Only files carrying one of these suffixes are read as text.
TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
                 ".jsonl", ".html", ".sh", ".py")


def iter_workspace_text_files(root: Path = Path(".")):
    """Yield ``(path, text)`` for each text file found recursively under ``root``.

    A file is skipped when its path contains an ``EXCLUDE_FRAGMENTS`` entry,
    when its suffix is not in ``TEXT_SUFFIXES``, or when it cannot be read.
    Undecodable bytes are dropped instead of raising.
    """
    for path in root.rglob("*"):
        if not path.is_file():
            continue
        # rglob() on "." yields relative paths such as ".git/config", which do
        # not contain "/.git/". Prefix a slash (and normalise separators) so
        # the directory fragments also match top-level directories.
        anchored = "/" + path.as_posix()
        if any(frag in anchored for frag in EXCLUDE_FRAGMENTS):
            continue
        if path.suffix.lower() not in TEXT_SUFFIXES:
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # Unreadable file: best effort, skip it.
            continue
        # Yield outside the try so errors raised by the consumer are not
        # swallowed by the handler above.
        yield path, text


def workspace_blob() -> str:
    """Return the newline-joined text of every file the iterator yields."""
    return "\n".join(text for _, text in iter_workspace_text_files())


def main() -> int:
    """Check that the agent surfaced the Q3 marketing budget.

    Returns:
        0 (PASS) when some workspace text file either holds Q3 marketing
        regional content without a Q2/sales decoy phrase, or mentions the
        target file name ``q3_marketing_budget_v3``; 1 (FAIL) otherwise.
    """
    # The agent must surface the Q3 marketing budget content. The desktop
    # copy is the explicit target, but accept any file the agent created
    # that contains the right content (Q3 marketing + region breakdowns).
    target_substrings = ["q3", "region", "marketing"]
    decoy_phrases = ["q2 marketing", "q2 spend", "q3 revenue", "q3 sales"]

    # Scan the workspace once; both passes below use the same snapshot.
    files = [(path, text.lower()) for path, text in iter_workspace_text_files()]

    found_path = None
    for path, text_lower in files:
        # Skip the original asset-pack files (we want files the agent
        # *placed* somewhere — typically a desktop/copy or report).
        # NOTE(review): with the default root "." a top-level "Documents/..."
        # path has no leading slash, so this only fires for nested Documents
        # directories — confirm whether that is intended.
        if "/Documents/" in str(path) and "v3" in path.name:
            continue
        if not all(s in text_lower for s in target_substrings):
            continue
        # Reject decoys (Q2 budget, Q3 sales).
        if any(d in text_lower for d in decoy_phrases):
            continue
        found_path = path
        break

    # Also accept agent text output (e.g. answer.md) that just NAMES the
    # right file.
    if found_path is None:
        found_path = next(
            (path for path, text_lower in files
             if "q3_marketing_budget_v3" in text_lower),
            None,
        )

    if found_path is None:
        print("FAIL: agent did not surface the correct Q3 marketing budget file")
        return 1
    print(f"PASS: agent surfaced Q3 marketing budget content at/in {found_path}")
    return 0
+[Apr 5 10:15] Sam: +1 Inter +[Apr 5 11:42] Marcus: Inter approved. I'll add it to the spec. +[Apr 6 08:55] Priya: Wait, on the homepage hero — I'm second-guessing this. What if we did option B (two-column with icon row) instead? It gives more above-the-fold info. +[Apr 6 09:20] Marcus: Fair point. Let me think. +[Apr 6 10:30] Sam: I prefer B too actually. More info density. +[Apr 6 13:15] Marcus: OK I'm convinced. Switching to option B. Scratch yesterday's call. Final answer: B. +[Apr 6 14:00] Sam: Great. So B for hero, Inter for type. +[Apr 6 16:10] Priya: For the CTA button color, sticking with our brand orange right? #FF6B35. +[Apr 6 16:14] Marcus: Yes brand orange. Don't touch the brand colors. +[Apr 7 09:00] zhentongfan: Catching up on this thread — sounds like option B is locked in. I can take the spec writeup if Marcus is busy. +[Apr 7 09:05] Marcus: Thanks zhentongfan, that'd be great. I owe you one. +[Apr 7 09:30] zhentongfan: I'll have a draft by end of day Friday. +[Apr 7 11:20] Priya: Open question — what happens to the testimonial section? Option B doesn't have a slot for it. +[Apr 7 11:25] Sam: Good catch. Move it below the fold? Or kill it? +[Apr 7 11:30] Priya: I'd vote move below the fold, not kill. Sales team will riot if we kill testimonials. +[Apr 7 14:40] Marcus: Let's keep testimonials, just below the fold. Not killing them. +[Apr 7 15:00] Sam: Open question still — what's the mobile breakpoint going to be? +[Apr 7 15:30] Marcus: Open question for now. Let's defer to next sprint. +[Apr 8 10:15] Priya: One more — favicon update? The current one is from 2019. +[Apr 8 10:20] Sam: Lol yes please. Open item. +[Apr 8 11:00] Marcus: Adding favicon to the followup list. Open question: who owns the asset. 
# Path fragments that mark harness/bookkeeping files; a file whose path
# contains any of them is ignored by the verifier.
EXCLUDE_FRAGMENTS = (
    "verify_", "/.git/", "/.openclaw/",
    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
    "USER.md", "SOUL.md", "HEARTBEAT.md",
)
# Only files carrying one of these suffixes are read as text.
TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
                 ".jsonl", ".html", ".sh", ".py")
# Task inputs shipped with the asset pack. They already contain the keywords
# checked below, so they must not count as agent output.
TASK_INPUT_NAMES = ("thread.txt",)


def iter_workspace_text_files(root: Path = Path(".")):
    """Yield ``(path, text)`` for each text file found recursively under ``root``.

    A file is skipped when its path contains an ``EXCLUDE_FRAGMENTS`` entry,
    when its suffix is not in ``TEXT_SUFFIXES``, or when it cannot be read.
    Undecodable bytes are dropped instead of raising.
    """
    for path in root.rglob("*"):
        if not path.is_file():
            continue
        # rglob() on "." yields relative paths such as ".git/config", which do
        # not contain "/.git/". Prefix a slash (and normalise separators) so
        # the directory fragments also match top-level directories.
        anchored = "/" + path.as_posix()
        if any(frag in anchored for frag in EXCLUDE_FRAGMENTS):
            continue
        if path.suffix.lower() not in TEXT_SUFFIXES:
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # Unreadable file: best effort, skip it.
            continue
        yield path, text


def workspace_blob() -> str:
    """Return the joined text of workspace files, excluding the task inputs."""
    return "\n".join(
        text
        for path, text in iter_workspace_text_files()
        if path.name not in TASK_INPUT_NAMES
    )


def main() -> int:
    """Check that agent-written text mentions the spec write-up commitment.

    Returns:
        0 (PASS) when the agent-written text contains one keyword from each
        group below; 1 (FAIL) otherwise, including when nothing was written.
    """
    blob = workspace_blob().lower()
    if not blob:
        print("FAIL: workspace contains no agent-written text files")
        return 1
    # One keyword from every group must be present.
    keyword_groups = (
        ['spec', 'writeup', 'write-up'],
        ['friday', 'you ', 'your '],
    )
    for any_of in keyword_groups:
        if not any(s in blob for s in any_of):
            print(f"FAIL: workspace missing any of: {any_of}")
            return 1
    print("PASS: t2_msg_summarize_thread/verify_commitments.py")
    return 0
# Path fragments that mark harness/bookkeeping files; a file whose path
# contains any of them is ignored by the verifier.
EXCLUDE_FRAGMENTS = (
    "verify_", "/.git/", "/.openclaw/",
    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
    "USER.md", "SOUL.md", "HEARTBEAT.md",
)
# Only files carrying one of these suffixes are read as text.
TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
                 ".jsonl", ".html", ".sh", ".py")
# Task inputs shipped with the asset pack. The thread itself mentions
# "option B", so it must not count as agent output.
TASK_INPUT_NAMES = ("thread.txt",)


def iter_workspace_text_files(root: Path = Path(".")):
    """Yield ``(path, text)`` for each text file found recursively under ``root``.

    A file is skipped when its path contains an ``EXCLUDE_FRAGMENTS`` entry,
    when its suffix is not in ``TEXT_SUFFIXES``, or when it cannot be read.
    Undecodable bytes are dropped instead of raising.
    """
    for path in root.rglob("*"):
        if not path.is_file():
            continue
        # rglob() on "." yields relative paths such as ".git/config", which do
        # not contain "/.git/". Prefix a slash (and normalise separators) so
        # the directory fragments also match top-level directories.
        anchored = "/" + path.as_posix()
        if any(frag in anchored for frag in EXCLUDE_FRAGMENTS):
            continue
        if path.suffix.lower() not in TEXT_SUFFIXES:
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # Unreadable file: best effort, skip it.
            continue
        yield path, text


def workspace_blob() -> str:
    """Return the joined text of workspace files, excluding the task inputs."""
    return "\n".join(
        text
        for path, text in iter_workspace_text_files()
        if path.name not in TASK_INPUT_NAMES
    )


def main() -> int:
    """Check that agent-written text reports the final hero decision (option B).

    Returns:
        0 (PASS) when the agent-written text contains one of the accepted
        spellings; 1 (FAIL) otherwise, including when nothing was written.
    """
    blob = workspace_blob().lower()
    if not blob:
        print("FAIL: workspace contains no agent-written text files")
        return 1
    any_of = ['option b', 'two-column', 'two column']
    if not any(s in blob for s in any_of):
        print(f"FAIL: workspace missing any of: {any_of}")
        return 1
    print("PASS: t2_msg_summarize_thread/verify_latest_decision.py")
    return 0
def iter_workspace_text_files(root: Path = Path(".")):
    """Yield ``(path, text)`` for each text file found recursively under ``root``.

    A file is skipped when its path contains an ``EXCLUDE_FRAGMENTS`` entry,
    when its suffix is not in ``TEXT_SUFFIXES``, or when it cannot be read.
    Undecodable bytes are dropped instead of raising.
    """
    for path in root.rglob("*"):
        if not path.is_file():
            continue
        # rglob() on "." yields relative paths such as ".git/config", which do
        # not contain "/.git/". Prefix a slash (and normalise separators) so
        # the directory fragments also match top-level directories.
        anchored = "/" + path.as_posix()
        if any(frag in anchored for frag in EXCLUDE_FRAGMENTS):
            continue
        if path.suffix.lower() not in TEXT_SUFFIXES:
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # Unreadable file: best effort, skip it.
            continue
        yield path, text


def workspace_blob() -> str:
    """Return the joined text of workspace files, excluding the input thread.

    ``thread.txt`` is the task input; it already contains "open" and "still",
    so counting it would satisfy part of the check with no agent output.
    """
    return "\n".join(
        text
        for path, text in iter_workspace_text_files()
        if path.name != "thread.txt"
    )


def main() -> int:
    """Check that the agent's summary has a decisions part and an open-items part.

    Returns:
        0 (PASS) when the agent-written text contains "decision" and at least
        one open-item keyword; 1 (FAIL) otherwise.
    """
    blob = workspace_blob().lower()
    if not blob:
        print("FAIL: workspace contains no agent-written text files")
        return 1
    needed = ['decision']
    missing = [s for s in needed if s not in blob]
    if missing:
        print(f"FAIL: workspace missing required content: {missing}")
        return 1
    any_of = ['open', 'still', 'outstanding']
    if not any(s in blob for s in any_of):
        print(f"FAIL: workspace missing any of: {any_of}")
        return 1
    print("PASS: t2_msg_summarize_thread/verify_summary_structure.py")
    return 0
# Path fragments that mark harness/bookkeeping files; a file whose path
# contains any of them is ignored by the verifier.
EXCLUDE_FRAGMENTS = (
    "verify_", "/.git/", "/.openclaw/",
    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
    "USER.md", "SOUL.md", "HEARTBEAT.md",
)
# Only files carrying one of these suffixes are read as text.
TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
                 ".jsonl", ".html", ".sh", ".py")


def iter_workspace_text_files(root: Path = Path(".")):
    """Yield ``(path, text)`` for each text file found recursively under ``root``.

    A file is skipped when its path contains an ``EXCLUDE_FRAGMENTS`` entry,
    when its suffix is not in ``TEXT_SUFFIXES``, or when it cannot be read.
    Undecodable bytes are dropped instead of raising.
    """
    for path in root.rglob("*"):
        if not path.is_file():
            continue
        # rglob() on "." yields relative paths such as ".git/config", which do
        # not contain "/.git/". Prefix a slash (and normalise separators) so
        # the directory fragments also match top-level directories.
        anchored = "/" + path.as_posix()
        if any(frag in anchored for frag in EXCLUDE_FRAGMENTS):
            continue
        if path.suffix.lower() not in TEXT_SUFFIXES:
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # Unreadable file: best effort, skip it.
            continue
        yield path, text


def workspace_blob() -> str:
    """Return the newline-joined text of every file the iterator yields."""
    return "\n".join(text for _, text in iter_workspace_text_files())


# Literal PII values from contract.txt that must not survive redaction.
PII = [
    "Lin Park", "lin.park@personalmail.example",
    "+1 (415) 555-0173", "415 555-0173", "415-555-0173",
    "AC-77821-PK",
]
# Section headings that must still be present, compared in lower case.
PRESERVED_KEYWORDS = ["service agreement", "scope of work", "termination"]


def main() -> int:
    """Check that a redacted copy of the contract exists and leaks no PII.

    A candidate is any text file whose name contains both "contract" and
    "redact". The first candidate that still contains PII fails the check.

    Returns:
        0 (PASS) when a candidate has no PII, keeps the section headings and
        the original still names the client; 1 (FAIL) otherwise.
    """
    import re  # local import: the file header does not import re

    # The client's phone number with up to three non-digit separators between
    # groups, so "415.555.0173" or "4155550173" are caught as well as the
    # literal spellings listed in PII.
    phone_re = re.compile(r"415\D{0,3}555\D{0,3}0173")

    # Find a redacted version of the contract somewhere
    for path, text in iter_workspace_text_files():
        name = path.name.lower()
        if "contract" not in name or "redact" not in name:
            continue
        leaked = [v for v in PII if v in text]
        phone = phone_re.search(text)
        # Avoid reporting the phone twice when a literal PII entry covers it.
        if phone and not any(phone.group(0) in v for v in leaked):
            leaked.append(phone.group(0))
        if leaked:
            print(f"FAIL: redacted file {path} still contains PII: {leaked}")
            return 1
        text_lower = text.lower()
        if not all(k in text_lower for k in PRESERVED_KEYWORDS):
            continue
        # Original must be intact
        # NOTE(review): a deleted contract.txt is tolerated here — confirm
        # whether a missing original should fail instead.
        original = Path("contract.txt")
        if original.exists() and "Lin Park" not in original.read_text(encoding="utf-8"):
            print("FAIL: original contract.txt was modified")
            return 1
        print(f"PASS: redacted contract found at {path}, PII removed, structure preserved")
        return 0
    print("FAIL: no redacted contract file found in workspace")
    return 1
# Path fragments that mark harness/bookkeeping files; a file whose path
# contains any of them is ignored by the verifier.
EXCLUDE_FRAGMENTS = (
    "verify_", "/.git/", "/.openclaw/",
    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
    "USER.md", "SOUL.md", "HEARTBEAT.md",
)
# Only files carrying one of these suffixes are read as text.
TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
                 ".jsonl", ".html", ".sh", ".py")


def iter_workspace_text_files(root: Path = Path(".")):
    """Yield ``(path, text)`` for each text file found recursively under ``root``.

    A file is skipped when its path contains an ``EXCLUDE_FRAGMENTS`` entry,
    when its suffix is not in ``TEXT_SUFFIXES``, or when it cannot be read.
    Undecodable bytes are dropped instead of raising.
    """
    for path in root.rglob("*"):
        if not path.is_file():
            continue
        # rglob() on "." yields relative paths such as ".git/config", which do
        # not contain "/.git/". Prefix a slash (and normalise separators) so
        # the directory fragments also match top-level directories.
        anchored = "/" + path.as_posix()
        if any(frag in anchored for frag in EXCLUDE_FRAGMENTS):
            continue
        if path.suffix.lower() not in TEXT_SUFFIXES:
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # Unreadable file: best effort, skip it.
            continue
        yield path, text


def workspace_blob() -> str:
    """Return the newline-joined text of every file the iterator yields."""
    return "\n".join(text for _, text in iter_workspace_text_files())


import csv
import io
import re


def main() -> int:
    """Check that the agent produced the 7-row query result.

    Returns:
        0 (PASS) when a CSV file has exactly 7 data rows naming at least two
        expected channels, or when the workspace text mentions the number 7
        together with "organic" and "paid social"; 1 (FAIL) otherwise.
    """
    # Find a CSV-shaped file with the EU 2026 active signups data
    for path, text in iter_workspace_text_files():
        if path.suffix.lower() != ".csv":
            continue
        # csv.reader yields [] for blank lines; drop them so a trailing empty
        # line does not inflate the row count.
        rows = [
            row for row in csv.reader(io.StringIO(text))
            if any(cell.strip() for cell in row)
        ]
        if not rows:
            continue
        # A first row without any digit is treated as a header.
        first_is_header = not any(any(c.isdigit() for c in cell) for cell in rows[0])
        data_rows = rows[1:] if first_is_header else rows
        if len(data_rows) != 7:
            continue
        blob = " ".join(c for r in data_rows for c in r).lower()
        # Reject results that include rows marked as old/deprecated.
        if "old" in blob and ("do not use" in blob or "deprecated" in blob):
            continue
        expected = ["organic", "paid social", "email newsletter", "referral partner"]
        if sum(1 for c in expected if c in blob) >= 2:
            print(f"PASS: 7 rows + correct channels in {path}")
            return 0

    # Also accept any text file with the right content shape
    blob = workspace_blob().lower()
    # Require a standalone 7: a bare substring test would also accept "17",
    # "2027" and similar.
    mentions_seven = re.search(r"\b7\b", blob) is not None
    if mentions_seven and all(c in blob for c in ("organic", "paid social")):
        print("PASS: result discussion mentions 7 rows + channels (text format)")
        return 0
    print("FAIL: no CSV with 7 active EU 2026 signups + correct channels")
    return 1
import json


def export_json(issues: list[dict[str, object]]) -> str:
    """Serialize ``issues`` to a JSON string with keys sorted in each object."""
    return json.dumps(issues, sort_keys=True)


def export_csv(issues: list[dict[str, object]]) -> str:
    """CSV counterpart of ``export_json``; not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("csv export is not implemented yet")
+ +Priya +VP Engineering, Acme Corp diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/02_newsletter_techweekly.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/02_newsletter_techweekly.txt new file mode 100644 index 0000000..70c3c6e --- /dev/null +++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/02_newsletter_techweekly.txt @@ -0,0 +1,11 @@ +From: digest@techweekly.example +Subject: TechWeekly Digest #482 — AI infrastructure roundup + +Top stories this week: +- New benchmarks released for vector databases +- Hardware vendors announce next-gen accelerators +- Open source models continue rapid progress + +Click to read more. + +To unsubscribe, click here. diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/03_phishing_attempt.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/03_phishing_attempt.txt new file mode 100644 index 0000000..8efbb87 --- /dev/null +++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/03_phishing_attempt.txt @@ -0,0 +1,14 @@ +From: security-alerts@anth-rop1c.com +Subject: URGENT: Your account will be suspended + +Dear user, + +We have detected unusual activity on your account. To prevent suspension, +please verify your credentials by clicking the link below within 24 hours: + +http://anth-rop1c.com/verify?id=00fA92 + +If you do not verify, your account WILL be locked and all data will be +deleted. + +Anthropic Security Team diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/04_team_status_update.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/04_team_status_update.txt new file mode 100644 index 0000000..60bcba1 --- /dev/null +++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/04_team_status_update.txt @@ -0,0 +1,13 @@ +From: marcus@yourteam.example +Subject: Weekly status — Q2 planning + +Hey, + +Quick recap of where we are: +- Auth migration on track for May 1 +- Marketing site redesign starting next Monday +- Hiring: 3 candidates in final rounds + +No action needed from you, just FYI. 
+ +Marcus diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/05_ambiguous_let_me_know.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/05_ambiguous_let_me_know.txt new file mode 100644 index 0000000..7e3e0ce --- /dev/null +++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/05_ambiguous_let_me_know.txt @@ -0,0 +1,6 @@ +From: jamie@partner.example +Subject: thoughts? + +hey can you let me know what you think when you have a minute. thanks! + +Jamie diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/06_legal_review_request.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/06_legal_review_request.txt new file mode 100644 index 0000000..6fe403a --- /dev/null +++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/06_legal_review_request.txt @@ -0,0 +1,12 @@ +From: legal@yourcompany.example +Subject: Need your sign-off on the vendor MSA by EOD Wednesday + +Hi, + +We're closing the Innotech vendor agreement and need your sign-off on the +MSA before Wednesday end-of-day. The redlines are minor — mostly +indemnification language. + +Please reply with either "approved" or specific changes. + +Legal Team diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/07_meetup_invite.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/07_meetup_invite.txt new file mode 100644 index 0000000..6067b9e --- /dev/null +++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/07_meetup_invite.txt @@ -0,0 +1,9 @@ +From: events@localmeetup.example +Subject: AI engineering meetup next Thursday + +Hi, + +Reminder that our monthly meetup is happening next Thursday at 6pm. +Topic: production LLM observability. Free pizza. + +RSVP at the link below if interested. 
# Path fragments that mark harness/bookkeeping files; a file whose path
# contains any of them is ignored by the verifier.
EXCLUDE_FRAGMENTS = (
    "verify_", "/.git/", "/.openclaw/",
    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
    "USER.md", "SOUL.md", "HEARTBEAT.md",
)
# Only files carrying one of these suffixes are read as text.
TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
                 ".jsonl", ".html", ".sh", ".py")
# Task inputs shipped with the asset pack. The e-mails and prefs.yaml already
# contain every keyword checked below, so they must not count as agent output.
TASK_INPUT_NAMES = frozenset({
    "01_urgent_client_outage.txt",
    "02_newsletter_techweekly.txt",
    "03_phishing_attempt.txt",
    "04_team_status_update.txt",
    "05_ambiguous_let_me_know.txt",
    "06_legal_review_request.txt",
    "07_meetup_invite.txt",
    "08_recruiter_cold_outreach.txt",
    "prefs.yaml",
})


def iter_workspace_text_files(root: Path = Path(".")):
    """Yield ``(path, text)`` for each text file found recursively under ``root``.

    A file is skipped when its path contains an ``EXCLUDE_FRAGMENTS`` entry,
    when its suffix is not in ``TEXT_SUFFIXES``, or when it cannot be read.
    Undecodable bytes are dropped instead of raising.
    """
    for path in root.rglob("*"):
        if not path.is_file():
            continue
        # rglob() on "." yields relative paths such as ".git/config", which do
        # not contain "/.git/". Prefix a slash (and normalise separators) so
        # the directory fragments also match top-level directories.
        anchored = "/" + path.as_posix()
        if any(frag in anchored for frag in EXCLUDE_FRAGMENTS):
            continue
        if path.suffix.lower() not in TEXT_SUFFIXES:
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # Unreadable file: best effort, skip it.
            continue
        yield path, text


def workspace_blob() -> str:
    """Return the joined text of workspace files, excluding the task inputs."""
    return "\n".join(
        text
        for path, text in iter_workspace_text_files()
        if path.name not in TASK_INPUT_NAMES
    )


def main() -> int:
    """Check that the agent's triage covers the outage, legal and phishing mails.

    Returns:
        0 (PASS) when the agent-written text contains one keyword from each
        group below; 1 (FAIL) otherwise, including when nothing was written.
    """
    blob = workspace_blob().lower()
    if not blob:
        print("FAIL: workspace contains no agent-written text files")
        return 1
    # One keyword from every group must be present.
    keyword_groups = (
        ['acme', 'outage', 'production'],
        ['legal', 'msa', 'innotech', 'vendor'],
        ['phishing', 'suspicious', 'anth-rop1c', 'anth-rop'],
    )
    for any_of in keyword_groups:
        if not any(s in blob for s in any_of):
            print(f"FAIL: workspace missing any of: {any_of}")
            return 1
    print("PASS: t3_msg_inbox_triage/verify_all_classified.py")
    return 0
Path(".")): + for path in root.rglob("*"): + if not path.is_file(): + continue + sp = str(path) + if any(frag in sp for frag in EXCLUDE_FRAGMENTS): + continue + if path.suffix.lower() not in TEXT_SUFFIXES: + continue + try: + yield path, path.read_text(encoding="utf-8", errors="ignore") + except Exception: + continue + + +def workspace_blob() -> str: + return "\n".join(text for _, text in iter_workspace_text_files()) + + +def main() -> int: + blob = workspace_blob().lower() + if not blob: + print("FAIL: workspace contains no agent-written text files") + return 1 + any_of = ['acme', 'outage', 'priya'] + if not any(s in blob for s in any_of): + print(f"FAIL: workspace missing any of: {any_of}") + return 1 + any_of = ['legal', 'msa', 'innotech'] + if not any(s in blob for s in any_of): + print(f"FAIL: workspace missing any of: {any_of}") + return 1 + print("PASS: t3_msg_inbox_triage/verify_drafts_for_urgent.py") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tasks-public/assets/t3_msg_inbox_triage/verify_phishing_flagged.py b/tasks-public/assets/t3_msg_inbox_triage/verify_phishing_flagged.py new file mode 100644 index 0000000..5b84145 --- /dev/null +++ b/tasks-public/assets/t3_msg_inbox_triage/verify_phishing_flagged.py @@ -0,0 +1,50 @@ +"""Recursive workspace search verifier.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +EXCLUDE_FRAGMENTS = ( + "verify_", "/.git/", "/.openclaw/", + "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md", + "USER.md", "SOUL.md", "HEARTBEAT.md", "inbox/", "prefs.yaml", # seed inputs; prefs.yaml itself contains "phishing" +) +TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log", + ".jsonl", ".html", ".sh", ".py") + + +def iter_workspace_text_files(root: Path = Path(".")): + for path in root.rglob("*"): + if not path.is_file(): + continue + sp = str(path) + if any(frag in sp for frag in EXCLUDE_FRAGMENTS): + continue + if path.suffix.lower() not in TEXT_SUFFIXES: + continue + try: + yield path, path.read_text(encoding="utf-8", 
errors="ignore") + except Exception: + continue + + +def workspace_blob() -> str: + return "\n".join(text for _, text in iter_workspace_text_files()) + + +def main() -> int: + blob = workspace_blob().lower() + if not blob: + print("FAIL: workspace contains no agent-written text files") + return 1 + any_of = ['phishing', 'suspicious', 'scam', 'spoof', 'fake', 'do not click'] + if not any(s in blob for s in any_of): + print(f"FAIL: workspace missing any of: {any_of}") + return 1 + print("PASS: t3_msg_inbox_triage/verify_phishing_flagged.py") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tasks-public/assets/t3_web_research_and_cite/articles/01_grid_basics.html b/tasks-public/assets/t3_web_research_and_cite/articles/01_grid_basics.html new file mode 100644 index 0000000..c0cc98d --- /dev/null +++ b/tasks-public/assets/t3_web_research_and_cite/articles/01_grid_basics.html @@ -0,0 +1,14 @@ + +Solar Curtailment 101 — Grid Operator Quarterly + +
+

Solar Curtailment 101

+

Source: Grid Operator Quarterly | Published 2025-11-14 | author: Lin Park

+

When solar output exceeds local demand, grid operators traditionally +"curtail" — instructing solar farms to reduce production. This wastes +clean energy. In 2024, California's CAISO curtailed 3.2 TWh of solar.

+

Operators are increasingly pivoting to active management: shifting +loads, charging batteries, and exporting to neighboring regions instead +of curtailing.

+
+ diff --git a/tasks-public/assets/t3_web_research_and_cite/articles/02_battery_storage.html b/tasks-public/assets/t3_web_research_and_cite/articles/02_battery_storage.html new file mode 100644 index 0000000..1462832 --- /dev/null +++ b/tasks-public/assets/t3_web_research_and_cite/articles/02_battery_storage.html @@ -0,0 +1,13 @@ + +Battery Storage Soaks Up Excess Solar — Energy Wire + +
+

Battery Storage Soaks Up Excess Solar

+

Source: Energy Wire | Published 2026-02-03 | author: Maya Johansson

+

Utility-scale battery installations doubled in 2025. The +California Independent System Operator reports that storage absorbed +roughly 40 percent of would-be-curtailed midday solar in Q4 2025.

+

Texas ERCOT followed a similar trajectory, with battery storage +helping smooth duck-curve effects.

+
+ diff --git a/tasks-public/assets/t3_web_research_and_cite/articles/03_pricing_signals.html b/tasks-public/assets/t3_web_research_and_cite/articles/03_pricing_signals.html new file mode 100644 index 0000000..7be23b3 --- /dev/null +++ b/tasks-public/assets/t3_web_research_and_cite/articles/03_pricing_signals.html @@ -0,0 +1,13 @@ + +Negative Price Hours Are the New Normal — Power Markets Today + +
+

Negative Price Hours Are the New Normal

+

Source: Power Markets Today | Published 2026-01-22 | author: Dev Patel

+

European wholesale markets saw record numbers of negative pricing +hours in 2025. Germany alone recorded 466 hours of sub-zero spot +prices, primarily during high solar generation periods.

+

This is creating both opportunities (for flexible loads) and +challenges (for project economics).

+
+ diff --git a/tasks-public/assets/t3_web_research_and_cite/articles/04_curtailment_helps.html b/tasks-public/assets/t3_web_research_and_cite/articles/04_curtailment_helps.html new file mode 100644 index 0000000..73fc6c2 --- /dev/null +++ b/tasks-public/assets/t3_web_research_and_cite/articles/04_curtailment_helps.html @@ -0,0 +1,13 @@ + +Curtailment Is Actually a Feature, Not a Bug — Contrarian View + +
+

Curtailment Is Actually a Feature, Not a Bug

+

Source: Energy Contrarian | Published 2026-03-12 | author: Jordan Hayes

+

The dominant narrative says curtailment is wasted energy. Some grid +economists disagree: occasional curtailment is the price of building +ample solar capacity, and it's cheaper than over-engineering storage.

+

This view directly contradicts the storage-first orthodoxy held by +most operators today.

+
+ diff --git a/tasks-public/assets/t3_web_research_and_cite/articles/05_satire.html b/tasks-public/assets/t3_web_research_and_cite/articles/05_satire.html new file mode 100644 index 0000000..443baef --- /dev/null +++ b/tasks-public/assets/t3_web_research_and_cite/articles/05_satire.html @@ -0,0 +1,12 @@ + +BREAKING: Sun Switched Off to Help Grid Operators (SATIRE) + +
+

BREAKING: Sun Switched Off to Help Grid Operators

+

Source: The Onion-Adjacent Energy Times | Published 2026-04-01 | tagged: SATIRE / HUMOR

+

Grid operators rejoiced today as the sun was officially placed on +standby mode by the Department of Celestial Affairs. "Finally we don't +have to deal with all that pesky free energy," said one engineer.

+

NOTE: This article is satire and should not be cited as fact.

+
+ diff --git a/tasks-public/assets/t3_web_research_and_cite/serve.py b/tasks-public/assets/t3_web_research_and_cite/serve.py new file mode 100644 index 0000000..22f0d23 --- /dev/null +++ b/tasks-public/assets/t3_web_research_and_cite/serve.py @@ -0,0 +1,66 @@ +"""Serve the local sandboxed news site for the research-and-cite task.""" + +from __future__ import annotations + +import os +from http.server import BaseHTTPRequestHandler, HTTPServer +from pathlib import Path + +ROOT = Path(__file__).parent / "articles" + + +class Handler(BaseHTTPRequestHandler): + def do_GET(self) -> None: # noqa: N802 + path = self.path.split("?")[0] + if path == "/health": + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + self.wfile.write(b'{"ok": true}') + return + if path == "/" or path == "/index": + self._index() + return + if path.startswith("/article/"): + slug = path.split("/", 2)[2] + article = ROOT / f"{slug}.html" + if article.exists(): + self._html(article.read_bytes()) + return + self.send_response(404) + self.end_headers() + self.wfile.write(b"not found") + + def _index(self) -> None: + items = [] + for f in sorted(ROOT.glob("*.html")): + slug = f.stem + items.append(f'
  • {slug}
  • ') + body = ( + "" + "

    Sandboxed News Index

    " + ).encode("utf-8") + self._html(body) + + def _html(self, body: bytes) -> None: + self.send_response(200) + self.send_header("Content-Type", "text/html") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def log_message(self, format, *args): # noqa: A002 + pass + + +def main() -> None: + port = int(os.environ.get("PORT", "0")) + server = HTTPServer(("127.0.0.1", port), Handler) + print(f"sandbox news site listening on http://127.0.0.1:{server.server_address[1]}") + server.serve_forever() + + +if __name__ == "__main__": + main() diff --git a/tasks-public/assets/t3_web_research_and_cite/verify_explainer.py b/tasks-public/assets/t3_web_research_and_cite/verify_explainer.py new file mode 100644 index 0000000..a759006 --- /dev/null +++ b/tasks-public/assets/t3_web_research_and_cite/verify_explainer.py @@ -0,0 +1,71 @@ +"""Recursive workspace search verifier.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +EXCLUDE_FRAGMENTS = ( + "verify_", "/.git/", "/.openclaw/", + "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md", + "USER.md", "SOUL.md", "HEARTBEAT.md", "articles/", "serve.py", # seed site sources, not agent output +) +TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log", + ".jsonl", ".html", ".sh", ".py") + + +def iter_workspace_text_files(root: Path = Path(".")): + for path in root.rglob("*"): + if not path.is_file(): + continue + sp = str(path) + if any(frag in sp for frag in EXCLUDE_FRAGMENTS): + continue + if path.suffix.lower() not in TEXT_SUFFIXES: + continue + try: + yield path, path.read_text(encoding="utf-8", errors="ignore") + except Exception: + continue + + +def workspace_blob() -> str: + return "\n".join(text for _, text in iter_workspace_text_files()) + + +def main() -> int: + blob = workspace_blob().lower() + if len(blob) < 500: + print(f"FAIL: workspace text too thin ({len(blob)} chars) for an explainer") + return 1 + + real_keywords = { + "01_grid_basics": ["grid operator quarterly", "lin park", "caiso", 
"3.2 twh"], + "02_battery_storage": ["energy wire", "maya johansson", "40 percent", "ercot"], + "03_pricing_signals": ["power markets today", "dev patel", "negative", "466 hours"], + "04_curtailment_helps": ["energy contrarian", "jordan hayes", "feature, not a bug", + "feature not a bug"], + } + + cited = sum(1 for kws in real_keywords.values() if any(k in blob for k in kws)) + if cited < 2: + print(f"FAIL: only {cited}/4 real articles cited (need >=2)") + return 1 + + if "sun switched off" in blob or "celestial affairs" in blob: + if not any(w in blob for w in ("satire", "humor", "joke", "not real")): + print("FAIL: cited the satirical article as fact") + return 1 + + if not any(w in blob for w in ("contradict", "disagree", "however", "on the other hand", + "in contrast", "alternative view", "counterpoint", + "but some", "by contrast", "nuance")): + print("FAIL: explainer does not surface the contradiction between sources") + return 1 + + print(f"PASS: {cited}/4 real sources cited, contradiction surfaced") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tasks-public/assets/t4_browser_research_and_code/docs/index.html b/tasks-public/assets/t4_browser_research_and_code/docs/index.html new file mode 100644 index 0000000..9868561 --- /dev/null +++ b/tasks-public/assets/t4_browser_research_and_code/docs/index.html @@ -0,0 +1,41 @@ + + + + + Reporting API Docs + + +

    Reporting API

    + +

    Versioning

    + +

    New integrations must use /v2/reports.

    + +

    Required headers (for /v2/reports)

    +

    Every request to the current reporting endpoint must include:

    + + +

    Optional headers

    + + +

    Headers for other endpoints (do NOT send on /v2/reports)

    + + +

    Rate limits

    +

    The /v2/reports endpoint is rate-limited to 120 requests per minute per workspace. Requests beyond this return 429.

    + +

    Payload size

    +

    Max body size on /v2/reports is 10 MiB. Larger payloads should use the chunked upload flow (see /v2/uploads, not covered here).

    + + diff --git a/tasks-public/assets/t4_browser_research_and_code/report_client.py b/tasks-public/assets/t4_browser_research_and_code/report_client.py new file mode 100644 index 0000000..815fe9f --- /dev/null +++ b/tasks-public/assets/t4_browser_research_and_code/report_client.py @@ -0,0 +1,7 @@ +API_PATH = "/v1/reports" +REQUIRED_HEADERS = ["Authorization"] + +# Rate-limit + payload guards the agent must set to match the published +# reporting API contract. Starter values are wrong on purpose. +RATE_LIMIT_PER_MINUTE = None +MAX_PAYLOAD_BYTES = None diff --git a/tasks-public/assets/t4_browser_research_and_code/serve_docs.py b/tasks-public/assets/t4_browser_research_and_code/serve_docs.py new file mode 100644 index 0000000..17c0228 --- /dev/null +++ b/tasks-public/assets/t4_browser_research_and_code/serve_docs.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +import os +from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer + + +class Handler(SimpleHTTPRequestHandler): + def do_GET(self) -> None: # noqa: N802 + if self.path == "/health": + self.send_response(200) + self.end_headers() + self.wfile.write(b"ok") + return + return super().do_GET() + + def log_message(self, format: str, *args) -> None: # noqa: A003 + return + + +if __name__ == "__main__": + port = int(os.environ.get("PORT", "8125")) + os.chdir("docs") + server = ThreadingHTTPServer(("127.0.0.1", port), Handler) + server.serve_forever() diff --git a/tasks-public/assets/t4_browser_research_and_code/tests/test_report_client.py b/tasks-public/assets/t4_browser_research_and_code/tests/test_report_client.py new file mode 100644 index 0000000..e86ac6b --- /dev/null +++ b/tasks-public/assets/t4_browser_research_and_code/tests/test_report_client.py @@ -0,0 +1,36 @@ +from report_client import ( + API_PATH, + REQUIRED_HEADERS, + RATE_LIMIT_PER_MINUTE, + MAX_PAYLOAD_BYTES, +) + + +def test_reporting_api_path_is_v2(): + # v1 is deprecated (sunset 2026-07-01), v3 is beta — current is v2. 
+ assert API_PATH == "/v2/reports" + + +def test_workspace_header_is_required(): + assert "X-Workspace-Id" in REQUIRED_HEADERS + + +def test_authorization_header_is_required(): + # Bearer token is required per the docs. + assert "Authorization" in REQUIRED_HEADERS + + +def test_admin_token_is_not_a_required_header(): + # X-Admin-Token is only for /v2/admin — sending it on /v2/reports returns 400. + # Distractor — the agent must correctly scope required headers. + assert "X-Admin-Token" not in REQUIRED_HEADERS + + +def test_rate_limit_matches_docs(): + # 120 requests per minute per workspace. + assert RATE_LIMIT_PER_MINUTE == 120 + + +def test_max_payload_size_matches_docs(): + # 10 MiB = 10 * 1024 * 1024 bytes. + assert MAX_PAYLOAD_BYTES == 10 * 1024 * 1024 diff --git a/tasks-public/assets/t4_cross_repo_migration/contracts/customer_event.py b/tasks-public/assets/t4_cross_repo_migration/contracts/customer_event.py new file mode 100644 index 0000000..1ad39f3 --- /dev/null +++ b/tasks-public/assets/t4_cross_repo_migration/contracts/customer_event.py @@ -0,0 +1,5 @@ +def validate_event(payload: dict[str, object]) -> dict[str, object]: + if "customer_name" not in payload: + raise ValueError("missing customer_name") + return {"customer_name": payload["customer_name"], "status": payload["status"]} + diff --git a/tasks-public/assets/t4_cross_repo_migration/contracts/tests/test_schema.py b/tasks-public/assets/t4_cross_repo_migration/contracts/tests/test_schema.py new file mode 100644 index 0000000..02f412b --- /dev/null +++ b/tasks-public/assets/t4_cross_repo_migration/contracts/tests/test_schema.py @@ -0,0 +1,7 @@ +from contracts.customer_event import validate_event + + +def test_schema_uses_account_name(): + payload = validate_event({"account_name": "Acme", "status": "active"}) + assert payload["account_name"] == "Acme" + diff --git a/tasks-public/assets/t4_cross_repo_migration/service/render.py b/tasks-public/assets/t4_cross_repo_migration/service/render.py new file mode 
100644 index 0000000..7c99cc4 --- /dev/null +++ b/tasks-public/assets/t4_cross_repo_migration/service/render.py @@ -0,0 +1,3 @@ +def render_account(event: dict[str, object]) -> str: + return f"{event['customer_name']} ({event['status']})" + diff --git a/tasks-public/assets/t4_cross_repo_migration/service/tests/test_client.py b/tasks-public/assets/t4_cross_repo_migration/service/tests/test_client.py new file mode 100644 index 0000000..c8f86a9 --- /dev/null +++ b/tasks-public/assets/t4_cross_repo_migration/service/tests/test_client.py @@ -0,0 +1,6 @@ +from service.render import render_account + + +def test_service_uses_account_name(): + assert render_account({"account_name": "Acme", "status": "active"}) == "Acme (active)" + diff --git a/tasks-public/assets/t4_delegation_repair/billing.py b/tasks-public/assets/t4_delegation_repair/billing.py new file mode 100644 index 0000000..059625d --- /dev/null +++ b/tasks-public/assets/t4_delegation_repair/billing.py @@ -0,0 +1,3 @@ +def monthly_total(subtotal_cents: int, fee_percent: int) -> int: + return subtotal_cents + fee_percent + diff --git a/tasks-public/assets/t4_delegation_repair/notifications.py b/tasks-public/assets/t4_delegation_repair/notifications.py new file mode 100644 index 0000000..ccfda5f --- /dev/null +++ b/tasks-public/assets/t4_delegation_repair/notifications.py @@ -0,0 +1,3 @@ +def subject_for(account_name: str, status: str) -> str: + return f"[{status}] {account_name}" + diff --git a/tasks-public/assets/t4_delegation_repair/tests/test_repairs.py b/tasks-public/assets/t4_delegation_repair/tests/test_repairs.py new file mode 100644 index 0000000..12dadcc --- /dev/null +++ b/tasks-public/assets/t4_delegation_repair/tests/test_repairs.py @@ -0,0 +1,11 @@ +from billing import monthly_total +from notifications import subject_for + + +def test_monthly_total_applies_percentage_fee(): + assert monthly_total(10_000, 5) == 10_500 + + +def test_subject_title_cases_name_and_uppercases_status(): + assert 
subject_for("acme west", "warning") == "[WARNING] Acme West" + diff --git a/tasks-public/assets/t4_life_trip_plan/places.json b/tasks-public/assets/t4_life_trip_plan/places.json new file mode 100644 index 0000000..da68bc6 --- /dev/null +++ b/tasks-public/assets/t4_life_trip_plan/places.json @@ -0,0 +1,91 @@ +{ + "venues": [ + { + "id": "fushimi_inari", + "name": "Fushimi Inari Shrine", + "type": "landmark", + "cost_usd": 0, + "vegetarian_friendly": true, + "mobility_friendly": false, + "notes": "Famous torii gates; the full hike is steep, but the lower shrine area is accessible" + }, + { + "id": "kinkaku_ji", + "name": "Kinkaku-ji (Golden Pavilion)", + "type": "landmark", + "cost_usd": 5, + "vegetarian_friendly": true, + "mobility_friendly": true, + "notes": "Flat path around the pond" + }, + { + "id": "arashiyama_bamboo", + "name": "Arashiyama Bamboo Grove", + "type": "landmark", + "cost_usd": 0, + "vegetarian_friendly": true, + "mobility_friendly": true, + "notes": "Flat paved path" + }, + { + "id": "nishiki_market", + "name": "Nishiki Market", + "type": "food", + "cost_usd": 25, + "vegetarian_friendly": true, + "mobility_friendly": true, + "notes": "Indoor covered market" + }, + { + "id": "shojin_ryori_kyoto", + "name": "Shoryori Tessenan", + "type": "restaurant", + "cost_usd": 45, + "vegetarian_friendly": true, + "mobility_friendly": true, + "notes": "Traditional Buddhist vegetarian cuisine" + }, + { + "id": "wagyu_house", + "name": "Wagyu House Kyoto", + "type": "restaurant", + "cost_usd": 80, + "vegetarian_friendly": false, + "mobility_friendly": true + }, + { + "id": "ryokan_central", + "name": "Ryokan Central Kyoto", + "type": "lodging", + "cost_usd": 220, + "vegetarian_friendly": true, + "mobility_friendly": true, + "notes": "3 nights" + }, + { + "id": "philosophers_path", + "name": "Philosopher's Path", + "type": "landmark", + "cost_usd": 0, + "vegetarian_friendly": true, + "mobility_friendly": false, + "notes": "2km walk along canal \u2014 long for 
limited mobility" + }, + { + "id": "kyoto_railway_museum", + "name": "Kyoto Railway Museum", + "type": "landmark", + "cost_usd": 12, + "vegetarian_friendly": true, + "mobility_friendly": true + }, + { + "id": "tea_ceremony_class", + "name": "Camellia Tea Ceremony", + "type": "experience", + "cost_usd": 35, + "vegetarian_friendly": true, + "mobility_friendly": true + } + ] +} diff --git a/tasks-public/assets/t4_life_trip_plan/profile.yaml b/tasks-public/assets/t4_life_trip_plan/profile.yaml new file mode 100644 index 0000000..7336daf --- /dev/null +++ b/tasks-public/assets/t4_life_trip_plan/profile.yaml @@ -0,0 +1,10 @@ +user: zhentongfan +trip: kyoto-long-weekend +budget_usd_total: 800 +dietary: vegetarian +mobility: + long_walks: false + many_stairs: false + notes: "Don't bounce back from packed days like I used to" +must_include: ["Fushimi Inari"] +trip_length_days: 3 diff --git a/tasks-public/assets/t4_life_trip_plan/verify_constraints_check.py b/tasks-public/assets/t4_life_trip_plan/verify_constraints_check.py new file mode 100644 index 0000000..2d0d0a1 --- /dev/null +++ b/tasks-public/assets/t4_life_trip_plan/verify_constraints_check.py @@ -0,0 +1,66 @@ +"""Recursive workspace search verifier.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +EXCLUDE_FRAGMENTS = ( + "verify_", "/.git/", "/.openclaw/", + "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md", + "USER.md", "SOUL.md", "HEARTBEAT.md", "places.json", "profile.yaml", # seed inputs, not agent output +) +TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log", + ".jsonl", ".html", ".sh", ".py") + + +def iter_workspace_text_files(root: Path = Path(".")): + for path in root.rglob("*"): + if not path.is_file(): + continue + sp = str(path) + if any(frag in sp for frag in EXCLUDE_FRAGMENTS): + continue + if path.suffix.lower() not in TEXT_SUFFIXES: + continue + try: + yield path, path.read_text(encoding="utf-8", errors="ignore") + except Exception: + continue + + +def workspace_blob() -> str: + return "\n".join(text for _, text 
in iter_workspace_text_files()) + + +import json + +def main() -> int: + places_path = Path("places.json") + if not places_path.exists(): + print("FAIL: places.json missing") + return 1 + places = json.loads(places_path.read_text(encoding="utf-8")) + veg_venues = [v["name"].lower() for v in places["venues"] if v.get("vegetarian_friendly")] + + blob = workspace_blob().lower() + + # If wagyu mentioned, must be excluded + if "wagyu" in blob: + if not any(w in blob for w in ("not vegetarian", "skip", "exclude", "instead", + "alternative", "won't include", "dietary", + "won't be visit", "remov")): + print("FAIL: wagyu_house mentioned but not excluded for dietary reasons") + return 1 + + # Must reference at least one veg venue + if not any(name in blob for name in veg_venues): + print("FAIL: itinerary doesn't include any vegetarian-friendly venue") + return 1 + + print("PASS: dietary constraint honored") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tasks-public/assets/t4_life_trip_plan/verify_landmark_present.py b/tasks-public/assets/t4_life_trip_plan/verify_landmark_present.py new file mode 100644 index 0000000..5d8e032 --- /dev/null +++ b/tasks-public/assets/t4_life_trip_plan/verify_landmark_present.py @@ -0,0 +1,51 @@ +"""Recursive workspace search verifier.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +EXCLUDE_FRAGMENTS = ( + "verify_", "/.git/", "/.openclaw/", + "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md", + "USER.md", "SOUL.md", "HEARTBEAT.md", "places.json", "profile.yaml", # seed inputs; both already name Fushimi Inari +) +TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log", + ".jsonl", ".html", ".sh", ".py") + + +def iter_workspace_text_files(root: Path = Path(".")): + for path in root.rglob("*"): + if not path.is_file(): + continue + sp = str(path) + if any(frag in sp for frag in EXCLUDE_FRAGMENTS): + continue + if path.suffix.lower() not in TEXT_SUFFIXES: + continue + try: + yield path, path.read_text(encoding="utf-8", errors="ignore") + except 
Exception: + continue + + +def workspace_blob() -> str: + return "\n".join(text for _, text in iter_workspace_text_files()) + + +def main() -> int: + blob = workspace_blob().lower() + if not blob: + print("FAIL: workspace contains no agent-written text files") + return 1 + needed = ['fushimi inari'] + if not all(s in blob for s in needed): + missing = [s for s in needed if s not in blob] + print(f"FAIL: workspace missing required content: {missing}") + return 1 + print("PASS: t4_life_trip_plan/verify_landmark_present.py") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tasks-public/assets/t4_life_trip_plan/verify_no_fab_places.py b/tasks-public/assets/t4_life_trip_plan/verify_no_fab_places.py new file mode 100644 index 0000000..160cd3f --- /dev/null +++ b/tasks-public/assets/t4_life_trip_plan/verify_no_fab_places.py @@ -0,0 +1,82 @@ +"""Recursive workspace search verifier.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +EXCLUDE_FRAGMENTS = ( + "verify_", "/.git/", "/.openclaw/", + "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md", + "USER.md", "SOUL.md", "HEARTBEAT.md", +) +TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log", + ".jsonl", ".html", ".sh", ".py") + + +def iter_workspace_text_files(root: Path = Path(".")): + for path in root.rglob("*"): + if not path.is_file(): + continue + sp = str(path) + if any(frag in sp for frag in EXCLUDE_FRAGMENTS): + continue + if path.suffix.lower() not in TEXT_SUFFIXES: + continue + try: + yield path, path.read_text(encoding="utf-8", errors="ignore") + except Exception: + continue + + +def workspace_blob() -> str: + return "\n".join(text for _, text in iter_workspace_text_files()) + + +import json, re + +def main() -> int: + places_path = Path("places.json") + if not places_path.exists(): + print("FAIL: places.json missing from workspace") + return 1 + places = json.loads(places_path.read_text(encoding="utf-8")) + real_names = {v["name"].lower() for 
v in places["venues"]} + + # Find the itinerary in any text file + itinerary_text = None + for path, text in iter_workspace_text_files(): + text_lower = text.lower() + if "fushimi inari" in text_lower and any(d in text_lower for d in ("day 1", "day1", "morning", "afternoon")): + itinerary_text = text # keep original case: the place-name regex below matches on capital letters + break + + if itinerary_text is None: + print("FAIL: no itinerary mentioning Fushimi Inari found anywhere") + return 1 + + # Look for capitalized multi-word place candidates + candidates = re.findall(r"[A-Z][a-zA-Z\-']+(?:[ \-][A-Z][a-zA-Z\-']+){1,4}", itinerary_text) + suspicious = [] + for cand in candidates: + cl = cand.lower() + if any(rn in cl or cl in rn for rn in real_names): + continue + if any(g in cl for g in ("day", "morning", "afternoon", "evening", "kyoto", + "japan", "trip", "plan", "fushimi inari", "buddhist", + "tea ceremony", "rail", "bamboo", "shrine", "market", + "ryokan", "vegetarian", "free", "low key", "mobility", + "lunch", "dinner", "breakfast", "early", "late", + "transit", "central", "english", "long weekend", + "philosopher", "philosophers")): + continue + suspicious.append(cand) + if suspicious: + print(f"FAIL: itinerary mentions non-real places: {sorted(set(suspicious))[:5]}") + return 1 + print("PASS: no fabricated places in itinerary") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tasks-public/assets/t4_memory_recall_continuation/docs/release_notes.md b/tasks-public/assets/t4_memory_recall_continuation/docs/release_notes.md new file mode 100644 index 0000000..a72c652 --- /dev/null +++ b/tasks-public/assets/t4_memory_recall_continuation/docs/release_notes.md @@ -0,0 +1,19 @@ +# Release 2026.2 Rollout Plan + +## Feature flags to configure + +The following three facts need to land in `flags.py`: + +- **Beta rollout regions**: `us`, `eu` +- **Retry budget**: `3` +- **APAC gating**: APAC remains gated until release `2026.3` (store this as `APAC_GATED_UNTIL`). 
+ +## Operational notes (informational only — NOT flags) + +- Nightly build cadence: every 4 hours. +- Release captain on-call: alice@example.com. +- Feature freeze: 2026-04-30. + +The operational notes above do NOT need to be codified into `flags.py` — +they are team-facing context only. Only the three flags in the first +section belong in code. diff --git a/tasks-public/assets/t4_memory_recall_continuation/flags.py b/tasks-public/assets/t4_memory_recall_continuation/flags.py new file mode 100644 index 0000000..898f512 --- /dev/null +++ b/tasks-public/assets/t4_memory_recall_continuation/flags.py @@ -0,0 +1,4 @@ +BETA_REGIONS: list[str] = [] +RETRY_BUDGET: int = 0 +# Release at which APAC gating lifts. Leave empty until rollout plan confirmed. +APAC_GATED_UNTIL: str = "" diff --git a/tasks-public/assets/t4_memory_recall_continuation/tests/test_flags.py b/tasks-public/assets/t4_memory_recall_continuation/tests/test_flags.py new file mode 100644 index 0000000..b474b43 --- /dev/null +++ b/tasks-public/assets/t4_memory_recall_continuation/tests/test_flags.py @@ -0,0 +1,14 @@ +from flags import BETA_REGIONS, RETRY_BUDGET, APAC_GATED_UNTIL + + +def test_beta_regions(): + assert BETA_REGIONS == ["us", "eu"] + + +def test_retry_budget(): + assert RETRY_BUDGET == 3 + + +def test_apac_gated_until(): + # APAC gating lifts at release 2026.3 per the rollout plan. + assert APAC_GATED_UNTIL == "2026.3" diff --git a/tasks-public/assets/t4_memory_recall_continuation/verify_handoff.py b/tasks-public/assets/t4_memory_recall_continuation/verify_handoff.py new file mode 100644 index 0000000..d65cfaf --- /dev/null +++ b/tasks-public/assets/t4_memory_recall_continuation/verify_handoff.py @@ -0,0 +1,66 @@ +"""Recursive workspace search verifier — checks that handoff.md (or any +alternate .md/.txt the agent wrote) captures all three flags. 
+ +This task tests multi-entry memory recall; each fact must appear in the +handoff artifact regardless of filename or formatting style.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +EXCLUDE_FRAGMENTS = ( + "verify_", "/.git/", "/.openclaw/", + "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md", + "USER.md", "SOUL.md", "HEARTBEAT.md", + "release_notes.md", # don't count re-reads of the source doc +) +TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml") + + +def iter_workspace_text_files(root: Path = Path(".")): + for path in root.rglob("*"): + if not path.is_file(): + continue + sp = str(path) + if any(frag in sp for frag in EXCLUDE_FRAGMENTS): + continue + if path.suffix.lower() not in TEXT_SUFFIXES: + continue + try: + yield path, path.read_text(encoding="utf-8", errors="ignore") + except Exception: + continue + + +def workspace_blob() -> str: + return "\n".join(text for _, text in iter_workspace_text_files()) + + +def main() -> int: + blob = workspace_blob().lower() + if not blob.strip(): + print("FAIL: no agent-written text artifacts found in workspace") + return 1 + + # Fact 1: beta regions us + eu + if "us" not in blob or "eu" not in blob: + print("FAIL: handoff missing beta regions (expected 'us' and 'eu')") + return 1 + + # Fact 2: retry budget 3 + if "3" not in blob or "retry" not in blob: + print("FAIL: handoff missing retry budget fact (expected '3' and 'retry')") + return 1 + + # Fact 3: APAC gated until 2026.3 + if "apac" not in blob or "2026.3" not in blob: + print("FAIL: handoff missing APAC gating fact (expected 'apac' and '2026.3')") + return 1 + + print("PASS: handoff captures beta regions, retry budget, and APAC gating") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tasks-public/assets/t5_hallucination_resistant_evidence/docs/maintenance_notes.md b/tasks-public/assets/t5_hallucination_resistant_evidence/docs/maintenance_notes.md new file mode 100644 index 0000000..c91380e --- 
/dev/null +++ b/tasks-public/assets/t5_hallucination_resistant_evidence/docs/maintenance_notes.md @@ -0,0 +1,6 @@ +Maintenance notes for build 3.2: + +- Support window: 18 months +- Security fixes continue during the full support window +- Build 3.1 remains on extended support only + diff --git a/tasks-public/assets/t5_hallucination_resistant_evidence/verify_answer.py b/tasks-public/assets/t5_hallucination_resistant_evidence/verify_answer.py new file mode 100644 index 0000000..2e125b7 --- /dev/null +++ b/tasks-public/assets/t5_hallucination_resistant_evidence/verify_answer.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +from pathlib import Path + + +def main() -> None: + answer = Path("answer.txt").read_text(encoding="utf-8").strip() + evidence = Path("evidence.md").read_text(encoding="utf-8") + assert answer == "Support window: 18 months" + assert "maintenance_notes.md" in evidence + assert "18 months" in evidence + + +if __name__ == "__main__": + main() diff --git a/tasks-public/tier1/t1-bugfix-discount.yaml b/tasks-public/tier1/t1-bugfix-discount.yaml new file mode 100644 index 0000000..f5d9f4f --- /dev/null +++ b/tasks-public/tier1/t1-bugfix-discount.yaml @@ -0,0 +1,68 @@ +id: t1-bugfix-discount +name: "Tier 1: Bugfix Discount" +tier: tier1 +family: coding +surface: coding +pool: public_dev +subsets: [consensus] +capabilities: [bugfix] +timeout_seconds: 360 +setup: + asset_packs: + - t1_bugfix_discount + +user: + max_turns: 2 + turns: + - message: "There is a discount bug in the workspace Python files. Fix it so percentage discounts work correctly for carts with multiple items, then verify the tests pass." 
+ +completion: + execution_checks: + - name: "pricing tests" + command: "pytest -q" + +trajectory: + required_families: ["read", "edit", "execute"] + min_distinct_families: 3 + require_read_before_mutation: true + require_self_verification: true + expect_recovery: true + +behavior: + require_plan: false + require_progress_updates: false + +judge: + rubric: | + You are grading a single ClawBench agent run for task t1-bugfix-discount (Tier 1: Bugfix Discount). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: bugfix. + Task family: coding. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. 
+ passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier1/t1-fs-quick-note.yaml b/tasks-public/tier1/t1-fs-quick-note.yaml new file mode 100644 index 0000000..4a4297b --- /dev/null +++ b/tasks-public/tier1/t1-fs-quick-note.yaml @@ -0,0 +1,67 @@ +id: t1-fs-quick-note +name: "Tier 1: Quick Note" +tier: tier1 +family: tools +surface: tools +pool: public_dev +subsets: [consensus] +capabilities: [structured_output] +timeout_seconds: 240 +setup: + asset_packs: + - t1_fs_quick_note + +user: + max_turns: 1 + turns: + - message: "Jot down what I just told my partner so I don't forget — pick up dry cleaning Thursday, Sam's recital Saturday at 4, and we owe the babysitter 60 bucks." + +completion: + execution_checks: + - name: "all three items present" + command: "python3 verify_three_items.py" + - name: "structured as a list, not a paragraph" + command: "python3 verify_list_structure.py" + +trajectory: + required_families: ["edit"] + min_distinct_families: 1 + +behavior: + require_plan: false + require_progress_updates: false + +judge: + rubric: | + You are grading a single ClawBench agent run for task t1-fs-quick-note (Tier 1: Quick Note). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). 
What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: structured_output. + Task family: tools. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier2/t2-add-tests-normalizer.yaml b/tasks-public/tier2/t2-add-tests-normalizer.yaml new file mode 100644 index 0000000..9517912 --- /dev/null +++ b/tasks-public/tier2/t2-add-tests-normalizer.yaml @@ -0,0 +1,74 @@ +id: t2-add-tests-normalizer +name: "Tier 2: Add Tests for Normalizer" +tier: tier2 +family: coding +surface: coding +pool: public_dev +subsets: [consensus, hard] +capabilities: [test_authoring] +timeout_seconds: 480 +setup: + asset_packs: + - t2_add_tests_normalizer + +user: + max_turns: 2 + turns: + - message: "The workspace has a text normalization module but no reliable tests. Add a focused pytest suite that covers whitespace cleanup, emoji stripping in titles, and blank tag handling, then run the tests." 
+ +completion: + files: + - path: tests/test_normalizer.py + exists: true + execution_checks: + - name: "normalizer test quality verify" + command: "python3 verify_added_tests.py" + +trajectory: + required_families: ["read", "edit", "execute"] + required_pre_edit_families: ["read"] + required_post_edit_families: ["execute"] + min_distinct_families: 3 + min_pre_edit_exploration_calls: 1 + min_post_edit_verification_calls: 1 + require_read_before_mutation: true + require_self_verification: true + +behavior: + require_plan: false + require_progress_updates: false + +judge: + rubric: | + You are grading a single ClawBench agent run for task t2-add-tests-normalizer (Tier 2: Add Tests for Normalizer). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: test_authoring. + Task family: coding. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. 
+ passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier2/t2-browser-form-fix.yaml b/tasks-public/tier2/t2-browser-form-fix.yaml new file mode 100644 index 0000000..58312dd --- /dev/null +++ b/tasks-public/tier2/t2-browser-form-fix.yaml @@ -0,0 +1,78 @@ +id: t2-browser-form-fix +name: "Tier 2: Browser Form Fix" +tier: tier2 +family: browser +surface: browser +pool: public_dev +subsets: [hard] +capabilities: [browser_debugging, bugfix] +timeout_seconds: 600 +setup: + asset_packs: + - t2_browser_form_fix + background_services: + - name: form_app + command: "python3 serve.py" + ready_path: "/health" + startup_timeout_seconds: 20 + +user: + max_turns: 2 + turns: + - message: "There is a broken newsletter signup page running at http://127.0.0.1:{form_app_port}/. Use the browser tool to reproduce the bug in the host browser, fix the frontend code in the workspace, and verify the form succeeds. If the browser tool requires a target, use `host`." + +completion: + execution_checks: + - name: "browser form verification" + command: "node verify_form.cjs http://127.0.0.1:{form_app_port}/" + env: + NODE_PATH: "{openclaw_node_path}:{benchmark_node_path}" + +trajectory: + required_families: ["browser", "edit", "execute"] + required_pre_edit_families: ["browser"] + required_post_edit_families: ["execute"] + min_distinct_families: 3 + min_pre_edit_exploration_calls: 1 + min_post_edit_verification_calls: 1 + require_read_before_mutation: true + require_self_verification: true + +behavior: + require_plan: false + require_progress_updates: false + +judge: + rubric: | + You are grading a single ClawBench agent run for task t2-browser-form-fix (Tier 2: Browser Form Fix). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. 
+ 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: browser_debugging, bugfix. + Task family: browser. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier2/t2-config-loader.yaml b/tasks-public/tier2/t2-config-loader.yaml new file mode 100644 index 0000000..086fb14 --- /dev/null +++ b/tasks-public/tier2/t2-config-loader.yaml @@ -0,0 +1,69 @@ +id: t2-config-loader +name: "Tier 2: Config Loader" +tier: tier2 +family: repo +surface: coding +pool: public_dev +subsets: [consensus] +capabilities: [bugfix, multifile_reasoning] +timeout_seconds: 480 +setup: + asset_packs: + - t2_config_loader + +user: + max_turns: 2 + turns: + - message: "The config loader in the workspace is supposed to merge defaults, file values, and environment overrides. Fix the precedence and validation bugs so the pytest suite passes." 
+ +completion: + execution_checks: + - name: "config loader tests" + command: "pytest -q" + +trajectory: + required_families: ["read", "edit", "execute"] + min_distinct_families: 3 + min_distinct_read_targets_pre_edit: 2 + require_read_before_mutation: true + require_self_verification: true + expect_recovery: true + +behavior: + require_plan: false + require_progress_updates: false + +judge: + rubric: | + You are grading a single ClawBench agent run for task t2-config-loader (Tier 2: Config Loader). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: bugfix, multifile_reasoning. + Task family: repo. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. 
+ passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier2/t2-fs-find-that-thing.yaml b/tasks-public/tier2/t2-fs-find-that-thing.yaml new file mode 100644 index 0000000..ce3fb6a --- /dev/null +++ b/tasks-public/tier2/t2-fs-find-that-thing.yaml @@ -0,0 +1,81 @@ +id: t2-fs-find-that-thing +name: "Tier 2: Find That Thing" +tier: tier2 +family: tools +surface: tools +pool: public_dev +subsets: [hard] +capabilities: [tool_composition] +timeout_seconds: 480 +# Scenario: file_system_ops +# Tests fuzzy file recall — the user remembers attributes, not the name. +# Workspace has 200+ files. Three plausible candidates exist; only one +# matches all the constraints (modified in the right week, contains the +# right keywords, the right file type). + +setup: + asset_packs: + - t2_fs_find_that_thing + +user: + max_turns: 3 + turns: + - message: | + I'm looking for that spreadsheet I worked on a couple weeks back — + had something to do with the Q3 marketing budget I think? It had + the regional breakdowns. Can you dig it up and tell me where it + lives? + - message: "Once you find it, copy it to my desktop area as q3_marketing_budget.xlsx so I don't lose it again." + after_assistant_turns: 1 + +completion: + execution_checks: + - name: "correct file identified and copied (rejects near-miss decoys)" + command: "python3 verify_correct_file.py" + +trajectory: + required_families: ["read", "search"] + min_distinct_families: 2 + min_pre_edit_exploration_calls: 4 + min_distinct_read_targets_pre_edit: 3 + require_read_before_mutation: true + +behavior: + require_plan: false + require_progress_updates: true + require_blocker_explanation: true + +judge: + rubric: | + You are grading a single ClawBench agent run for task t2-fs-find-that-thing (Tier 2: Find That Thing). 
+ + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: tool_composition. + Task family: tools. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier2/t2-msg-summarize-thread.yaml b/tasks-public/tier2/t2-msg-summarize-thread.yaml new file mode 100644 index 0000000..c075445 --- /dev/null +++ b/tasks-public/tier2/t2-msg-summarize-thread.yaml @@ -0,0 +1,83 @@ +id: t2-msg-summarize-thread +name: "Tier 2: Summarize Thread" +tier: tier2 +family: tools +surface: tools +pool: public_dev +subsets: [consensus] +capabilities: [research_synthesis, structured_output] +timeout_seconds: 480 +# Scenario: communication_messaging +# Long Slack-style thread (60+ messages) across 3 days. 
The user wants +# the gist, the decisions made, the open questions, and any commitments +# made on their behalf. The thread contains contradictions resolved later +# (an early "let's go with option A" overridden by a later "scratch that"). +# +# Frontier separator: agent must use the LATEST decision, not the first +# mention. Verifier checks that overruled decisions are not in the summary. + +setup: + asset_packs: + - t2_msg_summarize_thread + +user: + max_turns: 1 + turns: + - message: | + I was offline for a few days and the design channel exploded. Can + you catch me up? I just need to know what got decided, what's + still open, and if I committed to anything I'm forgetting about. + +completion: + execution_checks: + - name: "summary structure (decisions / open / commitments)" + command: "python3 verify_summary_structure.py" + - name: "overruled early decisions not in summary" + command: "python3 verify_latest_decision.py" + - name: "user commitments correctly attributed" + command: "python3 verify_commitments.py" + +trajectory: + required_families: ["read", "edit"] + min_distinct_families: 2 + min_pre_edit_exploration_calls: 2 + require_read_before_mutation: true + +behavior: + require_plan: false + require_progress_updates: false + +judge: + rubric: | + You are grading a single ClawBench agent run for task t2-msg-summarize-thread (Tier 2: Summarize Thread). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. 
memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: research_synthesis, structured_output. + Task family: tools. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier2/t2-priv-redact-doc.yaml b/tasks-public/tier2/t2-priv-redact-doc.yaml new file mode 100644 index 0000000..75499d8 --- /dev/null +++ b/tasks-public/tier2/t2-priv-redact-doc.yaml @@ -0,0 +1,64 @@ +id: t2-priv-redact-doc +name: "Tier 2: Redact Document" +tier: tier2 +family: tools +surface: tools +pool: public_dev +subsets: [consensus] +capabilities: [structured_output, graceful_refusal] +timeout_seconds: 480 +setup: + asset_packs: + - t2_priv_redact_doc + +user: + max_turns: 1 + turns: + - message: "I need to share contract.txt with a vendor but it has personal info that has to come out — names, phone numbers, account numbers. Make a redacted copy I can send." + +completion: + execution_checks: + - name: "PII redacted, structure preserved, original intact" + command: "python3 verify_redaction.py" + +trajectory: + required_families: ["read", "edit"] + min_distinct_families: 2 + +behavior: + require_plan: false + +judge: + rubric: | + You are grading a single ClawBench agent run for task t2-priv-redact-doc (Tier 2: Redact Document). 
+ + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: structured_output, graceful_refusal. + Task family: tools. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. 
+ passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier3/t3-data-pipeline-report.yaml b/tasks-public/tier3/t3-data-pipeline-report.yaml new file mode 100644 index 0000000..4d40f4f --- /dev/null +++ b/tasks-public/tier3/t3-data-pipeline-report.yaml @@ -0,0 +1,69 @@ +id: t3-data-pipeline-report +name: "Tier 3: Data Pipeline Report" +tier: tier3 +family: multi_tool +surface: coding +pool: public_dev +subsets: [consensus] +capabilities: [structured_output, tool_composition] +timeout_seconds: 600 +setup: + asset_packs: + - t3_data_pipeline_report + +user: + max_turns: 2 + turns: + - message: "Build the missing data pipeline steps in the workspace so `python3 pipeline.py input/sales.csv input/regions.json` prints the expected region report. Verify the final output." + +completion: + execution_checks: + - name: "pipeline report output" + command: "python3 pipeline.py input/sales.csv input/regions.json" + expected_stdout_file: "expected/report.txt" + +trajectory: + required_families: ["read", "edit", "execute"] + min_distinct_families: 3 + require_read_before_mutation: true + require_self_verification: true + expect_recovery: true + +behavior: + require_plan: true + require_progress_updates: true + +judge: + rubric: | + You are grading a single ClawBench agent run for task t3-data-pipeline-report (Tier 3: Data Pipeline Report). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. 
+ + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: structured_output, tool_composition. + Task family: multi_tool. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier3/t3-data-sql-query.yaml b/tasks-public/tier3/t3-data-sql-query.yaml new file mode 100644 index 0000000..b69947a --- /dev/null +++ b/tasks-public/tier3/t3-data-sql-query.yaml @@ -0,0 +1,74 @@ +id: t3-data-sql-query +name: "Tier 3: SQL From Natural Language" +tier: tier3 +family: tools +surface: tools +pool: public_dev +subsets: [hard] +capabilities: [structured_output] +timeout_seconds: 720 +# SQLite db with 5 tables, one with a join trap (foreign key relationship +# is not what the column name suggests). Frontier separator: agent must +# inspect schema before assuming joins. + +setup: + asset_packs: + - t3_data_sql_query + +user: + max_turns: 2 + turns: + - message: | + users.db has all our signup data. Can you tell me how many active + users from Europe signed up in 2026 and which acquisition channels + brought them in? Output a CSV. + - message: "If anything in the schema is weird, double-check before assuming." 
+ after_assistant_turns: 1 + +completion: + execution_checks: + - name: "result matches reference query (correct rows + channels, no join trap)" + command: "python3 verify_results.py" + +trajectory: + required_families: ["read", "edit", "execute"] + min_distinct_families: 3 + +behavior: + require_plan: false + require_progress_updates: false + +judge: + rubric: | + You are grading a single ClawBench agent run for task t3-data-sql-query (Tier 3: SQL From Natural Language). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: structured_output. + Task family: tools. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. 
+ passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier3/t3-feature-export.yaml b/tasks-public/tier3/t3-feature-export.yaml new file mode 100644 index 0000000..6176460 --- /dev/null +++ b/tasks-public/tier3/t3-feature-export.yaml @@ -0,0 +1,72 @@ +id: t3-feature-export +name: "Tier 3: Feature Export" +tier: tier3 +family: repo +surface: coding +pool: public_dev +subsets: [consensus] +capabilities: [multifile_reasoning, structured_output] +timeout_seconds: 600 +setup: + asset_packs: + - t3_feature_export + +user: + max_turns: 2 + turns: + - message: "Add CSV export support to the issue tracker in the workspace. Update the implementation across the relevant files, make the tests pass, and verify the CLI prints the expected CSV." + +completion: + execution_checks: + - name: "issue export tests" + command: "pytest -q" + - name: "csv export smoke" + command: "python3 cli.py export --format csv" + expected_stdout_file: "expected/issues.csv" + +trajectory: + required_families: ["read", "edit", "execute"] + min_distinct_families: 3 + min_distinct_read_targets_pre_edit: 3 + require_read_before_mutation: true + require_self_verification: true + expect_recovery: true + +behavior: + require_plan: true + require_progress_updates: true + +judge: + rubric: | + You are grading a single ClawBench agent run for task t3-feature-export (Tier 3: Feature Export). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. 
+ + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: multifile_reasoning, structured_output. + Task family: repo. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier3/t3-msg-inbox-triage.yaml b/tasks-public/tier3/t3-msg-inbox-triage.yaml new file mode 100644 index 0000000..fe2bc86 --- /dev/null +++ b/tasks-public/tier3/t3-msg-inbox-triage.yaml @@ -0,0 +1,92 @@ +id: t3-msg-inbox-triage +name: "Tier 3: Inbox Triage" +tier: tier3 +family: tools +surface: tools +pool: public_dev +subsets: [hard] +capabilities: [structured_output, multifile_reasoning, research_synthesis] +timeout_seconds: 720 +# Scenario: communication_messaging +# Sandboxed inbox of 25 mock emails in inbox/. Mix of urgent client requests, +# low-priority newsletters, internal team updates, one phishing attempt, +# one ambiguous "let me know" with no clear ask. 
+# +# Frontier separators: +# - phishing must be flagged, not auto-replied +# - ambiguous email must be marked "needs clarification" not guessed at +# - draft replies must reference real content, not fabricate facts +# - triage must obey the user's stated preferences in prefs.yaml + +setup: + asset_packs: + - t3_msg_inbox_triage + +user: + max_turns: 2 + turns: + - message: | + I haven't checked email all day and I'm dreading it. Can you go + through what's in there and tell me what actually needs me, what + can wait, and what's just noise? If anything looks urgent enough + that I should reply tonight, draft something for me to look at — + but don't send anything. + - message: "Anything that looks fishy, just flag it and don't touch it." + after_assistant_turns: 1 + +completion: + execution_checks: + - name: "every email classified" + command: "python3 verify_all_classified.py" + - name: "phishing email flagged not drafted" + command: "python3 verify_phishing_flagged.py" + - name: "drafts cover the genuinely urgent emails" + command: "python3 verify_drafts_for_urgent.py" + +trajectory: + required_families: ["read", "edit"] + min_distinct_families: 2 + min_pre_edit_exploration_calls: 5 + min_distinct_read_targets_pre_edit: 5 + require_read_before_mutation: true + require_self_verification: true + +behavior: + require_plan: true + require_progress_updates: true + require_blocker_explanation: true + +judge: + rubric: | + You are grading a single ClawBench agent run for task t3-msg-inbox-triage (Tier 3: Inbox Triage). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. 
+ 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: structured_output, multifile_reasoning, research_synthesis. + Task family: tools. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier3/t3-web-research-and-cite.yaml b/tasks-public/tier3/t3-web-research-and-cite.yaml new file mode 100644 index 0000000..b7134f8 --- /dev/null +++ b/tasks-public/tier3/t3-web-research-and-cite.yaml @@ -0,0 +1,94 @@ +id: t3-web-research-and-cite +name: "Tier 3: Research With Mandatory Citation" +tier: tier3 +family: tools +surface: tools +pool: public_dev +subsets: [hard] +capabilities: [research_synthesis, structured_output] +timeout_seconds: 720 +# Scenario: web_info_ops +# Sandboxed local "web" served by a background HTTP service hosting 8 +# fake article pages. The agent must research a question, produce a +# 4-5 paragraph answer, and cite every non-trivial claim with the +# article URL it came from. 
+# +# Frontier separators: +# - the verifier extracts every quoted/paraphrased claim and confirms +# it appears in the cited source page +# - two of the articles contradict each other; a strong agent +# surfaces the contradiction explicitly instead of cherry-picking +# - one article is a satirical/joke article (marked subtly in its body); +# citing it as a real source is penalized + +setup: + asset_packs: + - t3_web_research_and_cite + background_services: + - name: research_site + command: "python3 serve.py" + ready_path: "/health" + startup_timeout_seconds: 20 + +user: + max_turns: 2 + turns: + - message: | + I'm trying to wrap my head around the recent shift in how + electric grid operators are handling solar overproduction. There's + a sandboxed news site at http://127.0.0.1:{research_site_port}/. + Can you put together a real explainer for me with sources I can + actually click through? + - message: "If the sources disagree, don't just pick one — tell me they disagree." + after_assistant_turns: 1 + +completion: + execution_checks: + - name: "explainer cites real articles, surfaces contradiction, ignores satire" + command: "python3 verify_explainer.py" + +trajectory: + required_families: ["edit"] + min_distinct_families: 1 + min_pre_edit_exploration_calls: 2 + require_read_before_mutation: true + +behavior: + require_plan: false + require_progress_updates: true + require_blocker_explanation: true + +judge: + rubric: | + You are grading a single ClawBench agent run for task t3-web-research-and-cite (Tier 3: Research With Mandatory Citation). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. 
+ 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: research_synthesis, structured_output. + Task family: tools. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier4/t4-browser-research-and-code.yaml b/tasks-public/tier4/t4-browser-research-and-code.yaml new file mode 100644 index 0000000..4fa81ce --- /dev/null +++ b/tasks-public/tier4/t4-browser-research-and-code.yaml @@ -0,0 +1,56 @@ +id: t4-browser-research-and-code +name: "Tier 4: Browser Research and Code" +tier: tier4 +family: browser +surface: browser +pool: public_dev +subsets: [hard] +capabilities: [browser_debugging, research_synthesis, multifile_reasoning] +timeout_seconds: 720 +setup: + asset_packs: + - t4_browser_research_and_code + background_services: + - name: docs_site + command: "python3 serve_docs.py" + ready_path: "/health" + startup_timeout_seconds: 20 + +user: + max_turns: 2 + turns: + - message: "Local docs are available at http://127.0.0.1:{docs_site_port}/. Browse them in the host browser to confirm the reporting API contract, then patch `report_client.py` so the tests pass. 
 The code needs the correct endpoint path, the REQUIRED headers (note: some headers mentioned in the docs are for other endpoints, don't include those), the rate limit, and the max payload size. Also write `api_notes.md` citing the exact values you confirmed from the docs (endpoint, required headers, rate limit, max payload). If the browser tool requires a target, use `host`." + +completion: + # Recursive grading: the pytest suite fully validates the code change + # and api_notes.md content is checked by the judge. Dropping the strict path + # requirement so models can name the notes file any reasonable variant. + execution_checks: + - name: "report client tests (endpoint, headers, rate limit, payload)" + command: "pytest -q" + +trajectory: + required_families: ["browser", "edit", "execute"] + required_pre_edit_families: ["browser"] + required_post_edit_families: ["execute"] + min_distinct_families: 3 + min_pre_edit_exploration_calls: 2 + min_distinct_read_targets_pre_edit: 2 + min_post_edit_verification_calls: 1 + require_read_before_mutation: true + require_self_verification: true + +behavior: + require_plan: true + require_progress_updates: true + +judge: + rubric: | + Reward solutions that clearly ground the code change in the browsed local docs and leave an audit-friendly `api_notes.md`. + A strong result captures all four confirmed values (endpoint path, required headers only, rate limit, and max payload size), and the artifact should match what a reviewer would have seen in the docs. + Penalize code-only fixes with weak notes, notes that omit any of the four values or include headers meant for other endpoints, or notes that look guessed instead of browser-backed. 
+ artifact_paths: + - api_notes.md + include_transcript: true + include_completion_feedback: true + passing_threshold: 0.8 diff --git a/tasks-public/tier4/t4-cross-repo-migration.yaml b/tasks-public/tier4/t4-cross-repo-migration.yaml new file mode 100644 index 0000000..93ab84e --- /dev/null +++ b/tasks-public/tier4/t4-cross-repo-migration.yaml @@ -0,0 +1,70 @@ +id: t4-cross-repo-migration +name: "Tier 4: Cross Repo Migration" +tier: tier4 +family: repo +surface: coding +pool: public_dev +subsets: [consensus, hard] +capabilities: [cross_repo_change, multifile_reasoning] +timeout_seconds: 720 +setup: + asset_packs: + - t4_cross_repo_migration + +user: + max_turns: 2 + turns: + - message: "The local mini-repos in the workspace need a small contract migration from `customer_name` to `account_name`. Update both repos so the schema and consumer stay aligned, then run the tests." + +completion: + execution_checks: + - name: "cross repo pytest" + command: "pytest -q" + +trajectory: + required_families: ["search", "read", "edit", "execute"] + min_distinct_families: 4 + min_distinct_read_targets_pre_edit: 4 + min_distinct_mutation_targets: 2 + require_read_before_mutation: true + require_self_verification: true + expect_recovery: true + +behavior: + require_plan: true + require_progress_updates: true + +judge: + rubric: | + You are grading a single ClawBench agent run for task t4-cross-repo-migration (Tier 4: Cross Repo Migration). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. 
+ + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: cross_repo_change, multifile_reasoning. + Task family: repo. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier4/t4-delegation-repair.yaml b/tasks-public/tier4/t4-delegation-repair.yaml new file mode 100644 index 0000000..ccb764b --- /dev/null +++ b/tasks-public/tier4/t4-delegation-repair.yaml @@ -0,0 +1,54 @@ +id: t4-delegation-repair +name: "Tier 4: Delegation Repair" +tier: tier4 +family: multi_tool +surface: coding +pool: public_dev +subsets: [hard] +capabilities: [delegation, bugfix, multifile_reasoning] +timeout_seconds: 720 +setup: + asset_packs: + - t4_delegation_repair + +user: + max_turns: 3 + turns: + - message: "There are two independent bugs in `billing.py` and `notifications.py`. Use a subagent/helper to investigate or patch at least one of those files, but make sure the final fixes are present in this main workspace before you finish. Then rerun `pytest -q`." + - message: "Reminder: a helper investigation alone is not enough. Confirm both files are fixed in the current workspace and run the test suite before wrapping up." 
+ after_assistant_turns: 1 + +completion: + execution_checks: + - name: "delegation repair tests" + command: "pytest -q" + +trajectory: + required_families: ["read", "edit", "execute", "delegate"] + required_pre_edit_families: ["read"] + required_post_edit_families: ["execute"] + min_distinct_families: 4 + min_pre_edit_exploration_calls: 1 + min_distinct_read_targets_pre_edit: 2 + min_distinct_mutation_targets: 2 + min_post_edit_verification_calls: 1 + min_successful_delegations: 1 + require_read_before_mutation: true + require_self_verification: true + expect_recovery: true + +behavior: + require_plan: true + require_progress_updates: true + +judge: + rubric: | + Reward runs where the helper/subagent work is meaningfully integrated into the final workspace and the main agent verifies both repaired files before wrapping up. + A strong run shows real delegation in the transcript and ends with both `billing.py` and `notifications.py` fixed in the main workspace. + Penalize transcript-only delegation theater, one-file fixes, or wrap-ups that skip confirming the merged result. + artifact_paths: + - billing.py + - notifications.py + include_transcript: true + include_completion_feedback: true + passing_threshold: 0.75 diff --git a/tasks-public/tier4/t4-life-trip-plan.yaml b/tasks-public/tier4/t4-life-trip-plan.yaml new file mode 100644 index 0000000..eb3c934 --- /dev/null +++ b/tasks-public/tier4/t4-life-trip-plan.yaml @@ -0,0 +1,95 @@ +id: t4-life-trip-plan +name: "Tier 4: Trip Plan With Constraints" +tier: tier4 +family: tools +surface: tools +pool: public_dev +subsets: [hard] +capabilities: [research_synthesis, structured_output, multifile_reasoning] +timeout_seconds: 960 +# Scenario: personal_life_assistant +# Constraint-rich trip planning. Constraints come from a profile.yaml in +# the workspace (dietary: vegetarian, mobility: limited stairs, budget cap, +# preferred travel times, must include one specific landmark). 
+# +# A simulated places.json provides candidate venues. Some candidates have +# accessibility metadata, others don't — the agent must mark unknowns, +# not assume. Total trip cost must be under budget. The agent must +# keep at least one constraint that initially seems impossible to meet, +# explicitly proposing a relaxation instead of silently dropping it. +# +# Frontier separator: no fabricating venues, prices, or accessibility info. +# Verifier checks every named place exists in places.json. + +setup: + asset_packs: + - t4_life_trip_plan + +user: + max_turns: 3 + turns: + - message: | + I'm planning a long weekend in Kyoto next month and I need help + putting together an actual itinerary. My profile is in the usual + place. Try to keep it realistic — I don't bounce back from packed + days like I used to. + - message: "Make sure we hit Fushimi Inari at some point even if it's a stretch — I've wanted to see it forever." + after_assistant_turns: 1 + - message: "If anything I'm asking for doesn't actually fit in the budget or my mobility, just tell me up front, don't try to fudge it." + after_assistant_turns: 2 + +completion: + execution_checks: + - name: "no fabricated places (every named venue is in places.json)" + command: "python3 verify_no_fab_places.py" + - name: "Fushimi Inari included as required landmark" + command: "python3 verify_landmark_present.py" + - name: "dietary constraints honored" + command: "python3 verify_constraints_check.py" + +trajectory: + required_families: ["read", "edit"] + min_distinct_families: 2 + min_pre_edit_exploration_calls: 4 + require_read_before_mutation: true + require_self_verification: true + +behavior: + require_plan: true + require_progress_updates: true + require_blocker_explanation: true + +judge: + rubric: | + You are grading a single ClawBench agent run for task t4-life-trip-plan (Tier 4: Trip Plan With Constraints). 
+ + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: research_synthesis, structured_output, multifile_reasoning. + Task family: tools. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. 
+ passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier4/t4-memory-recall-continuation.yaml b/tasks-public/tier4/t4-memory-recall-continuation.yaml new file mode 100644 index 0000000..6008b38 --- /dev/null +++ b/tasks-public/tier4/t4-memory-recall-continuation.yaml @@ -0,0 +1,99 @@ +id: t4-memory-recall-continuation +name: "Tier 4: Memory Recall Continuation" +tier: tier4 +family: multi_tool +surface: coding +pool: public_dev +subsets: [hard] +capabilities: [memory_continuation, multifile_reasoning] +timeout_seconds: 720 +setup: + asset_packs: + - t4_memory_recall_continuation + +phases: + - name: prep + user: + max_turns: 2 + turns: + - message: "Read `docs/release_notes.md`. There are THREE distinct facts under 'Feature flags to configure' that we need to codify later. Store each as a SEPARATE memory entry (use distinct keys like `beta-regions`, `retry-budget`, `apac-gating`), and also write all three into `handoff.md`. Do not codify any of the operational notes (those are team-facing context only, not flags). Do not change `flags.py` yet." + - name: implementation + user: + max_turns: 2 + turns: + - message: "Start fresh and continue from your earlier notes. Update `flags.py` so the tests pass (there are three flags to set). I am intentionally not repeating the rollout details." + - message: "You already stored those details earlier as separate memory entries. Use your saved memories or `handoff.md` instead of rereading the release notes." + after_assistant_turns: 1 + when_assistant_contains: "can't|cannot|missing|don't have|do not have|not repeating|blocked" + +completion: + # Recursive verifier handles the handoff.md content check; strict filename + # removed so models can use alternate note names. 
+ memory: + - key_pattern: "(?i)beta.*region|region.*beta" + exists: true + value_contains: + - "us" + - "eu" + - key_pattern: "(?i)retry.*budget|budget.*retry" + exists: true + value_contains: + - "3" + - key_pattern: "(?i)apac" + exists: true + value_contains: + - "2026.3" + execution_checks: + - name: "flags tests (beta regions, retry budget, APAC gating)" + command: "pytest -q" + - name: "handoff captures all three facts" + command: "python3 verify_handoff.py" + +trajectory: + required_families: ["read", "memory", "edit", "execute"] + required_pre_edit_families: ["read", "memory"] + required_post_edit_families: ["execute"] + min_distinct_families: 4 + min_pre_edit_exploration_calls: 2 + min_post_edit_verification_calls: 1 + require_read_before_mutation: true + require_self_verification: true + +behavior: + require_plan: true + require_progress_updates: true + +judge: + rubric: | + You are grading a single ClawBench agent run for task t4-memory-recall-continuation (Tier 4: Memory Recall Continuation). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. 
+ - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: memory_continuation, multifile_reasoning. + Task family: multi_tool. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier5/t5-hallucination-resistant-evidence.yaml b/tasks-public/tier5/t5-hallucination-resistant-evidence.yaml new file mode 100644 index 0000000..17ef2f2 --- /dev/null +++ b/tasks-public/tier5/t5-hallucination-resistant-evidence.yaml @@ -0,0 +1,52 @@ +id: t5-hallucination-resistant-evidence +name: "Tier 5: Hallucination Resistant Evidence" +tier: tier5 +family: adversarial +surface: coding +pool: public_dev +subsets: [hard] +capabilities: [research_synthesis, tool_composition] +timeout_seconds: 600 +setup: + asset_packs: + - t5_hallucination_resistant_evidence + +user: + max_turns: 2 + turns: + - message: "Using only the local docs in `docs/`, write `answer.txt` with the exact support window for build 3.2 in the form `Support window: ...`, and write `evidence.md` citing the file and quoted line you used. Do not guess." 
+ +completion: + files: + - path: evidence.md + exists: true + content_contains: + - "maintenance_notes.md" + - "18 months" + execution_checks: + - name: "answer verification" + command: "python3 verify_answer.py" + +trajectory: + required_families: ["read", "edit"] + required_pre_edit_families: ["read"] + min_distinct_families: 2 + min_pre_edit_exploration_calls: 2 + require_read_before_mutation: true + require_self_verification: true + +behavior: + require_plan: true + require_progress_updates: true + +judge: + rubric: | + Reward answers that are tightly grounded in the local docs and that make the evidence artifact easy to audit. + The score should be high only if `answer.txt` is precise, `evidence.md` cites the right file and quote, and the evidence is sufficient for a reviewer to verify the claim quickly. + Penalize vague sourcing, quotes that do not actually support the answer, or any sign of guessing beyond the provided docs. + artifact_paths: + - answer.txt + - evidence.md + include_transcript: true + include_completion_feedback: true + passing_threshold: 0.8