diff --git a/tasks-public/MANIFEST.yaml b/tasks-public/MANIFEST.yaml
new file mode 100644
index 0000000..e6ba69b
--- /dev/null
+++ b/tasks-public/MANIFEST.yaml
@@ -0,0 +1,220 @@
+manifest_version: 1
+release: clawbench-core-v1
+release_date: 2026-04-20
+benchmark_version: 0.4.0.dev1
+task_count: 19
+source_sweep: v2026-4-19-full
+openclaw_version: 2026.4.15-beta.1
+
+description: |
+ ClawBench Core v1 — a curated subset of 19 tasks from the internal
+ 40-task ClawBench dev pool. Selected so that:
+ (a) all 8 measured frontier models produce the established ranking
+ order in the v4-19-full sweep,
+ (b) coverage is preserved across tiers (1–5) and task families
+ (tools, coding, repo, browser, multi_tool, adversarial),
+ (c) tasks with broken verifiers or near-zero cross-model SNR are
+ dropped.
+
+ Verification: mean run_score across these 19 tasks reproduces the
+ reference ranking with 0 inversions and min adjacent-rank gap of
+ 0.0049 (well above the ~0.002 seed-noise floor).
+
+established_ranking:
+ - rank: 1
+ model: anthropic/claude-opus-4-6
+ display: Claude Opus 4.6
+ score: 0.8137
+ - rank: 2
+ model: anthropic/claude-opus-4-7
+ display: Claude Opus 4.7
+ score: 0.7824
+ - rank: 3
+ model: openai/gpt-5.4
+ display: GPT 5.4
+ score: 0.7647
+ - rank: 4
+ model: anthropic/claude-sonnet-4-6
+ display: Claude Sonnet 4.6
+ score: 0.7597
+ - rank: 5
+ model: openrouter/minimax/minimax-m2.7
+ display: MiniMax M2.7
+ score: 0.7475
+ - rank: 6
+ model: google/gemini-3.1-pro-preview
+ display: Gemini 3.1 Pro
+ score: 0.7408
+ - rank: 7
+ model: openrouter/qwen/qwen3.6-plus
+ display: Qwen 3.6 Plus
+ score: 0.7030
+ - rank: 8
+ model: openrouter/moonshotai/kimi-k2.5
+ display: Kimi K2.5
+ score: 0.6800
+
+coverage:
+  # Counts are derived from the `tasks` list below; keep them in sync with it.
+  tiers:
+    tier1: 2
+    tier2: 6
+    tier3: 5
+    tier4: 5
+    tier5: 1
+  families:
+    tools: 8
+    coding: 2
+    repo: 3
+    browser: 2
+    multi_tool: 3
+    adversarial: 1
+  # Some Tier 3/4 tasks touch more than one family; each task is counted
+  # once, under the `family` key of its entry in `tasks` below.
+
+tasks:
+ - id: t1-bugfix-discount
+ tier: tier1
+ family: coding
+ capabilities: [bugfix]
+ path: tier1/t1-bugfix-discount.yaml
+ asset_pack: t1_bugfix_discount
+
+ - id: t1-fs-quick-note
+ tier: tier1
+ family: tools
+ capabilities: [structured_output]
+ path: tier1/t1-fs-quick-note.yaml
+ asset_pack: t1_fs_quick_note
+
+ - id: t2-add-tests-normalizer
+ tier: tier2
+ family: coding
+ capabilities: [test_authoring]
+ path: tier2/t2-add-tests-normalizer.yaml
+ asset_pack: t2_add_tests_normalizer
+
+ - id: t2-browser-form-fix
+ tier: tier2
+ family: browser
+ capabilities: [browser_debugging, bugfix]
+ path: tier2/t2-browser-form-fix.yaml
+ asset_pack: t2_browser_form_fix
+
+ - id: t2-config-loader
+ tier: tier2
+ family: repo
+ capabilities: [bugfix, multifile_reasoning]
+ path: tier2/t2-config-loader.yaml
+ asset_pack: t2_config_loader
+
+ - id: t2-fs-find-that-thing
+ tier: tier2
+ family: tools
+ capabilities: [structured_output]
+ path: tier2/t2-fs-find-that-thing.yaml
+ asset_pack: t2_fs_find_that_thing
+
+ - id: t2-msg-summarize-thread
+ tier: tier2
+ family: tools
+ capabilities: [research_synthesis, structured_output]
+ path: tier2/t2-msg-summarize-thread.yaml
+ asset_pack: t2_msg_summarize_thread
+
+ - id: t2-priv-redact-doc
+ tier: tier2
+ family: tools
+ capabilities: [structured_output, graceful_refusal]
+ path: tier2/t2-priv-redact-doc.yaml
+ asset_pack: t2_priv_redact_doc
+
+ - id: t3-data-pipeline-report
+ tier: tier3
+ family: multi_tool
+ capabilities: [structured_output, multifile_reasoning]
+ path: tier3/t3-data-pipeline-report.yaml
+ asset_pack: t3_data_pipeline_report
+
+ - id: t3-data-sql-query
+ tier: tier3
+ family: tools
+ capabilities: [structured_output]
+ path: tier3/t3-data-sql-query.yaml
+ asset_pack: t3_data_sql_query
+
+ - id: t3-feature-export
+ tier: tier3
+ family: repo
+ capabilities: [multifile_reasoning, structured_output]
+ path: tier3/t3-feature-export.yaml
+ asset_pack: t3_feature_export
+
+ - id: t3-msg-inbox-triage
+ tier: tier3
+ family: tools
+ capabilities: [structured_output, multifile_reasoning]
+ path: tier3/t3-msg-inbox-triage.yaml
+ asset_pack: t3_msg_inbox_triage
+
+ - id: t3-web-research-and-cite
+ tier: tier3
+ family: tools
+ capabilities: [research_synthesis]
+ path: tier3/t3-web-research-and-cite.yaml
+ asset_pack: t3_web_research_and_cite
+
+ - id: t4-browser-research-and-code
+ tier: tier4
+ family: browser
+ capabilities: [browser_debugging, research_synthesis]
+ path: tier4/t4-browser-research-and-code.yaml
+ asset_pack: t4_browser_research_and_code
+
+ - id: t4-cross-repo-migration
+ tier: tier4
+ family: repo
+ capabilities: [cross_repo_change, multifile_reasoning]
+ path: tier4/t4-cross-repo-migration.yaml
+ asset_pack: t4_cross_repo_migration
+
+ - id: t4-delegation-repair
+ tier: tier4
+ family: multi_tool
+ capabilities: [delegation, bugfix]
+ path: tier4/t4-delegation-repair.yaml
+ asset_pack: t4_delegation_repair
+
+ - id: t4-life-trip-plan
+ tier: tier4
+ family: tools
+ capabilities: [research_synthesis, structured_output]
+ path: tier4/t4-life-trip-plan.yaml
+ asset_pack: t4_life_trip_plan
+
+ - id: t4-memory-recall-continuation
+ tier: tier4
+ family: multi_tool
+ capabilities: [memory_continuation, multifile_reasoning]
+ path: tier4/t4-memory-recall-continuation.yaml
+ asset_pack: t4_memory_recall_continuation
+
+ - id: t5-hallucination-resistant-evidence
+ tier: tier5
+ family: adversarial
+ capabilities: [research_synthesis, tool_composition]
+ path: tier5/t5-hallucination-resistant-evidence.yaml
+ asset_pack: t5_hallucination_resistant_evidence
+
+notes: |
+ - The full private dev set (tasks/) contains 40 tasks. This Core-19
+ subset is the signal-rich, ranking-consistent public release.
+ - Additional 21 tasks are retained as a private holdout for
+ contamination-resistant measurement of future models.
+ - Task families "creative" and "long-horizon (Tier 6)" are absent
+ from Core v1; planned for a future release.
+ - Known caveats: t4-memory-recall-continuation has a verifier that
+ penalizes agents that respond in conversation rather than via file
+ artifacts. All models face the same verifier, so the comparison is
+ internally fair, but absolute scores understate capability.
+ - t5-hallucination-resistant-evidence has low cross-model SNR (about
+ 0.25) in v4-19-full; included for adversarial-family coverage
+ despite this. Consider upgrading verifier in a future release.
diff --git a/tasks-public/README.md b/tasks-public/README.md
new file mode 100644
index 0000000..8301cd9
--- /dev/null
+++ b/tasks-public/README.md
@@ -0,0 +1,132 @@
+# ClawBench Core v1 — Public Task Set (19 tasks)
+
+A curated 19-task subset of the full ClawBench v0.4.0.dev1 dev pool,
+selected for ranking consistency and capability coverage.
+
+## What this is
+
+19 tasks, 3 runs each → 57 runs per model. About half the compute of
+the full 40-task sweep, with no loss of discriminative power on the
+measured 8-model panel.
+
+Derived from the v2026-4-19-full sweep archive by greedy task
+selection: iteratively drop tasks that either (a) introduce ranking
+inversions vs the reference ordering or (b) have near-zero cross-model
+SNR and add only noise.
+
+## Established ranking (from v4-19-full sweep)
+
+Mean run_score across the 19 tasks:
+
+| Rank | Model | Score |
+|:---:|---|:---:|
+| 1 | Claude Opus 4.6 | 0.8137 |
+| 2 | Claude Opus 4.7 | 0.7824 |
+| 3 | GPT 5.4 | 0.7647 |
+| 4 | Claude Sonnet 4.6 | 0.7597 |
+| 5 | MiniMax M2.7 | 0.7475 |
+| 6 | Gemini 3.1 Pro | 0.7408 |
+| 7 | Qwen 3.6 Plus | 0.7030 |
+| 8 | Kimi K2.5 | 0.6800 |
+
+- **0 ranking inversions** on the 19-task mean.
+- **Min adjacent-rank gap: 0.0049** (well above the ~0.002 seed-noise
+ floor estimated from inter-run variance).
+- **Top-to-bottom spread: 0.134** (vs 0.097 for smaller robust sets).
+
+## Coverage
+
+| Dimension | Breakdown |
+|---|---|
+| Tiers | T1=2, T2=6, T3=5, T4=5, T5=1 |
+| Families | tools=8, coding=2, repo=3, browser=2, multi_tool=3, adversarial=1 |
+| Capabilities | bugfix, refactor, test_authoring, multifile_reasoning, browser_debugging, structured_output, graceful_refusal, delegation, tool_composition, research_synthesis, cross_repo_change, memory_continuation |
+
+## Directory layout
+
+```
+tasks-public/
+├── MANIFEST.yaml # Machine-readable task list + metadata
+├── README.md # This file
+├── tier1/ # 2 task YAMLs
+├── tier2/ # 6 task YAMLs
+├── tier3/ # 5 task YAMLs
+├── tier4/ # 5 task YAMLs
+├── tier5/ # 1 task YAML
+└── assets/ # 19 asset packs (verifier scripts + fixtures)
+```
+
+## How to run Core v1
+
+Using the ClawBench harness:
+
+```bash
+# Explicit task-by-task (pass -t for each of 19 tasks):
+clawbench run \
+ --model anthropic/claude-opus-4-6 \
+ --runs 3 \
+ --concurrency 4 \
+ --profile profiles/frontier_opus_4_6.yaml \
+ --judge-model anthropic/claude-sonnet-4-6 \
+ -t t1-bugfix-discount -t t1-fs-quick-note \
+ -t t2-add-tests-normalizer -t t2-browser-form-fix \
+ -t t2-config-loader -t t2-fs-find-that-thing \
+ -t t2-msg-summarize-thread -t t2-priv-redact-doc \
+ -t t3-data-pipeline-report -t t3-data-sql-query \
+ -t t3-feature-export -t t3-msg-inbox-triage \
+ -t t3-web-research-and-cite \
+ -t t4-browser-research-and-code -t t4-cross-repo-migration \
+ -t t4-delegation-repair -t t4-life-trip-plan \
+ -t t4-memory-recall-continuation \
+ -t t5-hallucination-resistant-evidence \
+ -o results/opus46_core_v1.json
+```
+
+Or point the harness at this directory by setting the task root in
+your ClawBench config. See MANIFEST.yaml for a programmatic list.
+
+## Reproducibility caveats
+
+- **Exact score reproduction is not guaranteed.** Even with the same
+ OpenClaw version, re-runs exhibit seed noise (~0.02 stddev per task,
+ per model). Rankings are stable; absolute scores drift within that
+ envelope.
+- **OpenRouter-routed models** (`openrouter/*`) can have their
+ scores shift if OpenRouter repoints its model slug to a different
+ underlying provider. We observed this with GLM 5.1 between
+ 2026-04-20 14:00 and 17:00 PST. Pin to canonical model versions
+ (e.g. `z-ai/glm-5-turbo-20260315`) for stable measurement.
+- **OpenClaw platform version matters.** Upgrading from 4.9 → 4.15-beta.1
+ shifted scores by +0.13 to +0.29 across models. Pin via Docker tag.
+- **Judge scores** come from Claude Sonnet 4.6 via direct Anthropic
+ API (with a fallback from the gateway judge). Scores assume the
+ judge is working correctly; re-judging broken runs may be required
+ (see `scripts/rejudge_all.py` in the main repo).
+
+## What's NOT in Core v1
+
+21 tasks from the full dev pool are held back:
+- **9 ceiling tasks** (all frontier models score >0.85) — don't
+ discriminate, future releases may phase them out.
+- **9 noise tasks** (cross-model SNR < 0.5) — either broken verifiers
+ or genuinely ambiguous prompts. Scheduled for redesign.
+- **3 ranking-breaker tasks** — tasks where the cross-model ordering
+ conflicts with the reference ranking (e.g. `t2-node-search-patch`,
+ `t5-contradictory-requirements`). Not broken per se; just
+ inconsistent with the headline.
+
+Also missing entirely from Core v1:
+- **Tier 6 long-horizon (100+ turn) tasks** — planned for v2.
+- **Creative synthesis / style-matching tasks** — planned for v2.
+- **Paraphrased prompt pairs** for perturbation-sensitivity
+ measurement — planned for v2.
+
+## Versioning
+
+| Version | Tasks | Change |
+|:---:|:---:|---|
+| Core v1 | 19 | Initial public release (this) |
+| Core v2 | ~24 | Planned: +Tier 6, +paraphrase pairs, -2 noise tasks |
+
+Pin to `clawbench-core-v1` in the MANIFEST for reproducible
+comparison across releases.
diff --git a/tasks-public/assets/t1_bugfix_discount/cart.py b/tasks-public/assets/t1_bugfix_discount/cart.py
new file mode 100644
index 0000000..627f4a5
--- /dev/null
+++ b/tasks-public/assets/t1_bugfix_discount/cart.py
@@ -0,0 +1,6 @@
+from pricing import apply_discount
+
+
+def checkout_total(subtotal: int, discount_percent: int) -> int:
+ return apply_discount(subtotal, discount_percent)
+
diff --git a/tasks-public/assets/t1_bugfix_discount/pricing.py b/tasks-public/assets/t1_bugfix_discount/pricing.py
new file mode 100644
index 0000000..448f5ce
--- /dev/null
+++ b/tasks-public/assets/t1_bugfix_discount/pricing.py
@@ -0,0 +1,4 @@
+def apply_discount(subtotal_cents: int, discount_percent: int) -> int:
+    """Return ``subtotal_cents`` after applying ``discount_percent``.
+
+    The BUG note below records how the current computation differs from
+    a percentage-of-subtotal discount.
+    """
+    # BUG: this subtracts the raw percent value instead of a percentage of the subtotal.
+    return subtotal_cents - discount_percent
+
diff --git a/tasks-public/assets/t1_bugfix_discount/tests/test_pricing.py b/tasks-public/assets/t1_bugfix_discount/tests/test_pricing.py
new file mode 100644
index 0000000..9f1c7c0
--- /dev/null
+++ b/tasks-public/assets/t1_bugfix_discount/tests/test_pricing.py
@@ -0,0 +1,10 @@
+from cart import checkout_total
+
+
+def test_percentage_discount_applies_to_full_subtotal():
+ assert checkout_total(2_000, 10) == 1_800
+
+
+def test_zero_discount_keeps_subtotal():
+ assert checkout_total(1_250, 0) == 1_250
+
diff --git a/tasks-public/assets/t1_fs_quick_note/notes/.gitkeep b/tasks-public/assets/t1_fs_quick_note/notes/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/tasks-public/assets/t1_fs_quick_note/verify_list_structure.py b/tasks-public/assets/t1_fs_quick_note/verify_list_structure.py
new file mode 100644
index 0000000..a86e4d2
--- /dev/null
+++ b/tasks-public/assets/t1_fs_quick_note/verify_list_structure.py
@@ -0,0 +1,57 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (
+ "verify_", "/.git/", "/.openclaw/",
+ "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+ "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+ ".jsonl", ".html", ".sh", ".py")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+ for path in root.rglob("*"):
+ if not path.is_file():
+ continue
+ sp = str(path)
+ if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+ continue
+ if path.suffix.lower() not in TEXT_SUFFIXES:
+ continue
+ try:
+ yield path, path.read_text(encoding="utf-8", errors="ignore")
+ except Exception:
+ continue
+
+
+def workspace_blob() -> str:
+ return "\n".join(text for _, text in iter_workspace_text_files())
+
+
+import re
+
+LIST_PATTERNS = [
+ re.compile(r"^\s*[-*+]\s+"),
+ re.compile(r"^\s*\d+[.)]\s+"),
+ re.compile(r"^\s*\[[ x]\]\s+"),
+]
+
+
+def main() -> int:
+ for path, text in iter_workspace_text_files():
+ if any(t in text.lower() for t in ("dry clean", "sam", "babysit", "60")):
+ list_lines = sum(1 for line in text.splitlines() if any(p.match(line) for p in LIST_PATTERNS))
+ if list_lines >= 3:
+ print(f"PASS: list-formatted note found at {path} ({list_lines} list lines)")
+ return 0
+ print("FAIL: no list-structured note found anywhere in workspace")
+ return 1
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/tasks-public/assets/t1_fs_quick_note/verify_three_items.py b/tasks-public/assets/t1_fs_quick_note/verify_three_items.py
new file mode 100644
index 0000000..50d073c
--- /dev/null
+++ b/tasks-public/assets/t1_fs_quick_note/verify_three_items.py
@@ -0,0 +1,56 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (
+ "verify_", "/.git/", "/.openclaw/",
+ "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+ "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+ ".jsonl", ".html", ".sh", ".py")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+ for path in root.rglob("*"):
+ if not path.is_file():
+ continue
+ sp = str(path)
+ if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+ continue
+ if path.suffix.lower() not in TEXT_SUFFIXES:
+ continue
+ try:
+ yield path, path.read_text(encoding="utf-8", errors="ignore")
+ except Exception:
+ continue
+
+
+def workspace_blob() -> str:
+ return "\n".join(text for _, text in iter_workspace_text_files())
+
+
+def main() -> int:
+ blob = workspace_blob().lower()
+ if not blob:
+ print("FAIL: workspace contains no agent-written text files")
+ return 1
+ needed = ['dry clean', 'sam', 'babysit']
+ if not all(s in blob for s in needed):
+ missing = [s for s in needed if s not in blob]
+ print(f"FAIL: workspace missing required content: {missing}")
+ return 1
+ needed = ['60']
+ if not all(s in blob for s in needed):
+ missing = [s for s in needed if s not in blob]
+ print(f"FAIL: workspace missing required content: {missing}")
+ return 1
+ print("PASS: t1_fs_quick_note/verify_three_items.py")
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/tasks-public/assets/t2_add_tests_normalizer/normalizer.py b/tasks-public/assets/t2_add_tests_normalizer/normalizer.py
new file mode 100644
index 0000000..c0474e2
--- /dev/null
+++ b/tasks-public/assets/t2_add_tests_normalizer/normalizer.py
@@ -0,0 +1,14 @@
+import re
+
+# Characters in the range U+1F300 through U+1FAFF are removed from titles.
+EMOJI_RE = re.compile(r"[\U0001F300-\U0001FAFF]")
+
+
+def normalize_title(text: str) -> str:
+    """Return ``text`` with whitespace runs collapsed, emoji removed, and title-cased.
+
+    Args:
+        text: Raw title string.
+
+    Returns:
+        The cleaned title, stripped of leading/trailing whitespace and
+        passed through ``str.title``.
+    """
+    cleaned = " ".join(text.split())
+    # Emoji are removed after the whitespace collapse, so an emoji that sat
+    # between two words leaves both of its surrounding spaces in the result.
+    cleaned = EMOJI_RE.sub("", cleaned)
+    return cleaned.strip().title()
+
+
+def normalize_tags(raw: str) -> list[str]:
+ return [part.strip().lower() for part in raw.split(",") if part.strip()]
+
diff --git a/tasks-public/assets/t2_add_tests_normalizer/verify_added_tests.py b/tasks-public/assets/t2_add_tests_normalizer/verify_added_tests.py
new file mode 100644
index 0000000..94e94c9
--- /dev/null
+++ b/tasks-public/assets/t2_add_tests_normalizer/verify_added_tests.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+import subprocess
+import sys
+from pathlib import Path
+
+
+# Mutant replacement for normalizer.py: normalize_title omits the
+# EMOJI_RE.sub step, so emoji are no longer stripped from titles.
+BUGGY_EMOJI = """import re
+
+EMOJI_RE = re.compile(r"[\\U0001F300-\\U0001FAFF]")
+
+
+def normalize_title(text: str) -> str:
+ cleaned = " ".join(text.split())
+ return cleaned.strip().title()
+
+
+def normalize_tags(raw: str) -> list[str]:
+ return [part.strip().lower() for part in raw.split(",") if part.strip()]
+"""
+
+# Mutant replacement for normalizer.py: normalize_tags omits the
+# `if part.strip()` filter, so blank entries are kept as empty strings.
+BUGGY_TAGS = """import re
+
+EMOJI_RE = re.compile(r"[\\U0001F300-\\U0001FAFF]")
+
+
+def normalize_title(text: str) -> str:
+ cleaned = " ".join(text.split())
+ cleaned = EMOJI_RE.sub("", cleaned)
+ return cleaned.strip().title()
+
+
+def normalize_tags(raw: str) -> list[str]:
+ return [part.strip().lower() for part in raw.split(",")]
+"""
+
+
+def _run_pytest(*args: str) -> subprocess.CompletedProcess[str]:
+ return subprocess.run(
+ [sys.executable, "-m", "pytest", "-q", *args],
+ check=False,
+ capture_output=True,
+ text=True,
+ )
+
+
+def _expect_mutant_failure(normalizer_path: Path, mutant_source: str, label: str) -> None:
+ backup = normalizer_path.read_text(encoding="utf-8")
+ normalizer_path.write_text(mutant_source, encoding="utf-8")
+ try:
+ result = _run_pytest("tests/test_normalizer.py")
+ assert result.returncode != 0, f"student tests did not catch mutant: {label}"
+ finally:
+ normalizer_path.write_text(backup, encoding="utf-8")
+
+
+def main() -> None:
+ test_path = Path("tests/test_normalizer.py")
+ assert test_path.exists(), "tests/test_normalizer.py is missing"
+
+ baseline = _run_pytest()
+ assert baseline.returncode == 0, baseline.stdout + baseline.stderr
+
+ normalizer_path = Path("normalizer.py")
+ _expect_mutant_failure(normalizer_path, BUGGY_EMOJI, "emoji stripping")
+ _expect_mutant_failure(normalizer_path, BUGGY_TAGS, "blank tag handling")
+
+ source = test_path.read_text(encoding="utf-8").lower()
+ assert "normalize_title" in source
+ assert "normalize_tags" in source
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tasks-public/assets/t2_browser_form_fix/app.js b/tasks-public/assets/t2_browser_form_fix/app.js
new file mode 100644
index 0000000..0559355
--- /dev/null
+++ b/tasks-public/assets/t2_browser_form_fix/app.js
@@ -0,0 +1,16 @@
+// Client-side handling for the signup form: look up the form, the email
+// input, and the status node by element id.
+const form = document.getElementById("contact-formm");
+const emailInput = document.getElementById("email");
+const statusNode = document.getElementById("status");
+
+// The submit handler is attached only when the form lookup returned an element.
+if (form) {
+  form.addEventListener("submit", (event) => {
+    // Handle the submission in-page instead of letting the browser submit it.
+    event.preventDefault();
+    const email = emailInput.value.trim();
+    // Minimal validity check: the trimmed value must contain "@".
+    if (!email.includes("@")) {
+      statusNode.textContent = "Enter a valid email.";
+      return;
+    }
+    statusNode.textContent = `Saved ${email}`;
+  });
+}
+
diff --git a/tasks-public/assets/t2_browser_form_fix/index.html b/tasks-public/assets/t2_browser_form_fix/index.html
new file mode 100644
index 0000000..b1d64df
--- /dev/null
+++ b/tasks-public/assets/t2_browser_form_fix/index.html
@@ -0,0 +1,20 @@
+
+
+
+
+ Newsletter Signup
+
+
+
+
+ Join the Newsletter
+
+
+
+
+
+
diff --git a/tasks-public/assets/t2_browser_form_fix/serve.py b/tasks-public/assets/t2_browser_form_fix/serve.py
new file mode 100644
index 0000000..9eec359
--- /dev/null
+++ b/tasks-public/assets/t2_browser_form_fix/serve.py
@@ -0,0 +1,21 @@
+from __future__ import annotations
+
+import os
+from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
+
+
+class Handler(SimpleHTTPRequestHandler):
+ def do_GET(self) -> None: # noqa: N802
+ if self.path == "/health":
+ self.send_response(200)
+ self.end_headers()
+ self.wfile.write(b"ok")
+ return
+ return super().do_GET()
+
+
+if __name__ == "__main__":
+ port = int(os.environ.get("PORT", "8123"))
+ server = ThreadingHTTPServer(("127.0.0.1", port), Handler)
+ server.serve_forever()
+
diff --git a/tasks-public/assets/t2_browser_form_fix/verify_form.cjs b/tasks-public/assets/t2_browser_form_fix/verify_form.cjs
new file mode 100644
index 0000000..b839c61
--- /dev/null
+++ b/tasks-public/assets/t2_browser_form_fix/verify_form.cjs
@@ -0,0 +1,23 @@
+const { chromium } = require("playwright");
+
+async function main() {
+ const url = process.argv[2];
+ const browser = await chromium.launch({ headless: true });
+ const page = await browser.newPage();
+ await page.goto(url, { waitUntil: "networkidle" });
+ await page.fill("#email", "reader@example.com");
+ await page.click("#submit-button");
+ await page.waitForFunction(() => document.querySelector("#status").textContent.includes("Saved"), null, {
+ timeout: 3000,
+ });
+ const status = await page.textContent("#status");
+ await browser.close();
+ if (status.trim() !== "Saved reader@example.com") {
+ throw new Error(`Unexpected status: ${status}`);
+ }
+}
+
+main().catch((error) => {
+ console.error(error.message || String(error));
+ process.exit(1);
+});
diff --git a/tasks-public/assets/t2_config_loader/app_config.py b/tasks-public/assets/t2_config_loader/app_config.py
new file mode 100644
index 0000000..0ac5c48
--- /dev/null
+++ b/tasks-public/assets/t2_config_loader/app_config.py
@@ -0,0 +1,6 @@
+# Baseline settings; config_loader.load_config starts from a copy of this
+# mapping before applying any overrides.
+DEFAULTS = {
+    "host": "127.0.0.1",
+    "port": 8080,
+    "debug": False,
+}
+
diff --git a/tasks-public/assets/t2_config_loader/config_loader.py b/tasks-public/assets/t2_config_loader/config_loader.py
new file mode 100644
index 0000000..3c7f7c0
--- /dev/null
+++ b/tasks-public/assets/t2_config_loader/config_loader.py
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+
+from app_config import DEFAULTS
+
+
+def load_config(path: str | None = None) -> dict[str, object]:
+    """Build the effective configuration mapping.
+
+    Starts from a copy of ``DEFAULTS``, merges the JSON file at ``path``
+    when one is given, then consults the ``APP_PORT`` and ``APP_DEBUG``
+    environment variables.
+
+    Args:
+        path: Optional path to a UTF-8 JSON file with overriding values.
+
+    Returns:
+        A new dict; ``DEFAULTS`` itself is not modified.
+    """
+    config = dict(DEFAULTS)
+    if path:
+        config.update(json.loads(Path(path).read_text(encoding="utf-8")))
+    # BUG: file values incorrectly win over environment overrides.
+    if "APP_PORT" in os.environ and path:
+        config["port"] = json.loads(Path(path).read_text(encoding="utf-8")).get("port", DEFAULTS["port"])
+    if "APP_DEBUG" in os.environ:
+        config["debug"] = os.environ["APP_DEBUG"]
+    return config
+
diff --git a/tasks-public/assets/t2_config_loader/tests/test_config_loader.py b/tasks-public/assets/t2_config_loader/tests/test_config_loader.py
new file mode 100644
index 0000000..b227ce5
--- /dev/null
+++ b/tasks-public/assets/t2_config_loader/tests/test_config_loader.py
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+import json
+
+from config_loader import load_config
+
+
+def test_env_port_overrides_file(tmp_path, monkeypatch):
+ config_path = tmp_path / "config.json"
+ config_path.write_text(json.dumps({"port": 9000, "debug": False}), encoding="utf-8")
+ monkeypatch.setenv("APP_PORT", "9200")
+ cfg = load_config(str(config_path))
+ assert cfg["port"] == 9200
+
+
+def test_debug_flag_is_boolean(monkeypatch):
+ monkeypatch.setenv("APP_DEBUG", "true")
+ cfg = load_config(None)
+ assert cfg["debug"] is True
+
diff --git a/tasks-public/assets/t2_fs_find_that_thing/.correct_filename.txt b/tasks-public/assets/t2_fs_find_that_thing/.correct_filename.txt
new file mode 100644
index 0000000..edc85c6
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/.correct_filename.txt
@@ -0,0 +1 @@
+q3_marketing_budget_v3.xlsx
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_1.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_1.txt
new file mode 100644
index 0000000..6aba593
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_1.txt
@@ -0,0 +1 @@
+filler 1
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_10.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_10.txt
new file mode 100644
index 0000000..9818d50
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_10.txt
@@ -0,0 +1 @@
+filler 10
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_11.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_11.txt
new file mode 100644
index 0000000..22c8f8d
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_11.txt
@@ -0,0 +1 @@
+filler 11
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_12.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_12.txt
new file mode 100644
index 0000000..ab2924d
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_12.txt
@@ -0,0 +1 @@
+filler 12
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_13.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_13.txt
new file mode 100644
index 0000000..2e4656e
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_13.txt
@@ -0,0 +1 @@
+filler 13
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_14.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_14.txt
new file mode 100644
index 0000000..2f6e834
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_14.txt
@@ -0,0 +1 @@
+filler 14
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_15.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_15.txt
new file mode 100644
index 0000000..204e7a6
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_15.txt
@@ -0,0 +1 @@
+filler 15
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_16.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_16.txt
new file mode 100644
index 0000000..bff1b76
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_16.txt
@@ -0,0 +1 @@
+filler 16
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_17.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_17.txt
new file mode 100644
index 0000000..0e910f0
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_17.txt
@@ -0,0 +1 @@
+filler 17
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_18.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_18.txt
new file mode 100644
index 0000000..b003e84
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_18.txt
@@ -0,0 +1 @@
+filler 18
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_19.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_19.txt
new file mode 100644
index 0000000..c5dff1b
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_19.txt
@@ -0,0 +1 @@
+filler 19
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_2.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_2.txt
new file mode 100644
index 0000000..bed6718
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_2.txt
@@ -0,0 +1 @@
+filler 2
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_20.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_20.txt
new file mode 100644
index 0000000..a64b357
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_20.txt
@@ -0,0 +1 @@
+filler 20
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_21.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_21.txt
new file mode 100644
index 0000000..3e25237
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_21.txt
@@ -0,0 +1 @@
+filler 21
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_22.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_22.txt
new file mode 100644
index 0000000..10490cd
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_22.txt
@@ -0,0 +1 @@
+filler 22
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_23.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_23.txt
new file mode 100644
index 0000000..c850d4f
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_23.txt
@@ -0,0 +1 @@
+filler 23
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_24.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_24.txt
new file mode 100644
index 0000000..d260084
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_24.txt
@@ -0,0 +1 @@
+filler 24
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_25.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_25.txt
new file mode 100644
index 0000000..2dd16e0
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_25.txt
@@ -0,0 +1 @@
+filler 25
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_3.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_3.txt
new file mode 100644
index 0000000..f787b2a
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_3.txt
@@ -0,0 +1 @@
+filler 3
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_4.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_4.txt
new file mode 100644
index 0000000..9430fdb
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_4.txt
@@ -0,0 +1 @@
+filler 4
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_5.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_5.txt
new file mode 100644
index 0000000..b6a9ec7
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_5.txt
@@ -0,0 +1 @@
+filler 5
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_6.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_6.txt
new file mode 100644
index 0000000..6a1cd0c
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_6.txt
@@ -0,0 +1 @@
+filler 6
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_7.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_7.txt
new file mode 100644
index 0000000..c87673b
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_7.txt
@@ -0,0 +1 @@
+filler 7
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_8.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_8.txt
new file mode 100644
index 0000000..8e9b634
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_8.txt
@@ -0,0 +1 @@
+filler 8
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_9.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_9.txt
new file mode 100644
index 0000000..b73e005
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_9.txt
@@ -0,0 +1 @@
+filler 9
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/q2_marketing_budget.xlsx b/tasks-public/assets/t2_fs_find_that_thing/Documents/q2_marketing_budget.xlsx
new file mode 100644
index 0000000..3cf919c
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/q2_marketing_budget.xlsx
@@ -0,0 +1,4 @@
+SHEET: Q2 Marketing Budget
+Region,Q2 Spend
+NorthAmerica,380000
+TOTAL,820000
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_marketing_budget_v3.xlsx b/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_marketing_budget_v3.xlsx
new file mode 100644
index 0000000..36c7487
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_marketing_budget_v3.xlsx
@@ -0,0 +1,8 @@
+SHEET: Regional Breakdown
+Q3 Marketing Budget by Region
+Region,Q3 Spend,Notes
+NorthAmerica,420000,Display + paid social
+EMEA,310000,Conference sponsorships
+APAC,180000,Influencer pilot
+LATAM,90000,Brand awareness
+TOTAL,1000000
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_sales_breakdown.xlsx b/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_sales_breakdown.xlsx
new file mode 100644
index 0000000..82a38ec
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_sales_breakdown.xlsx
@@ -0,0 +1,4 @@
+SHEET: Q3 Sales Numbers
+Region,Q3 Revenue
+NorthAmerica,2400000
+TOTAL,5800000
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_1.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_1.pdf
new file mode 100644
index 0000000..6aba593
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_1.pdf
@@ -0,0 +1 @@
+filler 1
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_10.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_10.pdf
new file mode 100644
index 0000000..9818d50
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_10.pdf
@@ -0,0 +1 @@
+filler 10
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_2.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_2.pdf
new file mode 100644
index 0000000..bed6718
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_2.pdf
@@ -0,0 +1 @@
+filler 2
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_3.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_3.pdf
new file mode 100644
index 0000000..f787b2a
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_3.pdf
@@ -0,0 +1 @@
+filler 3
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_4.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_4.pdf
new file mode 100644
index 0000000..9430fdb
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_4.pdf
@@ -0,0 +1 @@
+filler 4
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_5.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_5.pdf
new file mode 100644
index 0000000..b6a9ec7
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_5.pdf
@@ -0,0 +1 @@
+filler 5
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_6.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_6.pdf
new file mode 100644
index 0000000..6a1cd0c
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_6.pdf
@@ -0,0 +1 @@
+filler 6
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_7.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_7.pdf
new file mode 100644
index 0000000..c87673b
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_7.pdf
@@ -0,0 +1 @@
+filler 7
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_8.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_8.pdf
new file mode 100644
index 0000000..8e9b634
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_8.pdf
@@ -0,0 +1 @@
+filler 8
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_9.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_9.pdf
new file mode 100644
index 0000000..b73e005
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_9.pdf
@@ -0,0 +1 @@
+filler 9
diff --git a/tasks-public/assets/t2_fs_find_that_thing/verify_correct_file.py b/tasks-public/assets/t2_fs_find_that_thing/verify_correct_file.py
new file mode 100644
index 0000000..48af355
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/verify_correct_file.py
@@ -0,0 +1,76 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (
+ "verify_", "/.git/", "/.openclaw/",
+ "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+ "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+ ".jsonl", ".html", ".sh", ".py")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield (path, text) for each non-excluded file under root with a text suffix."""
+    for path in root.rglob("*"):
+        if not path.is_file() or path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        # Anchor with "/": rglob(".") yields ".git/x", which "/.git/" would miss.
+        anchored = "/" + path.as_posix()
+        if any(frag in anchored for frag in EXCLUDE_FRAGMENTS):
+            continue
+        try:
+            yield path, path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            continue
+
+
+def workspace_blob() -> str:  # every scanned file's text, joined by newlines
+    return "\n".join(body for _path, body in iter_workspace_text_files())
+
+
+def main() -> int:
+    """Return 0 if the agent surfaced the Q3 marketing budget content, else 1."""
+    target_substrings = ["q3", "region", "marketing"]
+    decoys = ["q2 marketing", "q2 spend", "q3 revenue", "q3 sales"]
+
+    def candidates():
+        # TEXT_SUFFIXES has no ".xlsx", so a plain copy of the budget (the
+        # explicit desktop target) was never scanned; include those files too.
+        yield from iter_workspace_text_files()
+        for sheet in Path(".").rglob("*.xlsx"):
+            try:
+                yield sheet, sheet.read_text(encoding="utf-8", errors="ignore")
+            except OSError:
+                continue
+
+    found_path = None
+    for path, text in candidates():
+        # Skip the asset-pack original. Test path *parts*: a relative path such
+        # as "Documents/x.xlsx" never contains the substring "/Documents/".
+        if "Documents" in path.parts and "v3" in path.name:
+            continue
+        text_lower = text.lower()
+        if all(s in text_lower for s in target_substrings) and not any(
+                d in text_lower for d in decoys):
+            found_path = path
+            break
+
+    # Also accept agent text output (e.g. answer.md) that just NAMES the
+    # right file
+    if found_path is None:
+        found_path = next((p for p, t in iter_workspace_text_files()
+                           if "q3_marketing_budget_v3" in t.lower()), None)
+    if found_path is None:
+        print("FAIL: agent did not surface the correct Q3 marketing budget file")
+        return 1
+    print(f"PASS: agent surfaced Q3 marketing budget content at/in {found_path}")
+    return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/tasks-public/assets/t2_msg_summarize_thread/thread.txt b/tasks-public/assets/t2_msg_summarize_thread/thread.txt
new file mode 100644
index 0000000..4a52e27
--- /dev/null
+++ b/tasks-public/assets/t2_msg_summarize_thread/thread.txt
@@ -0,0 +1,29 @@
+Channel: #design-redesign
+Date range: 2026-04-05 to 2026-04-08
+
+[Apr 5 09:14] Marcus: Quick proposal — for the homepage refresh, let's go with option A (single hero image, no carousel). Carousels test poorly.
+[Apr 5 09:18] Priya: I'm fine with A. Anything but the auto-rotating mess we have today.
+[Apr 5 09:22] Sam: Agree on A. Carousels are a UX antipattern.
+[Apr 5 09:30] Marcus: Cool, let's call it. Option A it is. I'll spec it out.
+[Apr 5 10:01] Priya: For typography, can we move to Inter? Easier reading and we already license it.
+[Apr 5 10:15] Sam: +1 Inter
+[Apr 5 11:42] Marcus: Inter approved. I'll add it to the spec.
+[Apr 6 08:55] Priya: Wait, on the homepage hero — I'm second-guessing this. What if we did option B (two-column with icon row) instead? It gives more above-the-fold info.
+[Apr 6 09:20] Marcus: Fair point. Let me think.
+[Apr 6 10:30] Sam: I prefer B too actually. More info density.
+[Apr 6 13:15] Marcus: OK I'm convinced. Switching to option B. Scratch yesterday's call. Final answer: B.
+[Apr 6 14:00] Sam: Great. So B for hero, Inter for type.
+[Apr 6 16:10] Priya: For the CTA button color, sticking with our brand orange right? #FF6B35.
+[Apr 6 16:14] Marcus: Yes brand orange. Don't touch the brand colors.
+[Apr 7 09:00] zhentongfan: Catching up on this thread — sounds like option B is locked in. I can take the spec writeup if Marcus is busy.
+[Apr 7 09:05] Marcus: Thanks zhentongfan, that'd be great. I owe you one.
+[Apr 7 09:30] zhentongfan: I'll have a draft by end of day Friday.
+[Apr 7 11:20] Priya: Open question — what happens to the testimonial section? Option B doesn't have a slot for it.
+[Apr 7 11:25] Sam: Good catch. Move it below the fold? Or kill it?
+[Apr 7 11:30] Priya: I'd vote move below the fold, not kill. Sales team will riot if we kill testimonials.
+[Apr 7 14:40] Marcus: Let's keep testimonials, just below the fold. Not killing them.
+[Apr 7 15:00] Sam: Open question still — what's the mobile breakpoint going to be?
+[Apr 7 15:30] Marcus: Open question for now. Let's defer to next sprint.
+[Apr 8 10:15] Priya: One more — favicon update? The current one is from 2019.
+[Apr 8 10:20] Sam: Lol yes please. Open item.
+[Apr 8 11:00] Marcus: Adding favicon to the followup list. Open question: who owns the asset.
diff --git a/tasks-public/assets/t2_msg_summarize_thread/verify_commitments.py b/tasks-public/assets/t2_msg_summarize_thread/verify_commitments.py
new file mode 100644
index 0000000..61babe1
--- /dev/null
+++ b/tasks-public/assets/t2_msg_summarize_thread/verify_commitments.py
@@ -0,0 +1,54 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (
+ "verify_", "/.git/", "/.openclaw/",
+ "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+ "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+ ".jsonl", ".html", ".sh", ".py")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield (path, text) for each non-excluded file under root with a text suffix."""
+    for path in root.rglob("*"):
+        if not path.is_file() or path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        # Anchor with "/": rglob(".") yields ".git/x", which "/.git/" would miss.
+        anchored = "/" + path.as_posix()
+        if any(frag in anchored for frag in EXCLUDE_FRAGMENTS):
+            continue
+        try:
+            yield path, path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            continue
+
+
+def workspace_blob() -> str:  # every scanned file's text, joined by newlines
+    return "\n".join(body for _path, body in iter_workspace_text_files())
+
+
+def main() -> int:
+    """Return 0 if agent-written text records the spec-writeup commitment, else 1."""
+    # thread.txt is the task input and itself contains "spec" and "Friday";
+    # scanning it would let an untouched workspace pass, so leave it out.
+    blob = "\n".join(text for path, text in iter_workspace_text_files()
+                     if path.name != "thread.txt").lower()
+    if not blob:
+        print("FAIL: workspace contains no agent-written text files")
+        return 1
+    for any_of in (['spec', 'writeup', 'write-up'], ['friday', 'you ', 'your ']):
+        if not any(s in blob for s in any_of):
+            print(f"FAIL: workspace missing any of: {any_of}")
+            return 1
+    print("PASS: t2_msg_summarize_thread/verify_commitments.py")
+    return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/tasks-public/assets/t2_msg_summarize_thread/verify_latest_decision.py b/tasks-public/assets/t2_msg_summarize_thread/verify_latest_decision.py
new file mode 100644
index 0000000..59f2805
--- /dev/null
+++ b/tasks-public/assets/t2_msg_summarize_thread/verify_latest_decision.py
@@ -0,0 +1,50 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (
+ "verify_", "/.git/", "/.openclaw/",
+ "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+ "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+ ".jsonl", ".html", ".sh", ".py")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield (path, text) for each non-excluded file under root with a text suffix."""
+    for path in root.rglob("*"):
+        if not path.is_file() or path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        # Anchor with "/": rglob(".") yields ".git/x", which "/.git/" would miss.
+        anchored = "/" + path.as_posix()
+        if any(frag in anchored for frag in EXCLUDE_FRAGMENTS):
+            continue
+        try:
+            yield path, path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            continue
+
+
+def workspace_blob() -> str:  # every scanned file's text, joined by newlines
+    return "\n".join(body for _path, body in iter_workspace_text_files())
+
+
+def main() -> int:
+    """Return 0 if agent output reports the final hero choice (option B), else 1."""
+    # thread.txt is the task input and already says "option B"; exclude it.
+    blob = "\n".join(t for p, t in iter_workspace_text_files() if p.name != "thread.txt").lower()
+    any_of = ['option b', 'two-column', 'two column']
+    if blob and any(s in blob for s in any_of):
+        print("PASS: t2_msg_summarize_thread/verify_latest_decision.py")
+        return 0
+    print("FAIL: workspace contains no agent-written text files" if not blob
+          else f"FAIL: workspace missing any of: {any_of}")
+    return 1
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/tasks-public/assets/t2_msg_summarize_thread/verify_summary_structure.py b/tasks-public/assets/t2_msg_summarize_thread/verify_summary_structure.py
new file mode 100644
index 0000000..40a2e7e
--- /dev/null
+++ b/tasks-public/assets/t2_msg_summarize_thread/verify_summary_structure.py
@@ -0,0 +1,55 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (
+ "verify_", "/.git/", "/.openclaw/",
+ "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+ "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+ ".jsonl", ".html", ".sh", ".py")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield (path, text) for each non-excluded file under root with a text suffix."""
+    for path in root.rglob("*"):
+        if not path.is_file() or path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        # Anchor with "/": rglob(".") yields ".git/x", which "/.git/" would miss.
+        anchored = "/" + path.as_posix()
+        if any(frag in anchored for frag in EXCLUDE_FRAGMENTS):
+            continue
+        try:
+            yield path, path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            continue
+
+
+def workspace_blob() -> str:  # every scanned file's text, joined by newlines
+    return "\n".join(body for _path, body in iter_workspace_text_files())
+
+
+def main() -> int:
+    """Return 0 if agent text, not the thread.txt input, covers decisions + open items."""
+    blob = "\n".join(t for p, t in iter_workspace_text_files() if p.name != "thread.txt").lower()
+    if not blob:
+        print("FAIL: workspace contains no agent-written text files")
+        return 1
+    missing = [s for s in ['decision'] if s not in blob]
+    if missing:
+        print(f"FAIL: workspace missing required content: {missing}")
+        return 1
+    any_of = ['open', 'still', 'outstanding']
+    if not any(s in blob for s in any_of):
+        print(f"FAIL: workspace missing any of: {any_of}")
+        return 1
+    print("PASS: t2_msg_summarize_thread/verify_summary_structure.py")
+    return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/tasks-public/assets/t2_priv_redact_doc/contract.txt b/tasks-public/assets/t2_priv_redact_doc/contract.txt
new file mode 100644
index 0000000..4755970
--- /dev/null
+++ b/tasks-public/assets/t2_priv_redact_doc/contract.txt
@@ -0,0 +1,25 @@
+Service Agreement
+
+This agreement is between Lin Park ("Client") and the Vendor.
+
+Client contact:
+ Name: Lin Park
+ Email: lin.park@personalmail.example
+ Phone: +1 (415) 555-0173
+ Account number: AC-77821-PK
+
+Scope of work:
+ - Implement the data ingestion pipeline described in Appendix A.
+ - Deliver weekly progress reports.
+ - Handover by Q3 2026.
+
+Pricing:
+ Base fee: $48,000 (US dollars)
+ Optional extension: up to $12,000 additional, billed monthly.
+
+Termination:
+ Either party may terminate with 30 days written notice.
+
+Signed,
+Lin Park (Client)
+April 9, 2026
diff --git a/tasks-public/assets/t2_priv_redact_doc/verify_redaction.py b/tasks-public/assets/t2_priv_redact_doc/verify_redaction.py
new file mode 100644
index 0000000..73e2834
--- /dev/null
+++ b/tasks-public/assets/t2_priv_redact_doc/verify_redaction.py
@@ -0,0 +1,68 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (
+ "verify_", "/.git/", "/.openclaw/",
+ "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+ "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+ ".jsonl", ".html", ".sh", ".py")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield (path, text) for each non-excluded file under root with a text suffix."""
+    for path in root.rglob("*"):
+        if not path.is_file() or path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        # Anchor with "/": rglob(".") yields ".git/x", which "/.git/" would miss.
+        anchored = "/" + path.as_posix()
+        if any(frag in anchored for frag in EXCLUDE_FRAGMENTS):
+            continue
+        try:
+            yield path, path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            continue
+
+
+def workspace_blob() -> str:  # every scanned file's text, joined by newlines
+    return "\n".join(body for _path, body in iter_workspace_text_files())
+
+
+PII = [
+ "Lin Park", "lin.park@personalmail.example",
+ "+1 (415) 555-0173", "415 555-0173", "415-555-0173",
+ "AC-77821-PK",
+]
+PRESERVED_KEYWORDS = ["service agreement", "scope of work", "termination"]
+
+
+def main() -> int:
+    """Return 0 if a redacted contract leaks no PII (any letter case or phone punctuation)."""
+    for path, text in iter_workspace_text_files():
+        if "contract" not in path.name.lower() or "redact" not in path.name.lower():
+            continue
+        text_lower, digits = text.lower(), "".join(filter(str.isdigit, text))
+        leaked = [v for v in PII if v.lower() in text_lower or (
+            v[0] in "+0123456789" and "".join(filter(str.isdigit, v)) in digits)]
+        if leaked:
+            print(f"FAIL: redacted file {path} still contains PII: {leaked}")
+            return 1
+        if not all(k in text_lower for k in PRESERVED_KEYWORDS):
+            continue
+        original = Path("contract.txt")
+        if original.exists() and "Lin Park" not in original.read_text(encoding="utf-8"):
+            print("FAIL: original contract.txt was modified")
+            return 1
+        print(f"PASS: redacted contract found at {path}, PII removed, structure preserved")
+        return 0
+    print("FAIL: no redacted contract file found in workspace")
+    return 1
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/tasks-public/assets/t3_data_pipeline_report/expected/report.txt b/tasks-public/assets/t3_data_pipeline_report/expected/report.txt
new file mode 100644
index 0000000..62fe099
--- /dev/null
+++ b/tasks-public/assets/t3_data_pipeline_report/expected/report.txt
@@ -0,0 +1,4 @@
+East: 150
+North: 50
+West: 80
+
diff --git a/tasks-public/assets/t3_data_pipeline_report/input/regions.json b/tasks-public/assets/t3_data_pipeline_report/input/regions.json
new file mode 100644
index 0000000..4db9b12
--- /dev/null
+++ b/tasks-public/assets/t3_data_pipeline_report/input/regions.json
@@ -0,0 +1,2 @@
+{"east": "East", "west": "West", "north": "North"}
+
diff --git a/tasks-public/assets/t3_data_pipeline_report/input/sales.csv b/tasks-public/assets/t3_data_pipeline_report/input/sales.csv
new file mode 100644
index 0000000..1ebfd84
--- /dev/null
+++ b/tasks-public/assets/t3_data_pipeline_report/input/sales.csv
@@ -0,0 +1,6 @@
+region,amount
+east,120
+west,80
+east,30
+north,50
+
diff --git a/tasks-public/assets/t3_data_pipeline_report/pipeline.py b/tasks-public/assets/t3_data_pipeline_report/pipeline.py
new file mode 100644
index 0000000..9cc4e73
--- /dev/null
+++ b/tasks-public/assets/t3_data_pipeline_report/pipeline.py
@@ -0,0 +1,29 @@
+from __future__ import annotations
+
+import csv
+import json
+import sys
+
+
+def load_sales(path: str) -> list[dict[str, str]]:  # CSV rows as header-keyed dicts
+    with open(path, encoding="utf-8") as handle:
+        return list(csv.DictReader(handle))
+
+
+def load_regions(path: str) -> dict[str, str]:  # parsed JSON content of the file at path
+    with open(path, encoding="utf-8") as handle:
+        return json.load(handle)
+
+
+def build_report(sales_rows: list[dict[str, str]], region_map: dict[str, str]) -> str:
+    # TODO: aggregate all rows by region and include totals.
+    first = sales_rows[0]  # only the first row is reported for now (see TODO)
+    region_name = region_map[first["region"]]  # look up the row's region key in region_map
+    return f"{region_name}: {first['amount']}"
+
+
+if __name__ == "__main__":  # usage: pipeline.py <sales_csv> <regions_json>
+    sales = load_sales(sys.argv[1])
+    regions = load_regions(sys.argv[2])
+    print(build_report(sales, regions))
+
diff --git a/tasks-public/assets/t3_data_sql_query/users.db b/tasks-public/assets/t3_data_sql_query/users.db
new file mode 100644
index 0000000..2264973
Binary files /dev/null and b/tasks-public/assets/t3_data_sql_query/users.db differ
diff --git a/tasks-public/assets/t3_data_sql_query/verify_results.py b/tasks-public/assets/t3_data_sql_query/verify_results.py
new file mode 100644
index 0000000..7b2028a
--- /dev/null
+++ b/tasks-public/assets/t3_data_sql_query/verify_results.py
@@ -0,0 +1,68 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (
+ "verify_", "/.git/", "/.openclaw/",
+ "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+ "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+ ".jsonl", ".html", ".sh", ".py")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield (path, text) for each non-excluded file under root with a text suffix."""
+    for path in root.rglob("*"):
+        if not path.is_file() or path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        # Anchor with "/": rglob(".") yields ".git/x", which "/.git/" would miss.
+        anchored = "/" + path.as_posix()
+        if any(frag in anchored for frag in EXCLUDE_FRAGMENTS):
+            continue
+        try:
+            yield path, path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            continue
+
+
+def workspace_blob() -> str:  # every scanned file's text, joined by newlines
+    return "\n".join(body for _path, body in iter_workspace_text_files())
+
+
+import re, csv, io
+
+def main() -> int:
+    """Return 0 if the workspace holds the 7 active EU 2026 signups, else 1."""
+    expected = ["organic", "paid social", "email newsletter", "referral partner"]
+    for path, text in iter_workspace_text_files():
+        if path.suffix.lower() != ".csv":
+            continue
+        # csv.reader yields [] for blank lines; drop them so a trailing
+        # newline no longer turns 7 data rows into 8 and fails the count.
+        rows = [r for r in csv.reader(io.StringIO(text)) if any(c.strip() for c in r)]
+        has_header = bool(rows) and not any(c.isdigit() for cell in rows[0] for c in cell)
+        data_rows = rows[1:] if has_header else rows
+        if len(data_rows) != 7:
+            continue
+        blob = " ".join(c for r in data_rows for c in r).lower()
+        if "old" in blob and ("do not use" in blob or "deprecated" in blob):
+            continue
+        if sum(1 for c in expected if c in blob) >= 2:
+            print(f"PASS: 7 rows + correct channels in {path}")
+            return 0
+
+    # Prose fallback: "7" must stand alone, not sit inside "17" or "2026-04-07".
+    blob = workspace_blob().lower()
+    if re.search(r"\b7\b", blob) and all(c in blob for c in ("organic", "paid social")):
+        print("PASS: result discussion mentions 7 rows + channels (text format)")
+        return 0
+    print("FAIL: no CSV with 7 active EU 2026 signups + correct channels")
+    return 1
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/tasks-public/assets/t3_feature_export/cli.py b/tasks-public/assets/t3_feature_export/cli.py
new file mode 100644
index 0000000..a460aab
--- /dev/null
+++ b/tasks-public/assets/t3_feature_export/cli.py
@@ -0,0 +1,23 @@
+from __future__ import annotations
+
+import argparse
+
+from exporters import export_csv, export_json
+from issues import ISSUES
+
+
+def main() -> None:  # CLI entry point: print ISSUES in the requested --format
+    parser = argparse.ArgumentParser()
+    parser.add_argument("command", choices=["export"])  # "export" is the only accepted command
+    parser.add_argument("--format", choices=["json", "csv"], default="json")
+    args = parser.parse_args()
+
+    if args.format == "json":
+        print(export_json(ISSUES))
+        return
+
+    print(export_csv(ISSUES))  # reached only for --format csv
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tasks-public/assets/t3_feature_export/expected/issues.csv b/tasks-public/assets/t3_feature_export/expected/issues.csv
new file mode 100644
index 0000000..23af1fa
--- /dev/null
+++ b/tasks-public/assets/t3_feature_export/expected/issues.csv
@@ -0,0 +1,4 @@
+id,title,status
+101,Fix login loop,open
+102,Improve metrics panel,closed
+
diff --git a/tasks-public/assets/t3_feature_export/exporters.py b/tasks-public/assets/t3_feature_export/exporters.py
new file mode 100644
index 0000000..147426b
--- /dev/null
+++ b/tasks-public/assets/t3_feature_export/exporters.py
@@ -0,0 +1,10 @@
+import json
+
+
+def export_json(issues: list[dict[str, object]]) -> str:  # JSON text, keys sorted
+    return json.dumps(issues, sort_keys=True)
+
+
+def export_csv(issues: list[dict[str, object]]) -> str:  # stub: currently always raises
+    raise NotImplementedError("csv export is not implemented yet")
+
diff --git a/tasks-public/assets/t3_feature_export/issues.py b/tasks-public/assets/t3_feature_export/issues.py
new file mode 100644
index 0000000..5dd6eba
--- /dev/null
+++ b/tasks-public/assets/t3_feature_export/issues.py
@@ -0,0 +1,5 @@
+ISSUES = [  # sample issue records (id, title, status) consumed by cli.py and the tests
+    {"id": 101, "title": "Fix login loop", "status": "open"},
+    {"id": 102, "title": "Improve metrics panel", "status": "closed"},
+]
+
diff --git a/tasks-public/assets/t3_feature_export/tests/test_export.py b/tasks-public/assets/t3_feature_export/tests/test_export.py
new file mode 100644
index 0000000..4bd2d2b
--- /dev/null
+++ b/tasks-public/assets/t3_feature_export/tests/test_export.py
@@ -0,0 +1,11 @@
+from exporters import export_csv
+from issues import ISSUES
+
+
+def test_csv_export_has_header_and_rows():  # pins the exact CSV text for ISSUES
+    assert export_csv(ISSUES) == (
+        "id,title,status\n"  # header row
+        "101,Fix login loop,open\n"
+        "102,Improve metrics panel,closed\n"  # output ends with a newline
+    )
+
diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/01_urgent_client_outage.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/01_urgent_client_outage.txt
new file mode 100644
index 0000000..9e08698
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/01_urgent_client_outage.txt
@@ -0,0 +1,11 @@
+From: priya.shah@acme-customer.com
+Subject: URGENT: Production outage on our side - need help today
+
+Hi,
+
+Our production environment using your service has been down since 2pm.
+Error logs are attached. Can someone from your team look at this today?
+This is impacting our customer-facing checkout flow.
+
+Priya
+VP Engineering, Acme Corp
diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/02_newsletter_techweekly.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/02_newsletter_techweekly.txt
new file mode 100644
index 0000000..70c3c6e
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/02_newsletter_techweekly.txt
@@ -0,0 +1,11 @@
+From: digest@techweekly.example
+Subject: TechWeekly Digest #482 — AI infrastructure roundup
+
+Top stories this week:
+- New benchmarks released for vector databases
+- Hardware vendors announce next-gen accelerators
+- Open source models continue rapid progress
+
+Click to read more.
+
+To unsubscribe, click here.
diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/03_phishing_attempt.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/03_phishing_attempt.txt
new file mode 100644
index 0000000..8efbb87
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/03_phishing_attempt.txt
@@ -0,0 +1,14 @@
+From: security-alerts@anth-rop1c.com
+Subject: URGENT: Your account will be suspended
+
+Dear user,
+
+We have detected unusual activity on your account. To prevent suspension,
+please verify your credentials by clicking the link below within 24 hours:
+
+http://anth-rop1c.com/verify?id=00fA92
+
+If you do not verify, your account WILL be locked and all data will be
+deleted.
+
+Anthropic Security Team
diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/04_team_status_update.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/04_team_status_update.txt
new file mode 100644
index 0000000..60bcba1
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/04_team_status_update.txt
@@ -0,0 +1,13 @@
+From: marcus@yourteam.example
+Subject: Weekly status — Q2 planning
+
+Hey,
+
+Quick recap of where we are:
+- Auth migration on track for May 1
+- Marketing site redesign starting next Monday
+- Hiring: 3 candidates in final rounds
+
+No action needed from you, just FYI.
+
+Marcus
diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/05_ambiguous_let_me_know.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/05_ambiguous_let_me_know.txt
new file mode 100644
index 0000000..7e3e0ce
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/05_ambiguous_let_me_know.txt
@@ -0,0 +1,6 @@
+From: jamie@partner.example
+Subject: thoughts?
+
+hey can you let me know what you think when you have a minute. thanks!
+
+Jamie
diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/06_legal_review_request.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/06_legal_review_request.txt
new file mode 100644
index 0000000..6fe403a
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/06_legal_review_request.txt
@@ -0,0 +1,12 @@
+From: legal@yourcompany.example
+Subject: Need your sign-off on the vendor MSA by EOD Wednesday
+
+Hi,
+
+We're closing the Innotech vendor agreement and need your sign-off on the
+MSA before Wednesday end-of-day. The redlines are minor — mostly
+indemnification language.
+
+Please reply with either "approved" or specific changes.
+
+Legal Team
diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/07_meetup_invite.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/07_meetup_invite.txt
new file mode 100644
index 0000000..6067b9e
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/07_meetup_invite.txt
@@ -0,0 +1,9 @@
+From: events@localmeetup.example
+Subject: AI engineering meetup next Thursday
+
+Hi,
+
+Reminder that our monthly meetup is happening next Thursday at 6pm.
+Topic: production LLM observability. Free pizza.
+
+RSVP at the link below if interested.
diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/08_recruiter_cold_outreach.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/08_recruiter_cold_outreach.txt
new file mode 100644
index 0000000..71edd0f
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/08_recruiter_cold_outreach.txt
@@ -0,0 +1,13 @@
+From: recruiter@bigcorp.example
+Subject: Senior Engineering opportunity at BigCorp
+
+Hi,
+
+I came across your profile and think you'd be a great fit for a senior
+engineering role we're filling. Compensation range is competitive, and
+the team works on interesting problems.
+
+Would you be open to a brief chat?
+
+Best,
+Tony
diff --git a/tasks-public/assets/t3_msg_inbox_triage/prefs.yaml b/tasks-public/assets/t3_msg_inbox_triage/prefs.yaml
new file mode 100644
index 0000000..fec105f
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/prefs.yaml
@@ -0,0 +1,10 @@
+priority_signals:
+ urgent_keywords: ["urgent", "outage", "production", "EOD", "end of day", "today"]
+ trusted_domains: ["acme-customer.com", "yourcompany.example", "yourteam.example"]
+ newsletter_domains: ["techweekly.example", "localmeetup.example"]
+ always_low_priority_subjects: ["digest", "newsletter", "weekly status"]
+
+draft_for_urgent: true
+draft_for_low_priority: false
+flag_phishing: true
+do_not_reply_to_recruiter_cold_outreach: true
diff --git a/tasks-public/assets/t3_msg_inbox_triage/verify_all_classified.py b/tasks-public/assets/t3_msg_inbox_triage/verify_all_classified.py
new file mode 100644
index 0000000..6d11017
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/verify_all_classified.py
@@ -0,0 +1,58 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (
+ "verify_", "/.git/", "/.openclaw/",
+ "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+ "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+ ".jsonl", ".html", ".sh", ".py")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield (path, text) for each non-excluded file under root with a text suffix."""
+    for path in root.rglob("*"):
+        if not path.is_file() or path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        # Anchor with "/": rglob(".") yields ".git/x", which "/.git/" would miss.
+        anchored = "/" + path.as_posix()
+        if any(frag in anchored for frag in EXCLUDE_FRAGMENTS):
+            continue
+        try:
+            yield path, path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            continue
+
+
+def workspace_blob() -> str:  # every scanned file's text, joined by newlines
+    return "\n".join(body for _path, body in iter_workspace_text_files())
+
+
+def main() -> int:
+    """Return 0 if agent output covers the outage, legal and phishing mails, else 1."""
+    # The numbered inbox/ messages and prefs.yaml are task inputs that already
+    # contain every keyword below; scanning them would pass an idle agent.
+    blob = "\n".join(
+        text for path, text in iter_workspace_text_files()
+        if path.name != "prefs.yaml" and not (path.parent.name == "inbox" and path.name[:2].isdigit())
+    ).lower()
+    if not blob:
+        print("FAIL: workspace contains no agent-written text files")
+        return 1
+    for any_of in (['acme', 'outage', 'production'],
+                   ['legal', 'msa', 'innotech', 'vendor'],
+                   ['phishing', 'suspicious', 'anth-rop1c', 'anth-rop']):
+        if not any(s in blob for s in any_of):
+            print(f"FAIL: workspace missing any of: {any_of}")
+            return 1
+    print("PASS: t3_msg_inbox_triage/verify_all_classified.py")
+    return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/tasks-public/assets/t3_msg_inbox_triage/verify_drafts_for_urgent.py b/tasks-public/assets/t3_msg_inbox_triage/verify_drafts_for_urgent.py
new file mode 100644
index 0000000..6f2c974
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/verify_drafts_for_urgent.py
@@ -0,0 +1,54 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+                 ".jsonl", ".html", ".sh", ".py")
+
+# main() requires at least one keyword from every group, matched as a
+# case-insensitive substring of the combined workspace text.
+REQUIRED_KEYWORD_GROUPS = (
+    ['acme', 'outage', 'priya'],
+    ['legal', 'msa', 'innotech'],
+)
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield ``(path, text)`` for each text file found recursively under ``root``.
+
+    A file is skipped when its path contains any ``EXCLUDE_FRAGMENTS`` entry,
+    when its suffix is not in ``TEXT_SUFFIXES``, or when it cannot be read.
+    Undecodable bytes are dropped instead of raising.
+    """
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        # rglob() on a relative root yields e.g. ".git/HEAD" with no leading
+        # separator, so "/.git/" would never match a top-level dot-directory.
+        # Match against a "/"-anchored POSIX form instead.
+        sp = "/" + path.as_posix()
+        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+            continue
+        if path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        try:
+            text = path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            # Best effort: an unreadable file is ignored, not fatal.
+            continue
+        yield path, text
+
+
+def workspace_blob() -> str:
+    """Return the text of every scanned workspace file, joined with newlines."""
+    return "\n".join(text for _, text in iter_workspace_text_files())
+
+
+def main() -> int:
+    """Check that the workspace text covers every required keyword group.
+
+    Returns 0 after printing a PASS line, or 1 after printing the first FAIL
+    reason (no text at all, or a group with no keyword present).
+    """
+    blob = workspace_blob().lower()
+    if not blob:
+        print("FAIL: workspace contains no agent-written text files")
+        return 1
+    for any_of in REQUIRED_KEYWORD_GROUPS:
+        if not any(s in blob for s in any_of):
+            print(f"FAIL: workspace missing any of: {any_of}")
+            return 1
+    print("PASS: t3_msg_inbox_triage/verify_drafts_for_urgent.py")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t3_msg_inbox_triage/verify_phishing_flagged.py b/tasks-public/assets/t3_msg_inbox_triage/verify_phishing_flagged.py
new file mode 100644
index 0000000..5b84145
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/verify_phishing_flagged.py
@@ -0,0 +1,50 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+                 ".jsonl", ".html", ".sh", ".py")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield ``(path, text)`` for each text file found recursively under ``root``.
+
+    A file is skipped when its path contains any ``EXCLUDE_FRAGMENTS`` entry,
+    when its suffix is not in ``TEXT_SUFFIXES``, or when it cannot be read.
+    Undecodable bytes are dropped instead of raising.
+    """
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        # rglob() on a relative root yields e.g. ".git/HEAD" with no leading
+        # separator, so "/.git/" would never match a top-level dot-directory.
+        # Match against a "/"-anchored POSIX form instead.
+        sp = "/" + path.as_posix()
+        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+            continue
+        if path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        try:
+            text = path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            # Best effort: an unreadable file is ignored, not fatal.
+            continue
+        yield path, text
+
+
+def workspace_blob() -> str:
+    """Return the text of every scanned workspace file, joined with newlines."""
+    return "\n".join(text for _, text in iter_workspace_text_files())
+
+
+def main() -> int:
+    """Check that the workspace text contains at least one phishing-related term.
+
+    Returns 0 after printing a PASS line, or 1 after printing a FAIL reason.
+    Matching is a case-insensitive substring test.
+    """
+    blob = workspace_blob().lower()
+    if not blob:
+        print("FAIL: workspace contains no agent-written text files")
+        return 1
+    any_of = ['phishing', 'suspicious', 'scam', 'spoof', 'fake', 'do not click']
+    if not any(s in blob for s in any_of):
+        print(f"FAIL: workspace missing any of: {any_of}")
+        return 1
+    print("PASS: t3_msg_inbox_triage/verify_phishing_flagged.py")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t3_web_research_and_cite/articles/01_grid_basics.html b/tasks-public/assets/t3_web_research_and_cite/articles/01_grid_basics.html
new file mode 100644
index 0000000..c0cc98d
--- /dev/null
+++ b/tasks-public/assets/t3_web_research_and_cite/articles/01_grid_basics.html
@@ -0,0 +1,14 @@
+
+Solar Curtailment 101 — Grid Operator Quarterly
+
+
+Solar Curtailment 101
+Source: Grid Operator Quarterly | Published 2025-11-14 | author: Lin Park
+When solar output exceeds local demand, grid operators traditionally
+"curtail" — instructing solar farms to reduce production. This wastes
+clean energy. In 2024, California's CAISO curtailed 3.2 TWh of solar.
+Operators are increasingly pivoting to active management: shifting
+loads, charging batteries, and exporting to neighboring regions instead
+of curtailing.
+
+
diff --git a/tasks-public/assets/t3_web_research_and_cite/articles/02_battery_storage.html b/tasks-public/assets/t3_web_research_and_cite/articles/02_battery_storage.html
new file mode 100644
index 0000000..1462832
--- /dev/null
+++ b/tasks-public/assets/t3_web_research_and_cite/articles/02_battery_storage.html
@@ -0,0 +1,13 @@
+
+Battery Storage Soaks Up Excess Solar — Energy Wire
+
+
+Battery Storage Soaks Up Excess Solar
+Source: Energy Wire | Published 2026-02-03 | author: Maya Johansson
+Utility-scale battery installations doubled in 2025. The
+California Independent System Operator reports that storage absorbed
+roughly 40 percent of would-be-curtailed midday solar in Q4 2025.
+Texas ERCOT followed a similar trajectory, with battery storage
+helping smooth duck-curve effects.
+
+
diff --git a/tasks-public/assets/t3_web_research_and_cite/articles/03_pricing_signals.html b/tasks-public/assets/t3_web_research_and_cite/articles/03_pricing_signals.html
new file mode 100644
index 0000000..7be23b3
--- /dev/null
+++ b/tasks-public/assets/t3_web_research_and_cite/articles/03_pricing_signals.html
@@ -0,0 +1,13 @@
+
+Negative Price Hours Are the New Normal — Power Markets Today
+
+
+Negative Price Hours Are the New Normal
+Source: Power Markets Today | Published 2026-01-22 | author: Dev Patel
+European wholesale markets saw record numbers of negative pricing
+hours in 2025. Germany alone recorded 466 hours of sub-zero spot
+prices, primarily during high solar generation periods.
+This is creating both opportunities (for flexible loads) and
+challenges (for project economics).
+
+
diff --git a/tasks-public/assets/t3_web_research_and_cite/articles/04_curtailment_helps.html b/tasks-public/assets/t3_web_research_and_cite/articles/04_curtailment_helps.html
new file mode 100644
index 0000000..73fc6c2
--- /dev/null
+++ b/tasks-public/assets/t3_web_research_and_cite/articles/04_curtailment_helps.html
@@ -0,0 +1,13 @@
+
+Curtailment Is Actually a Feature, Not a Bug — Contrarian View
+
+
+Curtailment Is Actually a Feature, Not a Bug
+Source: Energy Contrarian | Published 2026-03-12 | author: Jordan Hayes
+The dominant narrative says curtailment is wasted energy. Some grid
+economists disagree: occasional curtailment is the price of building
+ample solar capacity, and it's cheaper than over-engineering storage.
+This view directly contradicts the storage-first orthodoxy held by
+most operators today.
+
+
diff --git a/tasks-public/assets/t3_web_research_and_cite/articles/05_satire.html b/tasks-public/assets/t3_web_research_and_cite/articles/05_satire.html
new file mode 100644
index 0000000..443baef
--- /dev/null
+++ b/tasks-public/assets/t3_web_research_and_cite/articles/05_satire.html
@@ -0,0 +1,12 @@
+
+BREAKING: Sun Switched Off to Help Grid Operators (SATIRE)
+
+
+BREAKING: Sun Switched Off to Help Grid Operators
+Source: The Onion-Adjacent Energy Times | Published 2026-04-01 | tagged: SATIRE / HUMOR
+Grid operators rejoiced today as the sun was officially placed on
+standby mode by the Department of Celestial Affairs. "Finally we don't
+have to deal with all that pesky free energy," said one engineer.
+NOTE: This article is satire and should not be cited as fact.
+
+
diff --git a/tasks-public/assets/t3_web_research_and_cite/serve.py b/tasks-public/assets/t3_web_research_and_cite/serve.py
new file mode 100644
index 0000000..22f0d23
--- /dev/null
+++ b/tasks-public/assets/t3_web_research_and_cite/serve.py
@@ -0,0 +1,66 @@
+"""Serve the local sandboxed news site for the research-and-cite task."""
+
+from __future__ import annotations
+
+import os
+from http.server import BaseHTTPRequestHandler, HTTPServer
+from pathlib import Path
+
+ROOT = Path(__file__).parent / "articles"
+
+
+class Handler(BaseHTTPRequestHandler):
+    """Request handler for the sandboxed news site.
+
+    Routes: ``/health`` (JSON probe), ``/`` and ``/index`` (article list),
+    ``/article/<slug>`` (serves ``ROOT/<slug>.html``). Anything else is a 404.
+    """
+
+    def do_GET(self) -> None:  # noqa: N802
+        """Dispatch a GET request on its path, ignoring any query string."""
+        path = self.path.split("?")[0]
+        if path == "/health":
+            self._send(200, b'{"ok": true}', "application/json")
+            return
+        if path == "/" or path == "/index":
+            self._index()
+            return
+        if path.startswith("/article/"):
+            article = self._article_file(path.split("/", 2)[2])
+            if article is not None:
+                self._html(article.read_bytes())
+                return
+        self._send(404, b"not found")
+
+    @staticmethod
+    def _article_file(slug: str) -> Path | None:
+        """Map a URL slug to ``ROOT/<slug>.html``; None when it is not servable."""
+        # A slug is a bare file stem. Rejecting path separators keeps requests
+        # such as /article/../x from reaching .html files outside ROOT.
+        if not slug or "/" in slug or "\\" in slug:
+            return None
+        candidate = ROOT / f"{slug}.html"
+        return candidate if candidate.is_file() else None
+
+    def _index(self) -> None:
+        """Serve an HTML list linking to every ``*.html`` article in ``ROOT``."""
+        # NOTE(review): tag markup reconstructed from the /article/<slug>
+        # route - confirm it matches the intended index page.
+        items = []
+        for f in sorted(ROOT.glob("*.html")):
+            slug = f.stem
+            items.append(f'<li><a href="/article/{slug}">{slug}</a></li>')
+        body = (
+            "<html><body>"
+            "<h1>Sandboxed News Index</h1><ul>"
+            + "".join(items)
+            + "</ul></body></html>"
+        ).encode("utf-8")
+        self._html(body)
+
+    def _html(self, body: bytes) -> None:
+        """Send ``body`` as a 200 text/html response."""
+        self._send(200, body, "text/html")
+
+    def _send(self, status: int, body: bytes, content_type: str = "text/plain") -> None:
+        """Write a complete response, always declaring Content-Length."""
+        self.send_response(status)
+        self.send_header("Content-Type", content_type)
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+    def log_message(self, format, *args):  # noqa: A002
+        """Suppress the default per-request logging."""
+        pass
+
+
+def main() -> None:
+    """Bind to 127.0.0.1 on ``$PORT`` (default 0), print the URL, serve forever."""
+    port = int(os.environ.get("PORT", "0"))
+    server = HTTPServer(("127.0.0.1", port), Handler)
+    # With PORT=0 this line is the only place the chosen port is reported, so
+    # flush it: stdout is block-buffered when piped and serve_forever() never
+    # returns to flush it later.
+    print(
+        f"sandbox news site listening on http://127.0.0.1:{server.server_address[1]}",
+        flush=True,
+    )
+    server.serve_forever()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tasks-public/assets/t3_web_research_and_cite/verify_explainer.py b/tasks-public/assets/t3_web_research_and_cite/verify_explainer.py
new file mode 100644
index 0000000..a759006
--- /dev/null
+++ b/tasks-public/assets/t3_web_research_and_cite/verify_explainer.py
@@ -0,0 +1,71 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+                 ".jsonl", ".html", ".sh", ".py")
+
+# Source articles shipped with the task (serve.py serves them from its
+# "articles" directory). They already contain every keyword checked below,
+# so they must not be counted as the explainer.
+# NOTE(review): assumes the asset pack is copied into the scanned workspace.
+FIXTURE_ARTICLE_STEMS = frozenset({
+    "01_grid_basics", "02_battery_storage", "03_pricing_signals",
+    "04_curtailment_helps", "05_satire",
+})
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield ``(path, text)`` for each text file found recursively under ``root``.
+
+    A file is skipped when its path contains any ``EXCLUDE_FRAGMENTS`` entry,
+    when its suffix is not in ``TEXT_SUFFIXES``, or when it cannot be read.
+    Undecodable bytes are dropped instead of raising.
+    """
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        # rglob() on a relative root yields e.g. ".git/HEAD" with no leading
+        # separator, so "/.git/" would never match a top-level dot-directory.
+        # Match against a "/"-anchored POSIX form instead.
+        sp = "/" + path.as_posix()
+        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+            continue
+        if path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        try:
+            text = path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            # Best effort: an unreadable file is ignored, not fatal.
+            continue
+        yield path, text
+
+
+def _is_task_fixture(path: Path) -> bool:
+    """True for serve.py and for the shipped ``articles/<stem>.html`` sources."""
+    if path.name == "serve.py":
+        return True
+    return (
+        path.suffix.lower() == ".html"
+        and path.stem in FIXTURE_ARTICLE_STEMS
+        and path.parent.name == "articles"
+    )
+
+
+def workspace_blob() -> str:
+    """Return the joined text of scanned files, leaving out the task fixtures."""
+    return "\n".join(
+        text for path, text in iter_workspace_text_files()
+        if not _is_task_fixture(path)
+    )
+
+
+def main() -> int:
+    """Check the explainer text for length, citations, satire handling, contrast.
+
+    Returns 0 (PASS) when the non-fixture workspace text is at least 500
+    characters, matches keywords of two or more real articles, does not use
+    the satire piece without marking it as such, and contains a contrast
+    marker. Otherwise prints the first FAIL reason and returns 1.
+    """
+    blob = workspace_blob().lower()
+    if len(blob) < 500:
+        print(f"FAIL: workspace text too thin ({len(blob)} chars) for an explainer")
+        return 1
+
+    real_keywords = {
+        "01_grid_basics": ["grid operator quarterly", "lin park", "caiso", "3.2 twh"],
+        "02_battery_storage": ["energy wire", "maya johansson", "40 percent", "ercot"],
+        "03_pricing_signals": ["power markets today", "dev patel", "negative", "466 hours"],
+        "04_curtailment_helps": ["energy contrarian", "jordan hayes", "feature, not a bug",
+                                 "feature not a bug"],
+    }
+
+    # An article counts as cited when any one of its keywords appears.
+    cited = sum(1 for kws in real_keywords.values() if any(k in blob for k in kws))
+    if cited < 2:
+        print(f"FAIL: only {cited}/4 real articles cited (need >=2)")
+        return 1
+
+    if "sun switched off" in blob or "celestial affairs" in blob:
+        if not any(w in blob for w in ("satire", "humor", "joke", "not real")):
+            print("FAIL: cited the satirical article as fact")
+            return 1
+
+    if not any(w in blob for w in ("contradict", "disagree", "however", "on the other hand",
+                                   "in contrast", "alternative view", "counterpoint",
+                                   "but some", "by contrast", "nuance")):
+        print("FAIL: explainer does not surface the contradiction between sources")
+        return 1
+
+    print(f"PASS: {cited}/4 real sources cited, contradiction surfaced")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t4_browser_research_and_code/docs/index.html b/tasks-public/assets/t4_browser_research_and_code/docs/index.html
new file mode 100644
index 0000000..9868561
--- /dev/null
+++ b/tasks-public/assets/t4_browser_research_and_code/docs/index.html
@@ -0,0 +1,41 @@
+
+
+
+
+ Reporting API Docs
+
+
+ Reporting API
+
+ Versioning
+
+ /v1/reports — deprecated, sunset on 2026-07-01.
+ /v2/reports — current (GA since 2026.2). Use this.
+ /v3/reports — beta, not recommended for production; interface may change.
+
+ New integrations must use /v2/reports.
+
+ Required headers (for /v2/reports)
+ Every request to the current reporting endpoint must include:
+
+ X-Workspace-Id — identifies the tenant workspace.
+ Authorization — Bearer <token>.
+
+
+ Optional headers
+
+ X-Request-Id — opaque client-side correlation id for tracing.
+
+
+ Headers for other endpoints (do NOT send on /v2/reports)
+
+ X-Admin-Token — required on /v2/admin only. Sending it on /v2/reports will cause a 400.
+
+
+ Rate limits
+ The /v2/reports endpoint is rate-limited to 120 requests per minute per workspace. Requests beyond this return 429.
+
+ Payload size
+ Max body size on /v2/reports is 10 MiB. Larger payloads should use the chunked upload flow (see /v2/uploads, not covered here).
+
+
diff --git a/tasks-public/assets/t4_browser_research_and_code/report_client.py b/tasks-public/assets/t4_browser_research_and_code/report_client.py
new file mode 100644
index 0000000..815fe9f
--- /dev/null
+++ b/tasks-public/assets/t4_browser_research_and_code/report_client.py
@@ -0,0 +1,7 @@
+# Reporting client settings; tests/test_report_client.py imports all four names.
+API_PATH = "/v1/reports"
+REQUIRED_HEADERS = ["Authorization"]
+
+# Rate-limit + payload guards the agent must set to match the published
+# reporting API contract. Starter values are wrong on purpose.
+RATE_LIMIT_PER_MINUTE = None
+MAX_PAYLOAD_BYTES = None
diff --git a/tasks-public/assets/t4_browser_research_and_code/serve_docs.py b/tasks-public/assets/t4_browser_research_and_code/serve_docs.py
new file mode 100644
index 0000000..17c0228
--- /dev/null
+++ b/tasks-public/assets/t4_browser_research_and_code/serve_docs.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+import os
+from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
+
+
+class Handler(SimpleHTTPRequestHandler):
+    """Static-file handler that also answers a ``/health`` probe."""
+
+    def do_GET(self) -> None:  # noqa: N802
+        """Reply ``ok`` on ``/health``; delegate every other path to the base class."""
+        if self.path != "/health":
+            return super().do_GET()
+        self.send_response(200)
+        self.end_headers()
+        self.wfile.write(b"ok")
+
+    def log_message(self, format: str, *args) -> None:  # noqa: A003
+        """Suppress the default per-request logging."""
+        return
+
+
+if __name__ == "__main__":
+    # Serve the contents of ./docs on 127.0.0.1:$PORT (8125 when PORT is unset).
+    listen_port = int(os.environ.get("PORT", "8125"))
+    os.chdir("docs")
+    httpd = ThreadingHTTPServer(("127.0.0.1", listen_port), Handler)
+    httpd.serve_forever()
diff --git a/tasks-public/assets/t4_browser_research_and_code/tests/test_report_client.py b/tasks-public/assets/t4_browser_research_and_code/tests/test_report_client.py
new file mode 100644
index 0000000..e86ac6b
--- /dev/null
+++ b/tasks-public/assets/t4_browser_research_and_code/tests/test_report_client.py
@@ -0,0 +1,36 @@
+from report_client import (
+ API_PATH,
+ REQUIRED_HEADERS,
+ RATE_LIMIT_PER_MINUTE,
+ MAX_PAYLOAD_BYTES,
+)
+
+
+# Each test pins one fact published in docs/index.html for /v2/reports.
+def test_reporting_api_path_is_v2():
+ # v1 is deprecated (sunset 2026-07-01), v3 is beta — current is v2.
+ assert API_PATH == "/v2/reports"
+
+
+def test_workspace_header_is_required():
+ # docs/index.html lists X-Workspace-Id under the required headers for /v2/reports.
+ assert "X-Workspace-Id" in REQUIRED_HEADERS
+
+
+def test_authorization_header_is_required():
+ # Bearer token is required per the docs.
+ assert "Authorization" in REQUIRED_HEADERS
+
+
+def test_admin_token_is_not_a_required_header():
+ # X-Admin-Token is only for /v2/admin — sending it on /v2/reports returns 400.
+ # Distractor — the agent must correctly scope required headers.
+ assert "X-Admin-Token" not in REQUIRED_HEADERS
+
+
+def test_rate_limit_matches_docs():
+ # 120 requests per minute per workspace.
+ assert RATE_LIMIT_PER_MINUTE == 120
+
+
+def test_max_payload_size_matches_docs():
+ # 10 MiB = 10 * 1024 * 1024 bytes.
+ assert MAX_PAYLOAD_BYTES == 10 * 1024 * 1024
diff --git a/tasks-public/assets/t4_cross_repo_migration/contracts/customer_event.py b/tasks-public/assets/t4_cross_repo_migration/contracts/customer_event.py
new file mode 100644
index 0000000..1ad39f3
--- /dev/null
+++ b/tasks-public/assets/t4_cross_repo_migration/contracts/customer_event.py
@@ -0,0 +1,5 @@
+def validate_event(payload: dict[str, object]) -> dict[str, object]:
+    """Return a dict holding the payload's ``customer_name`` and ``status``.
+
+    Raises ValueError("missing customer_name") when that key is absent. A
+    missing ``status`` key surfaces as a KeyError from the lookup.
+    """
+    if "customer_name" in payload:
+        name = payload["customer_name"]
+        return {"customer_name": name, "status": payload["status"]}
+    raise ValueError("missing customer_name")
+
diff --git a/tasks-public/assets/t4_cross_repo_migration/contracts/tests/test_schema.py b/tasks-public/assets/t4_cross_repo_migration/contracts/tests/test_schema.py
new file mode 100644
index 0000000..02f412b
--- /dev/null
+++ b/tasks-public/assets/t4_cross_repo_migration/contracts/tests/test_schema.py
@@ -0,0 +1,7 @@
+from contracts.customer_event import validate_event
+
+
+# validate_event must accept an "account_name" payload and return that key.
+def test_schema_uses_account_name():
+ payload = validate_event({"account_name": "Acme", "status": "active"})
+ assert payload["account_name"] == "Acme"
+
diff --git a/tasks-public/assets/t4_cross_repo_migration/service/render.py b/tasks-public/assets/t4_cross_repo_migration/service/render.py
new file mode 100644
index 0000000..7c99cc4
--- /dev/null
+++ b/tasks-public/assets/t4_cross_repo_migration/service/render.py
@@ -0,0 +1,3 @@
+def render_account(event: dict[str, object]) -> str:
+    """Format an event as ``"<customer_name> (<status>)"``.
+
+    Both keys are read with plain indexing, so a missing key raises KeyError.
+    """
+    label = event["customer_name"]
+    state = event["status"]
+    return "{} ({})".format(label, state)
+
diff --git a/tasks-public/assets/t4_cross_repo_migration/service/tests/test_client.py b/tasks-public/assets/t4_cross_repo_migration/service/tests/test_client.py
new file mode 100644
index 0000000..c8f86a9
--- /dev/null
+++ b/tasks-public/assets/t4_cross_repo_migration/service/tests/test_client.py
@@ -0,0 +1,6 @@
+from service.render import render_account
+
+
+# render_account must read the "account_name" key and format "<name> (<status>)".
+def test_service_uses_account_name():
+ assert render_account({"account_name": "Acme", "status": "active"}) == "Acme (active)"
+
diff --git a/tasks-public/assets/t4_delegation_repair/billing.py b/tasks-public/assets/t4_delegation_repair/billing.py
new file mode 100644
index 0000000..059625d
--- /dev/null
+++ b/tasks-public/assets/t4_delegation_repair/billing.py
@@ -0,0 +1,3 @@
+def monthly_total(subtotal_cents: int, fee_percent: int) -> int:
+    """Return ``subtotal_cents`` with ``fee_percent`` added to it as a raw number.
+
+    NOTE(review): tests/test_repairs.py expects a percentage-based fee
+    (10_000 at 5 -> 10_500), which this does not produce. Presumably that gap
+    is what the repair task is about, so the behaviour is left unchanged.
+    """
+    total = subtotal_cents
+    total += fee_percent
+    return total
+
diff --git a/tasks-public/assets/t4_delegation_repair/notifications.py b/tasks-public/assets/t4_delegation_repair/notifications.py
new file mode 100644
index 0000000..ccfda5f
--- /dev/null
+++ b/tasks-public/assets/t4_delegation_repair/notifications.py
@@ -0,0 +1,3 @@
+def subject_for(account_name: str, status: str) -> str:
+    """Return ``"[<status>] <account_name>"`` using both arguments verbatim.
+
+    NOTE(review): tests/test_repairs.py expects an upper-cased status and a
+    title-cased name. Presumably that gap is what the repair task is about,
+    so the behaviour is left unchanged.
+    """
+    return "[{}] {}".format(status, account_name)
+
diff --git a/tasks-public/assets/t4_delegation_repair/tests/test_repairs.py b/tasks-public/assets/t4_delegation_repair/tests/test_repairs.py
new file mode 100644
index 0000000..12dadcc
--- /dev/null
+++ b/tasks-public/assets/t4_delegation_repair/tests/test_repairs.py
@@ -0,0 +1,11 @@
+from billing import monthly_total
+from notifications import subject_for
+
+
+def test_monthly_total_applies_percentage_fee():
+ # 5 percent of 10_000 cents is 500, so the expected total is 10_500.
+ assert monthly_total(10_000, 5) == 10_500
+
+
+def test_subject_title_cases_name_and_uppercases_status():
+ # Status is upper-cased inside the brackets; each word of the name is capitalised.
+ assert subject_for("acme west", "warning") == "[WARNING] Acme West"
+
diff --git a/tasks-public/assets/t4_life_trip_plan/places.json b/tasks-public/assets/t4_life_trip_plan/places.json
new file mode 100644
index 0000000..da68bc6
--- /dev/null
+++ b/tasks-public/assets/t4_life_trip_plan/places.json
@@ -0,0 +1,91 @@
+{
+ "venues": [
+ {
+ "id": "fushimi_inari",
+ "name": "Fushimi Inari Shrine",
+ "type": "landmark",
+ "cost_usd": 0,
+ "vegetarian_friendly": true,
+ "mobility_friendly": false,
+ "notes": "Famous torii gates; the full hike is steep, but the lower shrine area is accessible"
+ },
+ {
+ "id": "kinkaku_ji",
+ "name": "Kinkaku-ji (Golden Pavilion)",
+ "type": "landmark",
+ "cost_usd": 5,
+ "vegetarian_friendly": true,
+ "mobility_friendly": true,
+ "notes": "Flat path around the pond"
+ },
+ {
+ "id": "arashiyama_bamboo",
+ "name": "Arashiyama Bamboo Grove",
+ "type": "landmark",
+ "cost_usd": 0,
+ "vegetarian_friendly": true,
+ "mobility_friendly": true,
+ "notes": "Flat paved path"
+ },
+ {
+ "id": "nishiki_market",
+ "name": "Nishiki Market",
+ "type": "food",
+ "cost_usd": 25,
+ "vegetarian_friendly": true,
+ "mobility_friendly": true,
+ "notes": "Indoor covered market"
+ },
+ {
+ "id": "shojin_ryori_kyoto",
+ "name": "Shoryori Tessenan",
+ "type": "restaurant",
+ "cost_usd": 45,
+ "vegetarian_friendly": true,
+ "mobility_friendly": true,
+ "notes": "Traditional Buddhist vegetarian cuisine"
+ },
+ {
+ "id": "wagyu_house",
+ "name": "Wagyu House Kyoto",
+ "type": "restaurant",
+ "cost_usd": 80,
+ "vegetarian_friendly": false,
+ "mobility_friendly": true
+ },
+ {
+ "id": "ryokan_central",
+ "name": "Ryokan Central Kyoto",
+ "type": "lodging",
+ "cost_usd": 220,
+ "vegetarian_friendly": true,
+ "mobility_friendly": true,
+ "notes": "3 nights"
+ },
+ {
+ "id": "philosophers_path",
+ "name": "Philosopher's Path",
+ "type": "landmark",
+ "cost_usd": 0,
+ "vegetarian_friendly": true,
+ "mobility_friendly": false,
+ "notes": "2km walk along canal \u2014 long for limited mobility"
+ },
+ {
+ "id": "kyoto_railway_museum",
+ "name": "Kyoto Railway Museum",
+ "type": "landmark",
+ "cost_usd": 12,
+ "vegetarian_friendly": true,
+ "mobility_friendly": true
+ },
+ {
+ "id": "tea_ceremony_class",
+ "name": "Camellia Tea Ceremony",
+ "type": "experience",
+ "cost_usd": 35,
+ "vegetarian_friendly": true,
+ "mobility_friendly": true
+ }
+ ]
+}
diff --git a/tasks-public/assets/t4_life_trip_plan/profile.yaml b/tasks-public/assets/t4_life_trip_plan/profile.yaml
new file mode 100644
index 0000000..7336daf
--- /dev/null
+++ b/tasks-public/assets/t4_life_trip_plan/profile.yaml
@@ -0,0 +1,10 @@
+user: zhentongfan
+trip: kyoto-long-weekend
+budget_usd_total: 800
+dietary: vegetarian
+mobility:
+ long_walks: false
+ many_stairs: false
+ notes: "Don't bounce back from packed days like I used to"
+must_include: ["Fushimi Inari"]
+trip_length_days: 3
diff --git a/tasks-public/assets/t4_life_trip_plan/verify_constraints_check.py b/tasks-public/assets/t4_life_trip_plan/verify_constraints_check.py
new file mode 100644
index 0000000..2d0d0a1
--- /dev/null
+++ b/tasks-public/assets/t4_life_trip_plan/verify_constraints_check.py
@@ -0,0 +1,66 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+                 ".jsonl", ".html", ".sh", ".py")
+
+# Task inputs that live next to the agent's output. places.json names every
+# venue (including the wagyu restaurant) and profile.yaml contains the word
+# "dietary", so counting them would satisfy the checks below by themselves.
+INPUT_FIXTURE_NAMES = ("places.json", "profile.yaml")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield ``(path, text)`` for each text file found recursively under ``root``.
+
+    A file is skipped when its path contains any ``EXCLUDE_FRAGMENTS`` entry,
+    when its suffix is not in ``TEXT_SUFFIXES``, or when it cannot be read.
+    Undecodable bytes are dropped instead of raising.
+    """
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        # rglob() on a relative root yields e.g. ".git/HEAD" with no leading
+        # separator, so "/.git/" would never match a top-level dot-directory.
+        # Match against a "/"-anchored POSIX form instead.
+        sp = "/" + path.as_posix()
+        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+            continue
+        if path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        try:
+            text = path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            # Best effort: an unreadable file is ignored, not fatal.
+            continue
+        yield path, text
+
+
+def workspace_blob() -> str:
+    """Return the joined text of scanned files, leaving out the task inputs."""
+    return "\n".join(
+        text for path, text in iter_workspace_text_files()
+        if path.name not in INPUT_FIXTURE_NAMES
+    )
+
+
+import json
+
+def main() -> int:
+    """Check the dietary constraint against the workspace text.
+
+    Reads venue data from ``places.json`` in the current directory. Returns 1
+    when that file is missing, when "wagyu" is mentioned without any exclusion
+    wording, or when no vegetarian-friendly venue name appears. Returns 0
+    otherwise. All matching is case-insensitive substring matching.
+    """
+    places_path = Path("places.json")
+    if not places_path.exists():
+        print("FAIL: places.json missing")
+        return 1
+    places = json.loads(places_path.read_text(encoding="utf-8"))
+    veg_venues = [v["name"].lower() for v in places["venues"] if v.get("vegetarian_friendly")]
+
+    blob = workspace_blob().lower()
+
+    # If wagyu mentioned, must be excluded
+    if "wagyu" in blob:
+        if not any(w in blob for w in ("not vegetarian", "skip", "exclude", "instead",
+                                       "alternative", "won't include", "dietary",
+                                       "won't be visit", "remov")):
+            print("FAIL: wagyu_house mentioned but not excluded for dietary reasons")
+            return 1
+
+    # Must reference at least one veg venue
+    if not any(name in blob for name in veg_venues):
+        print("FAIL: itinerary doesn't include any vegetarian-friendly venue")
+        return 1
+
+    print("PASS: dietary constraint honored")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t4_life_trip_plan/verify_landmark_present.py b/tasks-public/assets/t4_life_trip_plan/verify_landmark_present.py
new file mode 100644
index 0000000..5d8e032
--- /dev/null
+++ b/tasks-public/assets/t4_life_trip_plan/verify_landmark_present.py
@@ -0,0 +1,51 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+                 ".jsonl", ".html", ".sh", ".py")
+
+# Task inputs shipped with this asset pack. Both already contain the text
+# "Fushimi Inari", so counting them would make the check below always pass.
+INPUT_FIXTURE_NAMES = ("places.json", "profile.yaml")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield ``(path, text)`` for each text file found recursively under ``root``.
+
+    A file is skipped when its path contains any ``EXCLUDE_FRAGMENTS`` entry,
+    when its suffix is not in ``TEXT_SUFFIXES``, or when it cannot be read.
+    Undecodable bytes are dropped instead of raising.
+    """
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        # rglob() on a relative root yields e.g. ".git/HEAD" with no leading
+        # separator, so "/.git/" would never match a top-level dot-directory.
+        # Match against a "/"-anchored POSIX form instead.
+        sp = "/" + path.as_posix()
+        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+            continue
+        if path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        try:
+            text = path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            # Best effort: an unreadable file is ignored, not fatal.
+            continue
+        yield path, text
+
+
+def workspace_blob() -> str:
+    """Return the joined text of scanned files, leaving out the task inputs."""
+    return "\n".join(
+        text for path, text in iter_workspace_text_files()
+        if path.name not in INPUT_FIXTURE_NAMES
+    )
+
+
+def main() -> int:
+    """Check that the workspace text mentions every required landmark.
+
+    Returns 0 after printing a PASS line, or 1 after printing a FAIL reason.
+    Matching is a case-insensitive substring test.
+    """
+    blob = workspace_blob().lower()
+    if not blob:
+        print("FAIL: workspace contains no agent-written text files")
+        return 1
+    needed = ['fushimi inari']
+    missing = [s for s in needed if s not in blob]
+    if missing:
+        print(f"FAIL: workspace missing required content: {missing}")
+        return 1
+    print("PASS: t4_life_trip_plan/verify_landmark_present.py")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t4_life_trip_plan/verify_no_fab_places.py b/tasks-public/assets/t4_life_trip_plan/verify_no_fab_places.py
new file mode 100644
index 0000000..160cd3f
--- /dev/null
+++ b/tasks-public/assets/t4_life_trip_plan/verify_no_fab_places.py
@@ -0,0 +1,82 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+                 ".jsonl", ".html", ".sh", ".py")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield ``(path, text)`` for each text file found recursively under ``root``.
+
+    A file is skipped when its path contains any ``EXCLUDE_FRAGMENTS`` entry,
+    when its suffix is not in ``TEXT_SUFFIXES``, or when it cannot be read.
+    Undecodable bytes are dropped instead of raising.
+    """
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        # rglob() on a relative root yields e.g. ".git/HEAD" with no leading
+        # separator, so "/.git/" would never match a top-level dot-directory.
+        # Match against a "/"-anchored POSIX form instead.
+        sp = "/" + path.as_posix()
+        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+            continue
+        if path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        try:
+            text = path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            # Best effort: an unreadable file is ignored, not fatal.
+            continue
+        yield path, text
+
+
+def workspace_blob() -> str:
+    """Return the text of every scanned workspace file, joined with newlines."""
+    return "\n".join(text for _, text in iter_workspace_text_files())
+
+
+import json, re
+
+def main() -> int:
+    """Fail when the itinerary names a capitalised place absent from places.json.
+
+    The itinerary is the first scanned file that mentions "fushimi inari" and
+    a day/time marker. Runs of two to five capitalised words are treated as
+    place candidates. A candidate is accepted when it overlaps a venue name
+    from ``places.json`` or contains one of the generic allow-listed words.
+    Returns 0 on PASS and 1 on any FAIL.
+    """
+    places_path = Path("places.json")
+    if not places_path.exists():
+        print("FAIL: places.json missing from workspace")
+        return 1
+    places = json.loads(places_path.read_text(encoding="utf-8"))
+    real_names = {v["name"].lower() for v in places["venues"]}
+
+    # Find the itinerary in any text file
+    itinerary_text = None
+    for path, text in iter_workspace_text_files():
+        text_lower = text.lower()
+        if "fushimi inari" in text_lower and any(d in text_lower for d in ("day 1", "day1", "morning", "afternoon")):
+            # Keep the original casing: the candidate regex below keys on
+            # capital letters and can never match lower-cased text.
+            itinerary_text = text
+            break
+
+    if itinerary_text is None:
+        print("FAIL: no itinerary mentioning Fushimi Inari found anywhere")
+        return 1
+
+    # Look for capitalized multi-word place candidates
+    # NOTE(review): Title Case headings outside the allow-list below are
+    # reported as well; widen the list if that proves too strict.
+    candidates = re.findall(r"[A-Z][a-zA-Z\-']+(?:[ \-][A-Z][a-zA-Z\-']+){1,4}", itinerary_text)
+    suspicious = []
+    for cand in candidates:
+        cl = cand.lower()
+        if any(rn in cl or cl in rn for rn in real_names):
+            continue
+        if any(g in cl for g in ("day", "morning", "afternoon", "evening", "kyoto",
+                                 "japan", "trip", "plan", "fushimi inari", "buddhist",
+                                 "tea ceremony", "rail", "bamboo", "shrine", "market",
+                                 "ryokan", "vegetarian", "free", "low key", "mobility",
+                                 "lunch", "dinner", "breakfast", "early", "late",
+                                 "transit", "central", "english", "long weekend",
+                                 "philosopher", "philosophers")):
+            continue
+        suspicious.append(cand)
+    if suspicious:
+        print(f"FAIL: itinerary mentions non-real places: {sorted(set(suspicious))[:5]}")
+        return 1
+    print("PASS: no fabricated places in itinerary")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t4_memory_recall_continuation/docs/release_notes.md b/tasks-public/assets/t4_memory_recall_continuation/docs/release_notes.md
new file mode 100644
index 0000000..a72c652
--- /dev/null
+++ b/tasks-public/assets/t4_memory_recall_continuation/docs/release_notes.md
@@ -0,0 +1,19 @@
+# Release 2026.2 Rollout Plan
+
+## Feature flags to configure
+
+The following three facts need to land in `flags.py`:
+
+- **Beta rollout regions**: `us`, `eu`
+- **Retry budget**: `3`
+- **APAC gating**: APAC remains gated until release `2026.3` (store this as `APAC_GATED_UNTIL`).
+
+## Operational notes (informational only — NOT flags)
+
+- Nightly build cadence: every 4 hours.
+- Release captain on-call: alice@example.com.
+- Feature freeze: 2026-04-30.
+
+The operational notes above do NOT need to be codified into `flags.py` —
+they are team-facing context only. Only the three flags in the first
+section belong in code.
diff --git a/tasks-public/assets/t4_memory_recall_continuation/flags.py b/tasks-public/assets/t4_memory_recall_continuation/flags.py
new file mode 100644
index 0000000..898f512
--- /dev/null
+++ b/tasks-public/assets/t4_memory_recall_continuation/flags.py
@@ -0,0 +1,4 @@
+# Rollout flags imported by tests/test_flags.py.
+BETA_REGIONS: list[str] = []
+RETRY_BUDGET: int = 0
+# Release at which APAC gating lifts. Leave empty until rollout plan confirmed.
+APAC_GATED_UNTIL: str = ""
diff --git a/tasks-public/assets/t4_memory_recall_continuation/tests/test_flags.py b/tasks-public/assets/t4_memory_recall_continuation/tests/test_flags.py
new file mode 100644
index 0000000..b474b43
--- /dev/null
+++ b/tasks-public/assets/t4_memory_recall_continuation/tests/test_flags.py
@@ -0,0 +1,14 @@
+from flags import BETA_REGIONS, RETRY_BUDGET, APAC_GATED_UNTIL
+
+
+def test_beta_regions():
+ # docs/release_notes.md lists the beta rollout regions as us and eu.
+ assert BETA_REGIONS == ["us", "eu"]
+
+
+def test_retry_budget():
+ # docs/release_notes.md sets the retry budget to 3.
+ assert RETRY_BUDGET == 3
+
+
+def test_apac_gated_until():
+ # APAC gating lifts at release 2026.3 per the rollout plan.
+ assert APAC_GATED_UNTIL == "2026.3"
diff --git a/tasks-public/assets/t4_memory_recall_continuation/verify_handoff.py b/tasks-public/assets/t4_memory_recall_continuation/verify_handoff.py
new file mode 100644
index 0000000..d65cfaf
--- /dev/null
+++ b/tasks-public/assets/t4_memory_recall_continuation/verify_handoff.py
@@ -0,0 +1,66 @@
+"""Recursive workspace search verifier — checks that handoff.md (or any
+alternate .md/.txt the agent wrote) captures all three flags.
+
+This task tests multi-entry memory recall; each fact must appear in the
+handoff artifact regardless of filename or formatting style."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+    "release_notes.md",  # don't count re-reads of the source doc
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield ``(path, text)`` for each text file found recursively under ``root``.
+
+    A file is skipped when its path contains any ``EXCLUDE_FRAGMENTS`` entry,
+    when its suffix is not in ``TEXT_SUFFIXES``, or when it cannot be read.
+    Undecodable bytes are dropped instead of raising.
+    """
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        # rglob() on a relative root yields e.g. ".git/HEAD" with no leading
+        # separator, so "/.git/" would never match a top-level dot-directory.
+        # Match against a "/"-anchored POSIX form instead.
+        sp = "/" + path.as_posix()
+        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+            continue
+        if path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        try:
+            text = path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            # Best effort: an unreadable file is ignored, not fatal.
+            continue
+        yield path, text
+
+
+def workspace_blob() -> str:
+    """Return the text of every scanned workspace file, joined with newlines."""
+    return "\n".join(text for _, text in iter_workspace_text_files())
+
+
+def main() -> int:
+    """Check that the handoff text records all three rollout facts.
+
+    Returns 0 on PASS, or 1 after printing the first missing fact. Region
+    codes and the retry count are matched as standalone tokens so that words
+    like "status" or "reuse", or the "3" inside "2026.3", do not count.
+    """
+    import re
+
+    blob = workspace_blob().lower()
+    if not blob.strip():
+        print("FAIL: no agent-written text artifacts found in workspace")
+        return 1
+
+    def has_word(word: str) -> bool:
+        return re.search(rf"\b{re.escape(word)}\b", blob) is not None
+
+    # Fact 1: beta regions us + eu
+    if not (has_word("us") and has_word("eu")):
+        print("FAIL: handoff missing beta regions (expected 'us' and 'eu')")
+        return 1
+
+    # Fact 2: retry budget 3
+    # A bare 3 that is not part of a longer number or a dotted version; the
+    # spelled-out form is accepted as well.
+    has_three = (
+        re.search(r"(?<![\d.])3(?!\d|\.\d)", blob) is not None or has_word("three")
+    )
+    if not has_three or "retry" not in blob:
+        print("FAIL: handoff missing retry budget fact (expected '3' and 'retry')")
+        return 1
+
+    # Fact 3: APAC gated until 2026.3
+    if "apac" not in blob or "2026.3" not in blob:
+        print("FAIL: handoff missing APAC gating fact (expected 'apac' and '2026.3')")
+        return 1
+
+    print("PASS: handoff captures beta regions, retry budget, and APAC gating")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t5_hallucination_resistant_evidence/docs/maintenance_notes.md b/tasks-public/assets/t5_hallucination_resistant_evidence/docs/maintenance_notes.md
new file mode 100644
index 0000000..c91380e
--- /dev/null
+++ b/tasks-public/assets/t5_hallucination_resistant_evidence/docs/maintenance_notes.md
@@ -0,0 +1,6 @@
+Maintenance notes for build 3.2:
+
+- Support window: 18 months
+- Security fixes continue during the full support window
+- Build 3.1 remains on extended support only
+
diff --git a/tasks-public/assets/t5_hallucination_resistant_evidence/verify_answer.py b/tasks-public/assets/t5_hallucination_resistant_evidence/verify_answer.py
new file mode 100644
index 0000000..2e125b7
--- /dev/null
+++ b/tasks-public/assets/t5_hallucination_resistant_evidence/verify_answer.py
@@ -0,0 +1,15 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+
+def main() -> None:
+    """Raise AssertionError unless answer.txt and evidence.md hold the expected grounded answer."""
+    answer = Path("answer.txt").read_text(encoding="utf-8").strip()
+    evidence = Path("evidence.md").read_text(encoding="utf-8")
+    if answer != "Support window: 18 months" or "maintenance_notes.md" not in evidence or "18 months" not in evidence:
+        raise AssertionError(f"unexpected answer {answer!r} or uncited evidence")  # bare asserts vanish under -O
+
+
+if __name__ == "__main__":
+ main()  # a failed check raises, so the uncaught exception yields a non-zero exit status
diff --git a/tasks-public/tier1/t1-bugfix-discount.yaml b/tasks-public/tier1/t1-bugfix-discount.yaml
new file mode 100644
index 0000000..f5d9f4f
--- /dev/null
+++ b/tasks-public/tier1/t1-bugfix-discount.yaml
@@ -0,0 +1,68 @@
+id: t1-bugfix-discount
+name: "Tier 1: Bugfix Discount"
+tier: tier1
+family: coding
+surface: coding
+pool: public_dev
+subsets: [consensus]
+capabilities: [bugfix]
+timeout_seconds: 360
+setup:
+ asset_packs:
+ - t1_bugfix_discount
+
+user:
+ max_turns: 2
+ turns:
+ - message: "There is a discount bug in the workspace Python files. Fix it so percentage discounts work correctly for carts with multiple items, then verify the tests pass."
+
+completion:
+ execution_checks:
+ - name: "pricing tests"
+ command: "pytest -q"
+
+trajectory:
+ required_families: ["read", "edit", "execute"]
+ min_distinct_families: 3
+ require_read_before_mutation: true
+ require_self_verification: true
+ expect_recovery: true
+
+behavior:
+ require_plan: false
+ require_progress_updates: false
+
+judge:
+ rubric: |
+ You are grading a single ClawBench agent run for task t1-bugfix-discount (Tier 1: Bugfix Discount).
+
+ Score the agent on a scale from 0.0 to 1.0:
+ 1.0 = the agent's output fully and correctly answers what the user asked,
+ in a way the user could actually use without rework.
+ 0.7 = mostly correct and usable, with minor issues or missed nuances.
+ 0.4 = the agent attempted the task and produced something on-topic, but
+ the result is incomplete, partially wrong, or hard to use.
+ 0.0 = the agent failed entirely, refused without justification, or
+ fabricated information.
+
+ Important grading guidance:
+ - Don't penalize the agent for writing artifacts to a non-standard path
+ (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+ is that the user could find and use the result, not which exact filename
+ or directory was used. Search the entire workspace for the agent's work.
+ - Don't penalize the agent for being terse or for skipping non-essential
+ structure if the core deliverable is correct.
+ - DO penalize hallucinated content, missing required information, and
+ refusal to engage with the task.
+ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+ facts).
+
+ Capability tags for this task: bugfix.
+ Task family: coding.
+
+ Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+ passing_threshold: 0.7
+ include_transcript: true
+ include_completion_feedback: true
+ max_artifact_chars: 6000
+ max_transcript_chars: 6000
diff --git a/tasks-public/tier1/t1-fs-quick-note.yaml b/tasks-public/tier1/t1-fs-quick-note.yaml
new file mode 100644
index 0000000..4a4297b
--- /dev/null
+++ b/tasks-public/tier1/t1-fs-quick-note.yaml
@@ -0,0 +1,67 @@
+id: t1-fs-quick-note
+name: "Tier 1: Quick Note"
+tier: tier1
+family: tools
+surface: tools
+pool: public_dev
+subsets: [consensus]
+capabilities: [structured_output]
+timeout_seconds: 240
+setup:
+ asset_packs:
+ - t1_fs_quick_note
+
+user:
+ max_turns: 1
+ turns:
+ - message: "Jot down what I just told my partner so I don't forget — pick up dry cleaning Thursday, Sam's recital Saturday at 4, and we owe the babysitter 60 bucks."
+
+completion:
+ execution_checks:
+ - name: "all three items present"
+ command: "python3 verify_three_items.py"
+ - name: "structured as a list, not a paragraph"
+ command: "python3 verify_list_structure.py"
+
+trajectory:
+ required_families: ["edit"]
+ min_distinct_families: 1
+
+behavior:
+ require_plan: false
+ require_progress_updates: false
+
+judge:
+ rubric: |
+ You are grading a single ClawBench agent run for task t1-fs-quick-note (Tier 1: Quick Note).
+
+ Score the agent on a scale from 0.0 to 1.0:
+ 1.0 = the agent's output fully and correctly answers what the user asked,
+ in a way the user could actually use without rework.
+ 0.7 = mostly correct and usable, with minor issues or missed nuances.
+ 0.4 = the agent attempted the task and produced something on-topic, but
+ the result is incomplete, partially wrong, or hard to use.
+ 0.0 = the agent failed entirely, refused without justification, or
+ fabricated information.
+
+ Important grading guidance:
+ - Don't penalize the agent for writing artifacts to a non-standard path
+ (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+ is that the user could find and use the result, not which exact filename
+ or directory was used. Search the entire workspace for the agent's work.
+ - Don't penalize the agent for being terse or for skipping non-essential
+ structure if the core deliverable is correct.
+ - DO penalize hallucinated content, missing required information, and
+ refusal to engage with the task.
+ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+ facts).
+
+ Capability tags for this task: structured_output.
+ Task family: tools.
+
+ Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+ passing_threshold: 0.7
+ include_transcript: true
+ include_completion_feedback: true
+ max_artifact_chars: 6000
+ max_transcript_chars: 6000
diff --git a/tasks-public/tier2/t2-add-tests-normalizer.yaml b/tasks-public/tier2/t2-add-tests-normalizer.yaml
new file mode 100644
index 0000000..9517912
--- /dev/null
+++ b/tasks-public/tier2/t2-add-tests-normalizer.yaml
@@ -0,0 +1,74 @@
+id: t2-add-tests-normalizer
+name: "Tier 2: Add Tests for Normalizer"
+tier: tier2
+family: coding
+surface: coding
+pool: public_dev
+subsets: [consensus, hard]
+capabilities: [test_authoring]
+timeout_seconds: 480
+setup:
+ asset_packs:
+ - t2_add_tests_normalizer
+
+user:
+ max_turns: 2
+ turns:
+ - message: "The workspace has a text normalization module but no reliable tests. Add a focused pytest suite that covers whitespace cleanup, emoji stripping in titles, and blank tag handling, then run the tests."
+
+completion:
+ files:
+ - path: tests/test_normalizer.py
+ exists: true
+ execution_checks:
+ - name: "normalizer test quality verify"
+ command: "python3 verify_added_tests.py"
+
+trajectory:
+ required_families: ["read", "edit", "execute"]
+ required_pre_edit_families: ["read"]
+ required_post_edit_families: ["execute"]
+ min_distinct_families: 3
+ min_pre_edit_exploration_calls: 1
+ min_post_edit_verification_calls: 1
+ require_read_before_mutation: true
+ require_self_verification: true
+
+behavior:
+ require_plan: false
+ require_progress_updates: false
+
+judge:
+ rubric: |
+ You are grading a single ClawBench agent run for task t2-add-tests-normalizer (Tier 2: Add Tests for Normalizer).
+
+ Score the agent on a scale from 0.0 to 1.0:
+ 1.0 = the agent's output fully and correctly answers what the user asked,
+ in a way the user could actually use without rework.
+ 0.7 = mostly correct and usable, with minor issues or missed nuances.
+ 0.4 = the agent attempted the task and produced something on-topic, but
+ the result is incomplete, partially wrong, or hard to use.
+ 0.0 = the agent failed entirely, refused without justification, or
+ fabricated information.
+
+ Important grading guidance:
+ - Don't penalize the agent for writing artifacts to a non-standard path
+ (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+ is that the user could find and use the result, not which exact filename
+ or directory was used. Search the entire workspace for the agent's work.
+ - Don't penalize the agent for being terse or for skipping non-essential
+ structure if the core deliverable is correct.
+ - DO penalize hallucinated content, missing required information, and
+ refusal to engage with the task.
+ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+ facts).
+
+ Capability tags for this task: test_authoring.
+ Task family: coding.
+
+ Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+ passing_threshold: 0.7
+ include_transcript: true
+ include_completion_feedback: true
+ max_artifact_chars: 6000
+ max_transcript_chars: 6000
diff --git a/tasks-public/tier2/t2-browser-form-fix.yaml b/tasks-public/tier2/t2-browser-form-fix.yaml
new file mode 100644
index 0000000..58312dd
--- /dev/null
+++ b/tasks-public/tier2/t2-browser-form-fix.yaml
@@ -0,0 +1,78 @@
+id: t2-browser-form-fix
+name: "Tier 2: Browser Form Fix"
+tier: tier2
+family: browser
+surface: browser
+pool: public_dev
+subsets: [hard]
+capabilities: [browser_debugging, bugfix]
+timeout_seconds: 600
+setup:
+ asset_packs:
+ - t2_browser_form_fix
+ background_services:
+ - name: form_app
+ command: "python3 serve.py"
+ ready_path: "/health"
+ startup_timeout_seconds: 20
+
+user:
+ max_turns: 2
+ turns:
+ - message: "There is a broken newsletter signup page running at http://127.0.0.1:{form_app_port}/. Use the browser tool to reproduce the bug in the host browser, fix the frontend code in the workspace, and verify the form succeeds. If the browser tool requires a target, use `host`."
+
+completion:
+ execution_checks:
+ - name: "browser form verification"
+ command: "node verify_form.cjs http://127.0.0.1:{form_app_port}/"
+ env:
+ NODE_PATH: "{openclaw_node_path}:{benchmark_node_path}"
+
+trajectory:
+ required_families: ["browser", "edit", "execute"]
+ required_pre_edit_families: ["browser"]
+ required_post_edit_families: ["execute"]
+ min_distinct_families: 3
+ min_pre_edit_exploration_calls: 1
+ min_post_edit_verification_calls: 1
+ require_read_before_mutation: true
+ require_self_verification: true
+
+behavior:
+ require_plan: false
+ require_progress_updates: false
+
+judge:
+ rubric: |
+ You are grading a single ClawBench agent run for task t2-browser-form-fix (Tier 2: Browser Form Fix).
+
+ Score the agent on a scale from 0.0 to 1.0:
+ 1.0 = the agent's output fully and correctly answers what the user asked,
+ in a way the user could actually use without rework.
+ 0.7 = mostly correct and usable, with minor issues or missed nuances.
+ 0.4 = the agent attempted the task and produced something on-topic, but
+ the result is incomplete, partially wrong, or hard to use.
+ 0.0 = the agent failed entirely, refused without justification, or
+ fabricated information.
+
+ Important grading guidance:
+ - Don't penalize the agent for writing artifacts to a non-standard path
+ (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+ is that the user could find and use the result, not which exact filename
+ or directory was used. Search the entire workspace for the agent's work.
+ - Don't penalize the agent for being terse or for skipping non-essential
+ structure if the core deliverable is correct.
+ - DO penalize hallucinated content, missing required information, and
+ refusal to engage with the task.
+ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+ facts).
+
+ Capability tags for this task: browser_debugging, bugfix.
+ Task family: browser.
+
+ Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+ passing_threshold: 0.7
+ include_transcript: true
+ include_completion_feedback: true
+ max_artifact_chars: 6000
+ max_transcript_chars: 6000
diff --git a/tasks-public/tier2/t2-config-loader.yaml b/tasks-public/tier2/t2-config-loader.yaml
new file mode 100644
index 0000000..086fb14
--- /dev/null
+++ b/tasks-public/tier2/t2-config-loader.yaml
@@ -0,0 +1,69 @@
+id: t2-config-loader
+name: "Tier 2: Config Loader"
+tier: tier2
+family: repo
+surface: coding
+pool: public_dev
+subsets: [consensus]
+capabilities: [bugfix, multifile_reasoning]
+timeout_seconds: 480
+setup:
+ asset_packs:
+ - t2_config_loader
+
+user:
+ max_turns: 2
+ turns:
+ - message: "The config loader in the workspace is supposed to merge defaults, file values, and environment overrides. Fix the precedence and validation bugs so the pytest suite passes."
+
+completion:
+ execution_checks:
+ - name: "config loader tests"
+ command: "pytest -q"
+
+trajectory:
+ required_families: ["read", "edit", "execute"]
+ min_distinct_families: 3
+ min_distinct_read_targets_pre_edit: 2
+ require_read_before_mutation: true
+ require_self_verification: true
+ expect_recovery: true
+
+behavior:
+ require_plan: false
+ require_progress_updates: false
+
+judge:
+ rubric: |
+ You are grading a single ClawBench agent run for task t2-config-loader (Tier 2: Config Loader).
+
+ Score the agent on a scale from 0.0 to 1.0:
+ 1.0 = the agent's output fully and correctly answers what the user asked,
+ in a way the user could actually use without rework.
+ 0.7 = mostly correct and usable, with minor issues or missed nuances.
+ 0.4 = the agent attempted the task and produced something on-topic, but
+ the result is incomplete, partially wrong, or hard to use.
+ 0.0 = the agent failed entirely, refused without justification, or
+ fabricated information.
+
+ Important grading guidance:
+ - Don't penalize the agent for writing artifacts to a non-standard path
+ (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+ is that the user could find and use the result, not which exact filename
+ or directory was used. Search the entire workspace for the agent's work.
+ - Don't penalize the agent for being terse or for skipping non-essential
+ structure if the core deliverable is correct.
+ - DO penalize hallucinated content, missing required information, and
+ refusal to engage with the task.
+ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+ facts).
+
+ Capability tags for this task: bugfix, multifile_reasoning.
+ Task family: repo.
+
+ Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+ passing_threshold: 0.7
+ include_transcript: true
+ include_completion_feedback: true
+ max_artifact_chars: 6000
+ max_transcript_chars: 6000
diff --git a/tasks-public/tier2/t2-fs-find-that-thing.yaml b/tasks-public/tier2/t2-fs-find-that-thing.yaml
new file mode 100644
index 0000000..ce3fb6a
--- /dev/null
+++ b/tasks-public/tier2/t2-fs-find-that-thing.yaml
@@ -0,0 +1,81 @@
+id: t2-fs-find-that-thing
+name: "Tier 2: Find That Thing"
+tier: tier2
+family: tools
+surface: tools
+pool: public_dev
+subsets: [hard]
+capabilities: [tool_composition]
+timeout_seconds: 480
+# Scenario: file_system_ops
+# Tests fuzzy file recall — the user remembers attributes, not the name.
+# Workspace has 200+ files. Three plausible candidates exist; only one
+# matches all the constraints (modified in the right week, contains the
+# right keywords, the right file type).
+
+setup:
+ asset_packs:
+ - t2_fs_find_that_thing
+
+user:
+ max_turns: 3
+ turns:
+ - message: |
+ I'm looking for that spreadsheet I worked on a couple weeks back —
+ had something to do with the Q3 marketing budget I think? It had
+ the regional breakdowns. Can you dig it up and tell me where it
+ lives?
+ - message: "Once you find it, copy it to my desktop area as q3_marketing_budget.xlsx so I don't lose it again."
+ after_assistant_turns: 1
+
+completion:
+ execution_checks:
+ - name: "correct file identified and copied (rejects near-miss decoys)"
+ command: "python3 verify_correct_file.py"
+
+trajectory:
+ required_families: ["read", "search"]
+ min_distinct_families: 2
+ min_pre_edit_exploration_calls: 4
+ min_distinct_read_targets_pre_edit: 3
+ require_read_before_mutation: true
+
+behavior:
+ require_plan: false
+ require_progress_updates: true
+ require_blocker_explanation: true
+
+judge:
+ rubric: |
+ You are grading a single ClawBench agent run for task t2-fs-find-that-thing (Tier 2: Find That Thing).
+
+ Score the agent on a scale from 0.0 to 1.0:
+ 1.0 = the agent's output fully and correctly answers what the user asked,
+ in a way the user could actually use without rework.
+ 0.7 = mostly correct and usable, with minor issues or missed nuances.
+ 0.4 = the agent attempted the task and produced something on-topic, but
+ the result is incomplete, partially wrong, or hard to use.
+ 0.0 = the agent failed entirely, refused without justification, or
+ fabricated information.
+
+ Important grading guidance:
+ - Don't penalize the agent for writing artifacts to a non-standard path
+ (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+ is that the user could find and use the result, not which exact filename
+ or directory was used. Search the entire workspace for the agent's work.
+ - Don't penalize the agent for being terse or for skipping non-essential
+ structure if the core deliverable is correct.
+ - DO penalize hallucinated content, missing required information, and
+ refusal to engage with the task.
+ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+ facts).
+
+ Capability tags for this task: tool_composition.
+ Task family: tools.
+
+ Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+ passing_threshold: 0.7
+ include_transcript: true
+ include_completion_feedback: true
+ max_artifact_chars: 6000
+ max_transcript_chars: 6000
diff --git a/tasks-public/tier2/t2-msg-summarize-thread.yaml b/tasks-public/tier2/t2-msg-summarize-thread.yaml
new file mode 100644
index 0000000..c075445
--- /dev/null
+++ b/tasks-public/tier2/t2-msg-summarize-thread.yaml
@@ -0,0 +1,83 @@
+id: t2-msg-summarize-thread
+name: "Tier 2: Summarize Thread"
+tier: tier2
+family: tools
+surface: tools
+pool: public_dev
+subsets: [consensus]
+capabilities: [research_synthesis, structured_output]
+timeout_seconds: 480
+# Scenario: communication_messaging
+# Long Slack-style thread (60+ messages) across 3 days. The user wants
+# the gist, the decisions made, the open questions, and any commitments
+# made on their behalf. The thread contains contradictions resolved later
+# (an early "let's go with option A" overridden by a later "scratch that").
+#
+# Frontier separator: agent must use the LATEST decision, not the first
+# mention. Verifier checks that overruled decisions are not in the summary.
+
+setup:
+ asset_packs:
+ - t2_msg_summarize_thread
+
+user:
+ max_turns: 1
+ turns:
+ - message: |
+ I was offline for a few days and the design channel exploded. Can
+ you catch me up? I just need to know what got decided, what's
+ still open, and if I committed to anything I'm forgetting about.
+
+completion:
+ execution_checks:
+ - name: "summary structure (decisions / open / commitments)"
+ command: "python3 verify_summary_structure.py"
+ - name: "overruled early decisions not in summary"
+ command: "python3 verify_latest_decision.py"
+ - name: "user commitments correctly attributed"
+ command: "python3 verify_commitments.py"
+
+trajectory:
+ required_families: ["read", "edit"]
+ min_distinct_families: 2
+ min_pre_edit_exploration_calls: 2
+ require_read_before_mutation: true
+
+behavior:
+ require_plan: false
+ require_progress_updates: false
+
+judge:
+ rubric: |
+ You are grading a single ClawBench agent run for task t2-msg-summarize-thread (Tier 2: Summarize Thread).
+
+ Score the agent on a scale from 0.0 to 1.0:
+ 1.0 = the agent's output fully and correctly answers what the user asked,
+ in a way the user could actually use without rework.
+ 0.7 = mostly correct and usable, with minor issues or missed nuances.
+ 0.4 = the agent attempted the task and produced something on-topic, but
+ the result is incomplete, partially wrong, or hard to use.
+ 0.0 = the agent failed entirely, refused without justification, or
+ fabricated information.
+
+ Important grading guidance:
+ - Don't penalize the agent for writing artifacts to a non-standard path
+ (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+ is that the user could find and use the result, not which exact filename
+ or directory was used. Search the entire workspace for the agent's work.
+ - Don't penalize the agent for being terse or for skipping non-essential
+ structure if the core deliverable is correct.
+ - DO penalize hallucinated content, missing required information, and
+ refusal to engage with the task.
+ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+ facts).
+
+ Capability tags for this task: research_synthesis, structured_output.
+ Task family: tools.
+
+ Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+ passing_threshold: 0.7
+ include_transcript: true
+ include_completion_feedback: true
+ max_artifact_chars: 6000
+ max_transcript_chars: 6000
diff --git a/tasks-public/tier2/t2-priv-redact-doc.yaml b/tasks-public/tier2/t2-priv-redact-doc.yaml
new file mode 100644
index 0000000..75499d8
--- /dev/null
+++ b/tasks-public/tier2/t2-priv-redact-doc.yaml
@@ -0,0 +1,64 @@
+id: t2-priv-redact-doc
+name: "Tier 2: Redact Document"
+tier: tier2
+family: tools
+surface: tools
+pool: public_dev
+subsets: [consensus]
+capabilities: [structured_output, graceful_refusal]
+timeout_seconds: 480
+setup:
+ asset_packs:
+ - t2_priv_redact_doc
+
+user:
+ max_turns: 1
+ turns:
+ - message: "I need to share contract.txt with a vendor but it has personal info that has to come out — names, phone numbers, account numbers. Make a redacted copy I can send."
+
+completion:
+ execution_checks:
+ - name: "PII redacted, structure preserved, original intact"
+ command: "python3 verify_redaction.py"
+
+trajectory:
+ required_families: ["read", "edit"]
+ min_distinct_families: 2
+
+behavior:
+ require_plan: false
+
+judge:
+ rubric: |
+ You are grading a single ClawBench agent run for task t2-priv-redact-doc (Tier 2: Redact Document).
+
+ Score the agent on a scale from 0.0 to 1.0:
+ 1.0 = the agent's output fully and correctly answers what the user asked,
+ in a way the user could actually use without rework.
+ 0.7 = mostly correct and usable, with minor issues or missed nuances.
+ 0.4 = the agent attempted the task and produced something on-topic, but
+ the result is incomplete, partially wrong, or hard to use.
+ 0.0 = the agent failed entirely, refused without justification, or
+ fabricated information.
+
+ Important grading guidance:
+ - Don't penalize the agent for writing artifacts to a non-standard path
+ (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+ is that the user could find and use the result, not which exact filename
+ or directory was used. Search the entire workspace for the agent's work.
+ - Don't penalize the agent for being terse or for skipping non-essential
+ structure if the core deliverable is correct.
+ - DO penalize hallucinated content, missing required information, and
+ refusal to engage with the task.
+ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+ facts).
+
+ Capability tags for this task: structured_output, graceful_refusal.
+ Task family: tools.
+
+ Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+ passing_threshold: 0.7
+ include_transcript: true
+ include_completion_feedback: true
+ max_artifact_chars: 6000
+ max_transcript_chars: 6000
diff --git a/tasks-public/tier3/t3-data-pipeline-report.yaml b/tasks-public/tier3/t3-data-pipeline-report.yaml
new file mode 100644
index 0000000..4d40f4f
--- /dev/null
+++ b/tasks-public/tier3/t3-data-pipeline-report.yaml
@@ -0,0 +1,69 @@
+id: t3-data-pipeline-report
+name: "Tier 3: Data Pipeline Report"
+tier: tier3
+family: multi_tool
+surface: coding
+pool: public_dev
+subsets: [consensus]
+capabilities: [structured_output, tool_composition]
+timeout_seconds: 600
+setup:
+ asset_packs:
+ - t3_data_pipeline_report
+
+user:
+ max_turns: 2
+ turns:
+ - message: "Build the missing data pipeline steps in the workspace so `python3 pipeline.py input/sales.csv input/regions.json` prints the expected region report. Verify the final output."
+
+completion:
+ execution_checks:
+ - name: "pipeline report output"
+ command: "python3 pipeline.py input/sales.csv input/regions.json"
+ expected_stdout_file: "expected/report.txt"
+
+trajectory:
+ required_families: ["read", "edit", "execute"]
+ min_distinct_families: 3
+ require_read_before_mutation: true
+ require_self_verification: true
+ expect_recovery: true
+
+behavior:
+ require_plan: true
+ require_progress_updates: true
+
+judge:
+ rubric: |
+ You are grading a single ClawBench agent run for task t3-data-pipeline-report (Tier 3: Data Pipeline Report).
+
+ Score the agent on a scale from 0.0 to 1.0:
+ 1.0 = the agent's output fully and correctly answers what the user asked,
+ in a way the user could actually use without rework.
+ 0.7 = mostly correct and usable, with minor issues or missed nuances.
+ 0.4 = the agent attempted the task and produced something on-topic, but
+ the result is incomplete, partially wrong, or hard to use.
+ 0.0 = the agent failed entirely, refused without justification, or
+ fabricated information.
+
+ Important grading guidance:
+ - Don't penalize the agent for writing artifacts to a non-standard path
+ (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+ is that the user could find and use the result, not which exact filename
+ or directory was used. Search the entire workspace for the agent's work.
+ - Don't penalize the agent for being terse or for skipping non-essential
+ structure if the core deliverable is correct.
+ - DO penalize hallucinated content, missing required information, and
+ refusal to engage with the task.
+ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+ facts).
+
+ Capability tags for this task: structured_output, tool_composition.
+ Task family: multi_tool.
+
+ Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+ passing_threshold: 0.7
+ include_transcript: true
+ include_completion_feedback: true
+ max_artifact_chars: 6000
+ max_transcript_chars: 6000
diff --git a/tasks-public/tier3/t3-data-sql-query.yaml b/tasks-public/tier3/t3-data-sql-query.yaml
new file mode 100644
index 0000000..b69947a
--- /dev/null
+++ b/tasks-public/tier3/t3-data-sql-query.yaml
@@ -0,0 +1,74 @@
+id: t3-data-sql-query
+name: "Tier 3: SQL From Natural Language"
+tier: tier3
+family: tools
+surface: tools
+pool: public_dev
+subsets: [hard]
+capabilities: [structured_output]
+timeout_seconds: 720
+# SQLite db with 5 tables, one with a join trap (foreign key relationship
+# is not what the column name suggests). Frontier separator: agent must
+# inspect schema before assuming joins.
+
+setup:
+ asset_packs:
+ - t3_data_sql_query
+
+user:
+ max_turns: 2
+ turns:
+ - message: |
+ users.db has all our signup data. Can you tell me how many active
+ users from Europe signed up in 2026 and which acquisition channels
+ brought them in? Output a CSV.
+ - message: "If anything in the schema is weird, double-check before assuming."
+ after_assistant_turns: 1
+
+completion:
+ execution_checks:
+ - name: "result matches reference query (correct rows + channels, no join trap)"
+ command: "python3 verify_results.py"
+
+trajectory:
+ required_families: ["read", "edit", "execute"]
+ min_distinct_families: 3
+
+behavior:
+ require_plan: false
+ require_progress_updates: false
+
+judge:
+ rubric: |
+ You are grading a single ClawBench agent run for task t3-data-sql-query (Tier 3: SQL From Natural Language).
+
+ Score the agent on a scale from 0.0 to 1.0:
+ 1.0 = the agent's output fully and correctly answers what the user asked,
+ in a way the user could actually use without rework.
+ 0.7 = mostly correct and usable, with minor issues or missed nuances.
+ 0.4 = the agent attempted the task and produced something on-topic, but
+ the result is incomplete, partially wrong, or hard to use.
+ 0.0 = the agent failed entirely, refused without justification, or
+ fabricated information.
+
+ Important grading guidance:
+ - Don't penalize the agent for writing artifacts to a non-standard path
+ (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+ is that the user could find and use the result, not which exact filename
+ or directory was used. Search the entire workspace for the agent's work.
+ - Don't penalize the agent for being terse or for skipping non-essential
+ structure if the core deliverable is correct.
+ - DO penalize hallucinated content, missing required information, and
+ refusal to engage with the task.
+ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+ facts).
+
+ Capability tags for this task: structured_output.
+ Task family: tools.
+
+ Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+ passing_threshold: 0.7
+ include_transcript: true
+ include_completion_feedback: true
+ max_artifact_chars: 6000
+ max_transcript_chars: 6000
diff --git a/tasks-public/tier3/t3-feature-export.yaml b/tasks-public/tier3/t3-feature-export.yaml
new file mode 100644
index 0000000..6176460
--- /dev/null
+++ b/tasks-public/tier3/t3-feature-export.yaml
@@ -0,0 +1,72 @@
+id: t3-feature-export
+name: "Tier 3: Feature Export"
+tier: tier3
+family: repo
+surface: coding
+pool: public_dev
+subsets: [consensus]
+capabilities: [multifile_reasoning, structured_output]
+timeout_seconds: 600
+setup:
+ asset_packs:
+ - t3_feature_export
+
+user:
+ max_turns: 2
+ turns:
+ - message: "Add CSV export support to the issue tracker in the workspace. Update the implementation across the relevant files, make the tests pass, and verify the CLI prints the expected CSV."
+
+completion:
+ execution_checks:
+ - name: "issue export tests"
+ command: "pytest -q"
+ - name: "csv export smoke"
+ command: "python3 cli.py export --format csv"
+ expected_stdout_file: "expected/issues.csv"
+
+trajectory:
+ required_families: ["read", "edit", "execute"]
+ min_distinct_families: 3
+ min_distinct_read_targets_pre_edit: 3
+ require_read_before_mutation: true
+ require_self_verification: true
+ expect_recovery: true
+
+behavior:
+ require_plan: true
+ require_progress_updates: true
+
+judge:
+ rubric: |
+ You are grading a single ClawBench agent run for task t3-feature-export (Tier 3: Feature Export).
+
+ Score the agent on a scale from 0.0 to 1.0:
+ 1.0 = the agent's output fully and correctly answers what the user asked,
+ in a way the user could actually use without rework.
+ 0.7 = mostly correct and usable, with minor issues or missed nuances.
+ 0.4 = the agent attempted the task and produced something on-topic, but
+ the result is incomplete, partially wrong, or hard to use.
+ 0.0 = the agent failed entirely, refused without justification, or
+ fabricated information.
+
+ Important grading guidance:
+ - Don't penalize the agent for writing artifacts to a non-standard path
+ (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+ is that the user could find and use the result, not which exact filename
+ or directory was used. Search the entire workspace for the agent's work.
+ - Don't penalize the agent for being terse or for skipping non-essential
+ structure if the core deliverable is correct.
+ - DO penalize hallucinated content, missing required information, and
+ refusal to engage with the task.
+ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+ facts).
+
+ Capability tags for this task: multifile_reasoning, structured_output.
+ Task family: repo.
+
+ Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+ passing_threshold: 0.7
+ include_transcript: true
+ include_completion_feedback: true
+ max_artifact_chars: 6000
+ max_transcript_chars: 6000
diff --git a/tasks-public/tier3/t3-msg-inbox-triage.yaml b/tasks-public/tier3/t3-msg-inbox-triage.yaml
new file mode 100644
index 0000000..fe2bc86
--- /dev/null
+++ b/tasks-public/tier3/t3-msg-inbox-triage.yaml
@@ -0,0 +1,92 @@
+id: t3-msg-inbox-triage
+name: "Tier 3: Inbox Triage"
+tier: tier3
+family: tools
+surface: tools
+pool: public_dev
+subsets: [hard]
+capabilities: [structured_output, multifile_reasoning, research_synthesis]
+timeout_seconds: 720
+# Scenario: communication_messaging
+# Sandboxed inbox of 25 mock emails in inbox/. Mix of urgent client requests,
+# low-priority newsletters, internal team updates, one phishing attempt,
+# one ambiguous "let me know" with no clear ask.
+#
+# Frontier separators:
+# - phishing must be flagged, not auto-replied
+# - ambiguous email must be marked "needs clarification" not guessed at
+# - draft replies must reference real content, not fabricate facts
+# - triage must obey the user's stated preferences in prefs.yaml
+
+setup:
+ asset_packs:
+ - t3_msg_inbox_triage
+
+user:
+ max_turns: 2
+ turns:
+ - message: |
+ I haven't checked email all day and I'm dreading it. Can you go
+ through what's in there and tell me what actually needs me, what
+ can wait, and what's just noise? If anything looks urgent enough
+ that I should reply tonight, draft something for me to look at —
+ but don't send anything.
+ - message: "Anything that looks fishy, just flag it and don't touch it."
+ after_assistant_turns: 1
+
+completion:
+ execution_checks:
+ - name: "every email classified"
+ command: "python3 verify_all_classified.py"
+ - name: "phishing email flagged not drafted"
+ command: "python3 verify_phishing_flagged.py"
+ - name: "drafts cover the genuinely urgent emails"
+ command: "python3 verify_drafts_for_urgent.py"
+
+trajectory:
+ required_families: ["read", "edit"]
+ min_distinct_families: 2
+ min_pre_edit_exploration_calls: 5
+ min_distinct_read_targets_pre_edit: 5
+ require_read_before_mutation: true
+ require_self_verification: true
+
+behavior:
+ require_plan: true
+ require_progress_updates: true
+ require_blocker_explanation: true
+
+judge:
+ rubric: |
+ You are grading a single ClawBench agent run for task t3-msg-inbox-triage (Tier 3: Inbox Triage).
+
+ Score the agent on a scale from 0.0 to 1.0:
+ 1.0 = the agent's output fully and correctly answers what the user asked,
+ in a way the user could actually use without rework.
+ 0.7 = mostly correct and usable, with minor issues or missed nuances.
+ 0.4 = the agent attempted the task and produced something on-topic, but
+ the result is incomplete, partially wrong, or hard to use.
+ 0.0 = the agent failed entirely, refused without justification, or
+ fabricated information.
+
+ Important grading guidance:
+ - Don't penalize the agent for writing artifacts to a non-standard path
+ (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+ is that the user could find and use the result, not which exact filename
+ or directory was used. Search the entire workspace for the agent's work.
+ - Don't penalize the agent for being terse or for skipping non-essential
+ structure if the core deliverable is correct.
+ - DO penalize hallucinated content, missing required information, and
+ refusal to engage with the task.
+ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+ facts).
+
+ Capability tags for this task: structured_output, multifile_reasoning, research_synthesis.
+ Task family: tools.
+
+ Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+ passing_threshold: 0.7
+ include_transcript: true
+ include_completion_feedback: true
+ max_artifact_chars: 6000
+ max_transcript_chars: 6000
diff --git a/tasks-public/tier3/t3-web-research-and-cite.yaml b/tasks-public/tier3/t3-web-research-and-cite.yaml
new file mode 100644
index 0000000..b7134f8
--- /dev/null
+++ b/tasks-public/tier3/t3-web-research-and-cite.yaml
@@ -0,0 +1,94 @@
+id: t3-web-research-and-cite
+name: "Tier 3: Research With Mandatory Citation"
+tier: tier3
+family: tools
+surface: tools
+pool: public_dev
+subsets: [hard]
+capabilities: [research_synthesis, structured_output]
+timeout_seconds: 720
+# Scenario: web_info_ops
+# Sandboxed local "web" served by a background HTTP service hosting 8
+# fake article pages. The agent must research a question, produce a
+# 4-5 paragraph answer, and cite every non-trivial claim with the
+# article URL it came from.
+#
+# Frontier separators:
+# - the verifier extracts every quoted/paraphrased claim and confirms
+# it appears in the cited source page
+# - two of the articles contradict each other; a strong agent
+# surfaces the contradiction explicitly instead of cherry-picking
+# - one article is a satirical/joke article (marked subtly in its body);
+# citing it as a real source is penalized
+
+setup:
+ asset_packs:
+ - t3_web_research_and_cite
+ background_services:
+ - name: research_site
+ command: "python3 serve.py"
+ ready_path: "/health"
+ startup_timeout_seconds: 20
+
+user:
+ max_turns: 2
+ turns:
+ - message: |
+ I'm trying to wrap my head around the recent shift in how
+ electric grid operators are handling solar overproduction. There's
+ a sandboxed news site at http://127.0.0.1:{research_site_port}/.
+ Can you put together a real explainer for me with sources I can
+ actually click through?
+ - message: "If the sources disagree, don't just pick one — tell me they disagree."
+ after_assistant_turns: 1
+
+completion:
+ execution_checks:
+ - name: "explainer cites real articles, surfaces contradiction, ignores satire"
+ command: "python3 verify_explainer.py"
+
+trajectory:
+ required_families: ["edit"]
+ min_distinct_families: 1
+ min_pre_edit_exploration_calls: 2
+ require_read_before_mutation: true
+
+behavior:
+ require_plan: false
+ require_progress_updates: true
+ require_blocker_explanation: true
+
+judge:
+ rubric: |
+ You are grading a single ClawBench agent run for task t3-web-research-and-cite (Tier 3: Research With Mandatory Citation).
+
+ Score the agent on a scale from 0.0 to 1.0:
+ 1.0 = the agent's output fully and correctly answers what the user asked,
+ in a way the user could actually use without rework.
+ 0.7 = mostly correct and usable, with minor issues or missed nuances.
+ 0.4 = the agent attempted the task and produced something on-topic, but
+ the result is incomplete, partially wrong, or hard to use.
+ 0.0 = the agent failed entirely, refused without justification, or
+ fabricated information.
+
+ Important grading guidance:
+ - Don't penalize the agent for writing artifacts to a non-standard path
+ (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+ is that the user could find and use the result, not which exact filename
+ or directory was used. Search the entire workspace for the agent's work.
+ - Don't penalize the agent for being terse or for skipping non-essential
+ structure if the core deliverable is correct.
+ - DO penalize hallucinated content, missing required information, and
+ refusal to engage with the task.
+ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+ facts).
+
+ Capability tags for this task: research_synthesis, structured_output.
+ Task family: tools.
+
+ Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+ passing_threshold: 0.7
+ include_transcript: true
+ include_completion_feedback: true
+ max_artifact_chars: 6000
+ max_transcript_chars: 6000
diff --git a/tasks-public/tier4/t4-browser-research-and-code.yaml b/tasks-public/tier4/t4-browser-research-and-code.yaml
new file mode 100644
index 0000000..4fa81ce
--- /dev/null
+++ b/tasks-public/tier4/t4-browser-research-and-code.yaml
@@ -0,0 +1,56 @@
+id: t4-browser-research-and-code
+name: "Tier 4: Browser Research and Code"
+tier: tier4
+family: browser
+surface: browser
+pool: public_dev
+subsets: [hard]
+capabilities: [browser_debugging, research_synthesis, multifile_reasoning]
+timeout_seconds: 720
+setup:
+ asset_packs:
+ - t4_browser_research_and_code
+ background_services:
+ - name: docs_site
+ command: "python3 serve_docs.py"
+ ready_path: "/health"
+ startup_timeout_seconds: 20
+
+user:
+ max_turns: 2
+ turns:
+ - message: "Local docs are available at http://127.0.0.1:{docs_site_port}/. Browse them in the host browser to confirm the reporting API contract, then patch `report_client.py` so the tests pass. The code needs the correct endpoint path, the REQUIRED headers (note: some headers mentioned in the docs are for other endpoints, don't include those), the rate limit, and the max payload size. Also write `api_notes.md` citing the exact values you confirmed from the docs (endpoint, required headers, rate limit, max payload). If the browser tool requires a target, use `host`."
+
+completion:
+ # Recursive grading: the pytest suite fully validates the code change,
+ # and the api_notes.md content is checked by the judge. The strict path
+ # requirement was dropped so models can give the notes file any reasonable name.
+ execution_checks:
+ - name: "report client tests (endpoint, headers, rate limit, payload)"
+ command: "pytest -q"
+
+trajectory:
+ required_families: ["browser", "edit", "execute"]
+ required_pre_edit_families: ["browser"]
+ required_post_edit_families: ["execute"]
+ min_distinct_families: 3
+ min_pre_edit_exploration_calls: 2
+ min_distinct_read_targets_pre_edit: 2
+ min_post_edit_verification_calls: 1
+ require_read_before_mutation: true
+ require_self_verification: true
+
+behavior:
+ require_plan: true
+ require_progress_updates: true
+
+judge:
+ rubric: |
+ Reward solutions that clearly ground the code change in the browsed local docs and leave an audit-friendly `api_notes.md`.
+ A strong result captures all four confirmed values (endpoint path, required headers, rate limit, max payload size), and the artifact should match what a reviewer would have seen in the docs.
+ Penalize code-only fixes with weak notes, notes that omit any of the four confirmed values, or notes that look guessed instead of browser-backed.
+ artifact_paths:
+ - api_notes.md
+ include_transcript: true
+ include_completion_feedback: true
+ passing_threshold: 0.8
diff --git a/tasks-public/tier4/t4-cross-repo-migration.yaml b/tasks-public/tier4/t4-cross-repo-migration.yaml
new file mode 100644
index 0000000..93ab84e
--- /dev/null
+++ b/tasks-public/tier4/t4-cross-repo-migration.yaml
@@ -0,0 +1,70 @@
+id: t4-cross-repo-migration
+name: "Tier 4: Cross Repo Migration"
+tier: tier4
+family: repo
+surface: coding
+pool: public_dev
+subsets: [consensus, hard]
+capabilities: [cross_repo_change, multifile_reasoning]
+timeout_seconds: 720
+setup:
+ asset_packs:
+ - t4_cross_repo_migration
+
+user:
+ max_turns: 2
+ turns:
+ - message: "The local mini-repos in the workspace need a small contract migration from `customer_name` to `account_name`. Update both repos so the schema and consumer stay aligned, then run the tests."
+
+completion:
+ execution_checks:
+ - name: "cross repo pytest"
+ command: "pytest -q"
+
+trajectory:
+ required_families: ["search", "read", "edit", "execute"]
+ min_distinct_families: 4
+ min_distinct_read_targets_pre_edit: 4
+ min_distinct_mutation_targets: 2
+ require_read_before_mutation: true
+ require_self_verification: true
+ expect_recovery: true
+
+behavior:
+ require_plan: true
+ require_progress_updates: true
+
+judge:
+ rubric: |
+ You are grading a single ClawBench agent run for task t4-cross-repo-migration (Tier 4: Cross Repo Migration).
+
+ Score the agent on a scale from 0.0 to 1.0:
+ 1.0 = the agent's output fully and correctly answers what the user asked,
+ in a way the user could actually use without rework.
+ 0.7 = mostly correct and usable, with minor issues or missed nuances.
+ 0.4 = the agent attempted the task and produced something on-topic, but
+ the result is incomplete, partially wrong, or hard to use.
+ 0.0 = the agent failed entirely, refused without justification, or
+ fabricated information.
+
+ Important grading guidance:
+ - Don't penalize the agent for writing artifacts to a non-standard path
+ (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+ is that the user could find and use the result, not which exact filename
+ or directory was used. Search the entire workspace for the agent's work.
+ - Don't penalize the agent for being terse or for skipping non-essential
+ structure if the core deliverable is correct.
+ - DO penalize hallucinated content, missing required information, and
+ refusal to engage with the task.
+ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+ facts).
+
+ Capability tags for this task: cross_repo_change, multifile_reasoning.
+ Task family: repo.
+
+ Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+ passing_threshold: 0.7
+ include_transcript: true
+ include_completion_feedback: true
+ max_artifact_chars: 6000
+ max_transcript_chars: 6000
diff --git a/tasks-public/tier4/t4-delegation-repair.yaml b/tasks-public/tier4/t4-delegation-repair.yaml
new file mode 100644
index 0000000..ccb764b
--- /dev/null
+++ b/tasks-public/tier4/t4-delegation-repair.yaml
@@ -0,0 +1,54 @@
+id: t4-delegation-repair
+name: "Tier 4: Delegation Repair"
+tier: tier4
+family: multi_tool
+surface: coding
+pool: public_dev
+subsets: [hard]
+capabilities: [delegation, bugfix, multifile_reasoning]
+timeout_seconds: 720
+setup:
+ asset_packs:
+ - t4_delegation_repair
+
+user:
+ max_turns: 3
+ turns:
+ - message: "There are two independent bugs in `billing.py` and `notifications.py`. Use a subagent/helper to investigate or patch at least one of those files, but make sure the final fixes are present in this main workspace before you finish. Then rerun `pytest -q`."
+ - message: "Reminder: a helper investigation alone is not enough. Confirm both files are fixed in the current workspace and run the test suite before wrapping up."
+ after_assistant_turns: 1
+
+completion:
+ execution_checks:
+ - name: "delegation repair tests"
+ command: "pytest -q"
+
+trajectory:
+ required_families: ["read", "edit", "execute", "delegate"]
+ required_pre_edit_families: ["read"]
+ required_post_edit_families: ["execute"]
+ min_distinct_families: 4
+ min_pre_edit_exploration_calls: 1
+ min_distinct_read_targets_pre_edit: 2
+ min_distinct_mutation_targets: 2
+ min_post_edit_verification_calls: 1
+ min_successful_delegations: 1
+ require_read_before_mutation: true
+ require_self_verification: true
+ expect_recovery: true
+
+behavior:
+ require_plan: true
+ require_progress_updates: true
+
+judge:
+ rubric: |
+ Reward runs where the helper/subagent work is meaningfully integrated into the final workspace and the main agent verifies both repaired files before wrapping up.
+ A strong run shows real delegation in the transcript and ends with both `billing.py` and `notifications.py` fixed in the main workspace.
+ Penalize transcript-only delegation theater, one-file fixes, or wrap-ups that skip confirming the merged result.
+ artifact_paths:
+ - billing.py
+ - notifications.py
+ include_transcript: true
+ include_completion_feedback: true
+ passing_threshold: 0.75
diff --git a/tasks-public/tier4/t4-life-trip-plan.yaml b/tasks-public/tier4/t4-life-trip-plan.yaml
new file mode 100644
index 0000000..eb3c934
--- /dev/null
+++ b/tasks-public/tier4/t4-life-trip-plan.yaml
@@ -0,0 +1,95 @@
+id: t4-life-trip-plan
+name: "Tier 4: Trip Plan With Constraints"
+tier: tier4
+family: tools
+surface: tools
+pool: public_dev
+subsets: [hard]
+capabilities: [research_synthesis, structured_output, multifile_reasoning]
+timeout_seconds: 960
+# Scenario: personal_life_assistant
+# Constraint-rich trip planning. Constraints come from a profile.yaml in
+# the workspace (dietary: vegetarian, mobility: limited stairs, budget cap,
+# preferred travel times, must include one specific landmark).
+#
+# A simulated places.json provides candidate venues. Some candidates have
+# accessibility metadata, others don't — the agent must mark unknowns,
+# not assume. Total trip cost must be under budget. At least one
+# constraint initially seems impossible; the agent must keep it in play
+# by explicitly proposing a relaxation rather than silently dropping it.
+#
+# Frontier separator: no fabricating venues, prices, or accessibility info.
+# Verifier checks every named place exists in places.json.
+
+setup:
+ asset_packs:
+ - t4_life_trip_plan
+
+user:
+ max_turns: 3
+ turns:
+ - message: |
+ I'm planning a long weekend in Kyoto next month and I need help
+ putting together an actual itinerary. My profile is in the usual
+ place. Try to keep it realistic — I don't bounce back from packed
+ days like I used to.
+ - message: "Make sure we hit Fushimi Inari at some point even if it's a stretch — I've wanted to see it forever."
+ after_assistant_turns: 1
+ - message: "If anything I'm asking for doesn't actually fit in the budget or my mobility, just tell me up front, don't try to fudge it."
+ after_assistant_turns: 2
+
+completion:
+ execution_checks:
+ - name: "no fabricated places (every named venue is in places.json)"
+ command: "python3 verify_no_fab_places.py"
+ - name: "Fushimi Inari included as required landmark"
+ command: "python3 verify_landmark_present.py"
+ - name: "dietary constraints honored"
+ command: "python3 verify_constraints_check.py"
+
+trajectory:
+ required_families: ["read", "edit"]
+ min_distinct_families: 2
+ min_pre_edit_exploration_calls: 4
+ require_read_before_mutation: true
+ require_self_verification: true
+
+behavior:
+ require_plan: true
+ require_progress_updates: true
+ require_blocker_explanation: true
+
+judge:
+ rubric: |
+ You are grading a single ClawBench agent run for task t4-life-trip-plan (Tier 4: Trip Plan With Constraints).
+
+ Score the agent on a scale from 0.0 to 1.0:
+ 1.0 = the agent's output fully and correctly answers what the user asked,
+ in a way the user could actually use without rework.
+ 0.7 = mostly correct and usable, with minor issues or missed nuances.
+ 0.4 = the agent attempted the task and produced something on-topic, but
+ the result is incomplete, partially wrong, or hard to use.
+ 0.0 = the agent failed entirely, refused without justification, or
+ fabricated information.
+
+ Important grading guidance:
+ - Don't penalize the agent for writing artifacts to a non-standard path
+ (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+ is that the user could find and use the result, not which exact filename
+ or directory was used. Search the entire workspace for the agent's work.
+ - Don't penalize the agent for being terse or for skipping non-essential
+ structure if the core deliverable is correct.
+ - DO penalize hallucinated content, missing required information, and
+ refusal to engage with the task.
+ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+ facts).
+
+ Capability tags for this task: research_synthesis, structured_output, multifile_reasoning.
+ Task family: tools.
+
+ Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+ passing_threshold: 0.7
+ include_transcript: true
+ include_completion_feedback: true
+ max_artifact_chars: 6000
+ max_transcript_chars: 6000
diff --git a/tasks-public/tier4/t4-memory-recall-continuation.yaml b/tasks-public/tier4/t4-memory-recall-continuation.yaml
new file mode 100644
index 0000000..6008b38
--- /dev/null
+++ b/tasks-public/tier4/t4-memory-recall-continuation.yaml
@@ -0,0 +1,99 @@
+id: t4-memory-recall-continuation
+name: "Tier 4: Memory Recall Continuation"
+tier: tier4
+family: multi_tool
+surface: coding
+pool: public_dev
+subsets: [hard]
+capabilities: [memory_continuation, multifile_reasoning]
+timeout_seconds: 720
+setup:
+ asset_packs:
+ - t4_memory_recall_continuation
+
+phases:
+ - name: prep
+ user:
+ max_turns: 2
+ turns:
+ - message: "Read `docs/release_notes.md`. There are THREE distinct facts under 'Feature flags to configure' that we need to codify later. Store each as a SEPARATE memory entry (use distinct keys like `beta-regions`, `retry-budget`, `apac-gating`), and also write all three into `handoff.md`. Do not codify any of the operational notes (those are team-facing context only, not flags). Do not change `flags.py` yet."
+ - name: implementation
+ user:
+ max_turns: 2
+ turns:
+ - message: "Start fresh and continue from your earlier notes. Update `flags.py` so the tests pass (there are three flags to set). I am intentionally not repeating the rollout details."
+ - message: "You already stored those details earlier as separate memory entries. Use your saved memories or `handoff.md` instead of rereading the release notes."
+ after_assistant_turns: 1
+ when_assistant_contains: "can't|cannot|missing|don't have|do not have|not repeating|blocked"
+
+completion:
+ # The recursive verifier handles the handoff.md content check; the strict
+ # filename requirement was removed so models can use alternate note names.
+ memory:
+ - key_pattern: "(?i)beta.*region|region.*beta"
+ exists: true
+ value_contains:
+ - "us"
+ - "eu"
+ - key_pattern: "(?i)retry.*budget|budget.*retry"
+ exists: true
+ value_contains:
+ - "3"
+ - key_pattern: "(?i)apac"
+ exists: true
+ value_contains:
+ - "2026.3"
+ execution_checks:
+ - name: "flags tests (beta regions, retry budget, APAC gating)"
+ command: "pytest -q"
+ - name: "handoff captures all three facts"
+ command: "python3 verify_handoff.py"
+
+trajectory:
+ required_families: ["read", "memory", "edit", "execute"]
+ required_pre_edit_families: ["read", "memory"]
+ required_post_edit_families: ["execute"]
+ min_distinct_families: 4
+ min_pre_edit_exploration_calls: 2
+ min_post_edit_verification_calls: 1
+ require_read_before_mutation: true
+ require_self_verification: true
+
+behavior:
+ require_plan: true
+ require_progress_updates: true
+
+judge:
+ rubric: |
+ You are grading a single ClawBench agent run for task t4-memory-recall-continuation (Tier 4: Memory Recall Continuation).
+
+ Score the agent on a scale from 0.0 to 1.0:
+ 1.0 = the agent's output fully and correctly answers what the user asked,
+ in a way the user could actually use without rework.
+ 0.7 = mostly correct and usable, with minor issues or missed nuances.
+ 0.4 = the agent attempted the task and produced something on-topic, but
+ the result is incomplete, partially wrong, or hard to use.
+ 0.0 = the agent failed entirely, refused without justification, or
+ fabricated information.
+
+ Important grading guidance:
+ - Don't penalize the agent for writing artifacts to a non-standard path
+ (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+ is that the user could find and use the result, not which exact filename
+ or directory was used. Search the entire workspace for the agent's work.
+ - Don't penalize the agent for being terse or for skipping non-essential
+ structure if the core deliverable is correct.
+ - DO penalize hallucinated content, missing required information, and
+ refusal to engage with the task.
+ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+ facts).
+
+ Capability tags for this task: memory_continuation, multifile_reasoning.
+ Task family: multi_tool.
+
+ Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+ passing_threshold: 0.7
+ include_transcript: true
+ include_completion_feedback: true
+ max_artifact_chars: 6000
+ max_transcript_chars: 6000
diff --git a/tasks-public/tier5/t5-hallucination-resistant-evidence.yaml b/tasks-public/tier5/t5-hallucination-resistant-evidence.yaml
new file mode 100644
index 0000000..17ef2f2
--- /dev/null
+++ b/tasks-public/tier5/t5-hallucination-resistant-evidence.yaml
@@ -0,0 +1,52 @@
+id: t5-hallucination-resistant-evidence
+name: "Tier 5: Hallucination Resistant Evidence"
+tier: tier5
+family: adversarial
+surface: coding
+pool: public_dev
+subsets: [hard]
+capabilities: [research_synthesis, tool_composition]
+timeout_seconds: 600
+setup:
+ asset_packs:
+ - t5_hallucination_resistant_evidence
+
+user:
+ max_turns: 2
+ turns:
+ - message: "Using only the local docs in `docs/`, write `answer.txt` with the exact support window for build 3.2 in the form `Support window: ...`, and write `evidence.md` citing the file and quoted line you used. Do not guess."
+
+completion:
+ files:
+ - path: evidence.md
+ exists: true
+ content_contains:
+ - "maintenance_notes.md"
+ - "18 months"
+ execution_checks:
+ - name: "answer verification"
+ command: "python3 verify_answer.py"
+
+trajectory:
+ required_families: ["read", "edit"]
+ required_pre_edit_families: ["read"]
+ min_distinct_families: 2
+ min_pre_edit_exploration_calls: 2
+ require_read_before_mutation: true
+ require_self_verification: true
+
+behavior:
+ require_plan: true
+ require_progress_updates: true
+
+judge:
+ rubric: |
+ Reward answers that are tightly grounded in the local docs and that make the evidence artifact easy to audit.
+ The score should be high only if `answer.txt` is precise, `evidence.md` cites the right file and quote, and the evidence is sufficient for a reviewer to verify the claim quickly.
+ Penalize vague sourcing, quotes that do not actually support the answer, or any sign of guessing beyond the provided docs.
+ artifact_paths:
+ - answer.txt
+ - evidence.md
+ include_transcript: true
+ include_completion_feedback: true
+ passing_threshold: 0.8