diff --git a/tasks-public/MANIFEST.yaml b/tasks-public/MANIFEST.yaml
new file mode 100644
index 0000000..e6ba69b
--- /dev/null
+++ b/tasks-public/MANIFEST.yaml
@@ -0,0 +1,220 @@
+manifest_version: 1
+release: clawbench-core-v1
+release_date: '2026-04-20'  # quoted so loaders keep a string instead of an implicit date
+benchmark_version: 0.4.0.dev1
+task_count: 19
+source_sweep: v2026-4-19-full
+openclaw_version: 2026.4.15-beta.1
+
+description: |
+  ClawBench Core v1 — a curated subset of 19 tasks from the internal
+  40-task ClawBench dev pool. Selected so that:
+    (a) all 8 measured frontier models produce the established ranking
+        order in the v2026-4-19-full sweep,
+    (b) coverage is preserved across tiers (1–5) and task families
+        (tools, coding, repo, browser, multi_tool, adversarial),
+    (c) tasks with broken verifiers or near-zero cross-model SNR are
+        dropped.
+
+  Verification: mean run_score across these 19 tasks reproduces the
+  reference ranking with 0 inversions and min adjacent-rank gap of
+  0.0049 (well above the ~0.002 seed-noise floor).
+
+established_ranking:  # ordered best-first by mean run_score over the 19 Core tasks
+  - rank: 1
+    model: anthropic/claude-opus-4-6
+    display: Claude Opus 4.6
+    score: 0.8137
+  - rank: 2
+    model: anthropic/claude-opus-4-7
+    display: Claude Opus 4.7
+    score: 0.7824
+  - rank: 3
+    model: openai/gpt-5.4
+    display: GPT 5.4
+    score: 0.7647
+  - rank: 4
+    model: anthropic/claude-sonnet-4-6
+    display: Claude Sonnet 4.6
+    score: 0.7597
+  - rank: 5
+    model: openrouter/minimax/minimax-m2.7
+    display: MiniMax M2.7
+    score: 0.7475
+  - rank: 6
+    model: google/gemini-3.1-pro-preview
+    display: Gemini 3.1 Pro
+    score: 0.7408
+  - rank: 7
+    model: openrouter/qwen/qwen3.6-plus
+    display: Qwen 3.6 Plus
+    score: 0.7030
+  - rank: 8
+    model: openrouter/moonshotai/kimi-k2.5
+    display: Kimi K2.5
+    score: 0.6800
+
+coverage:
+  tiers:
+    tier1: 2
+    tier2: 6
+    tier3: 5
+    tier4: 5
+    tier5: 1
+  families:
+    tools: 8
+    coding: 2
+    repo: 3
+    browser: 2
+    multi_tool: 3
+    adversarial: 1
+    # Each task is counted once, under the tier/family given in the tasks list below.
+
+tasks:  # path is relative to tasks-public/ (see README layout); asset_pack names a directory under assets/
+  - id: t1-bugfix-discount
+    tier: tier1
+    family: coding
+    capabilities: [bugfix]
+    path: tier1/t1-bugfix-discount.yaml
+    asset_pack: t1_bugfix_discount
+
+  - id: t1-fs-quick-note
+    tier: tier1
+    family: tools
+    capabilities: [structured_output]
+    path: tier1/t1-fs-quick-note.yaml
+    asset_pack: t1_fs_quick_note
+
+  - id: t2-add-tests-normalizer
+    tier: tier2
+    family: coding
+    capabilities: [test_authoring]
+    path: tier2/t2-add-tests-normalizer.yaml
+    asset_pack: t2_add_tests_normalizer
+
+  - id: t2-browser-form-fix
+    tier: tier2
+    family: browser
+    capabilities: [browser_debugging, bugfix]
+    path: tier2/t2-browser-form-fix.yaml
+    asset_pack: t2_browser_form_fix
+
+  - id: t2-config-loader
+    tier: tier2
+    family: repo
+    capabilities: [bugfix, multifile_reasoning]
+    path: tier2/t2-config-loader.yaml
+    asset_pack: t2_config_loader
+
+  - id: t2-fs-find-that-thing
+    tier: tier2
+    family: tools
+    capabilities: [structured_output]
+    path: tier2/t2-fs-find-that-thing.yaml
+    asset_pack: t2_fs_find_that_thing
+
+  - id: t2-msg-summarize-thread
+    tier: tier2
+    family: tools
+    capabilities: [research_synthesis, structured_output]
+    path: tier2/t2-msg-summarize-thread.yaml
+    asset_pack: t2_msg_summarize_thread
+
+  - id: t2-priv-redact-doc
+    tier: tier2
+    family: tools
+    capabilities: [structured_output, graceful_refusal]
+    path: tier2/t2-priv-redact-doc.yaml
+    asset_pack: t2_priv_redact_doc
+
+  - id: t3-data-pipeline-report
+    tier: tier3
+    family: multi_tool
+    capabilities: [structured_output, multifile_reasoning]
+    path: tier3/t3-data-pipeline-report.yaml
+    asset_pack: t3_data_pipeline_report
+
+  - id: t3-data-sql-query
+    tier: tier3
+    family: tools
+    capabilities: [structured_output]
+    path: tier3/t3-data-sql-query.yaml
+    asset_pack: t3_data_sql_query
+
+  - id: t3-feature-export
+    tier: tier3
+    family: repo
+    capabilities: [multifile_reasoning, structured_output]
+    path: tier3/t3-feature-export.yaml
+    asset_pack: t3_feature_export
+
+  - id: t3-msg-inbox-triage
+    tier: tier3
+    family: tools
+    capabilities: [structured_output, multifile_reasoning]
+    path: tier3/t3-msg-inbox-triage.yaml
+    asset_pack: t3_msg_inbox_triage
+
+  - id: t3-web-research-and-cite
+    tier: tier3
+    family: tools
+    capabilities: [research_synthesis]
+    path: tier3/t3-web-research-and-cite.yaml
+    asset_pack: t3_web_research_and_cite
+
+  - id: t4-browser-research-and-code
+    tier: tier4
+    family: browser
+    capabilities: [browser_debugging, research_synthesis]
+    path: tier4/t4-browser-research-and-code.yaml
+    asset_pack: t4_browser_research_and_code
+
+  - id: t4-cross-repo-migration
+    tier: tier4
+    family: repo
+    capabilities: [cross_repo_change, multifile_reasoning]
+    path: tier4/t4-cross-repo-migration.yaml
+    asset_pack: t4_cross_repo_migration
+
+  - id: t4-delegation-repair
+    tier: tier4
+    family: multi_tool
+    capabilities: [delegation, bugfix]
+    path: tier4/t4-delegation-repair.yaml
+    asset_pack: t4_delegation_repair
+
+  - id: t4-life-trip-plan
+    tier: tier4
+    family: tools
+    capabilities: [research_synthesis, structured_output]
+    path: tier4/t4-life-trip-plan.yaml
+    asset_pack: t4_life_trip_plan
+
+  - id: t4-memory-recall-continuation
+    tier: tier4
+    family: multi_tool
+    capabilities: [memory_continuation, multifile_reasoning]
+    path: tier4/t4-memory-recall-continuation.yaml
+    asset_pack: t4_memory_recall_continuation
+
+  - id: t5-hallucination-resistant-evidence
+    tier: tier5
+    family: adversarial
+    capabilities: [research_synthesis, tool_composition]
+    path: tier5/t5-hallucination-resistant-evidence.yaml
+    asset_pack: t5_hallucination_resistant_evidence
+
+notes: |
+  - The full private dev set (tasks/) contains 40 tasks. This Core-19
+    subset is the signal-rich, ranking-consistent public release.
+  - The remaining 21 tasks are retained as a private holdout for
+    contamination-resistant measurement of future models.
+  - Task families "creative" and "long-horizon (Tier 6)" are absent
+    from Core v1; planned for a future release.
+  - Known caveats: t4-memory-recall-continuation has a verifier that
+    penalizes agents that respond in conversation rather than via file
+    artifacts. All models face the same verifier, so the comparison is
+    internally fair, but absolute scores understate capability.
+  - t5-hallucination-resistant-evidence has low cross-model SNR (about
+    0.25) in v2026-4-19-full; included for adversarial-family coverage
+    despite this. Consider upgrading verifier in a future release.
diff --git a/tasks-public/README.md b/tasks-public/README.md
new file mode 100644
index 0000000..8301cd9
--- /dev/null
+++ b/tasks-public/README.md
@@ -0,0 +1,132 @@
+# ClawBench Core v1 — Public Task Set (19 tasks)
+
+A curated 19-task subset of the full ClawBench v0.4.0.dev1 dev pool,
+selected for ranking consistency and capability coverage.
+
+## What this is
+
+19 tasks, 3 runs each → 57 runs per model. About half the compute of
+the full 40-task sweep, with no loss of discriminative power on the
+measured 8-model panel.
+
+Derived from the v2026-4-19-full sweep archive by greedy task
+selection: iteratively drop tasks that either (a) introduce ranking
+inversions vs the reference ordering or (b) have near-zero cross-model
+SNR and add only noise.
+
+## Established ranking (from the v2026-4-19-full sweep)
+
+Mean run_score across the 19 tasks:
+
+| Rank | Model | Score |
+|:---:|---|:---:|
+| 1 | Claude Opus 4.6 | 0.8137 |
+| 2 | Claude Opus 4.7 | 0.7824 |
+| 3 | GPT 5.4 | 0.7647 |
+| 4 | Claude Sonnet 4.6 | 0.7597 |
+| 5 | MiniMax M2.7 | 0.7475 |
+| 6 | Gemini 3.1 Pro | 0.7408 |
+| 7 | Qwen 3.6 Plus | 0.7030 |
+| 8 | Kimi K2.5 | 0.6800 |
+
+- **0 ranking inversions** on the 19-task mean.
+- **Min adjacent-rank gap: 0.0049** (well above the ~0.002 seed-noise
+  floor estimated from inter-run variance).
+- **Top-to-bottom spread: 0.134** (vs 0.097 for smaller robust sets).
+
+## Coverage
+
+| Dimension | Breakdown |
+|---|---|
+| Tiers | T1=2, T2=6, T3=5, T4=5, T5=1 |
+| Families | tools=8, coding=2, repo=3, browser=2, multi_tool=3, adversarial=1 |
+| Capabilities | bugfix, test_authoring, multifile_reasoning, browser_debugging, structured_output, graceful_refusal, delegation, tool_composition, research_synthesis, cross_repo_change, memory_continuation |
+
+## Directory layout
+
+```
+tasks-public/
+├── MANIFEST.yaml          # Machine-readable task list + metadata
+├── README.md              # This file
+├── tier1/                 # 2 task YAMLs
+├── tier2/                 # 6 task YAMLs
+├── tier3/                 # 5 task YAMLs
+├── tier4/                 # 5 task YAMLs
+├── tier5/                 # 1 task YAML
+└── assets/                # 19 asset packs (verifier scripts + fixtures)
+```
+
+## How to run Core v1
+
+Using the ClawBench harness:
+
+```bash
+# Explicit task-by-task (pass -t for each of 19 tasks):
+clawbench run \
+  --model anthropic/claude-opus-4-6 \
+  --runs 3 \
+  --concurrency 4 \
+  --profile profiles/frontier_opus_4_6.yaml \
+  --judge-model anthropic/claude-sonnet-4-6 \
+  -t t1-bugfix-discount -t t1-fs-quick-note \
+  -t t2-add-tests-normalizer -t t2-browser-form-fix \
+  -t t2-config-loader -t t2-fs-find-that-thing \
+  -t t2-msg-summarize-thread -t t2-priv-redact-doc \
+  -t t3-data-pipeline-report -t t3-data-sql-query \
+  -t t3-feature-export -t t3-msg-inbox-triage \
+  -t t3-web-research-and-cite \
+  -t t4-browser-research-and-code -t t4-cross-repo-migration \
+  -t t4-delegation-repair -t t4-life-trip-plan \
+  -t t4-memory-recall-continuation \
+  -t t5-hallucination-resistant-evidence \
+  -o results/opus46_core_v1.json
+```
+
+Or point the harness at this directory by setting the task root in
+your ClawBench config. See MANIFEST.yaml for a programmatic list.
+
+## Reproducibility caveats
+
+- **Exact score reproduction is not guaranteed.** Even with the same
+  OpenClaw version, re-runs exhibit seed noise (~0.02 stddev per task,
+  per model). Rankings are stable; absolute scores drift within that
+  envelope.
+- **OpenRouter-routed models** (`openrouter/*`) can have their
+  scores shift if OpenRouter repoints its model slug to a different
+  underlying provider. We observed this with GLM 5.1 between
+  2026-04-20 14:00 and 17:00 PST. Pin to canonical model versions
+  (e.g. `z-ai/glm-5-turbo-20260315`) for stable measurement.
+- **OpenClaw platform version matters.** Upgrading from 4.9 → 4.15-beta.1
+  shifted scores by +0.13 to +0.29 across models. Pin via Docker tag.
+- **Judge scores** come from Claude Sonnet 4.6 via direct Anthropic
+  API (with a fallback from the gateway judge). Scores assume the
+  judge is working correctly; re-judging broken runs may be required
+  (see `scripts/rejudge_all.py` in the main repo).
+
+## What's NOT in Core v1
+
+21 tasks from the full dev pool are held back:
+- **9 ceiling tasks** (all frontier models score >0.85) — these don't
+  discriminate; future releases may phase them out.
+- **9 noise tasks** (cross-model SNR < 0.5) — either broken verifiers
+  or genuinely ambiguous prompts. Scheduled for redesign.
+- **3 ranking-breaker tasks** — tasks where the cross-model ordering
+  conflicts with the reference ranking (e.g. `t2-node-search-patch`,
+  `t5-contradictory-requirements`). Not broken per se; just
+  inconsistent with the headline.
+
+Also missing entirely from Core v1:
+- **Tier 6 long-horizon (100+ turn) tasks** — planned for v2.
+- **Creative synthesis / style-matching tasks** — planned for v2.
+- **Paraphrased prompt pairs** for perturbation-sensitivity
+  measurement — planned for v2.
+
+## Versioning
+
+| Version | Tasks | Change |
+|:---:|:---:|---|
+| Core v1 | 19 | Initial public release (this) |
+| Core v2 | ~24 | Planned: +Tier 6, +paraphrase pairs, -2 noise tasks |
+
+Pin to `clawbench-core-v1` in the MANIFEST for reproducible
+comparison across releases.
diff --git a/tasks-public/assets/t1_bugfix_discount/cart.py b/tasks-public/assets/t1_bugfix_discount/cart.py
new file mode 100644
index 0000000..627f4a5
--- /dev/null
+++ b/tasks-public/assets/t1_bugfix_discount/cart.py
@@ -0,0 +1,6 @@
+from pricing import apply_discount
+
+
+def checkout_total(subtotal: int, discount_percent: int) -> int:
+    return apply_discount(subtotal, discount_percent)
+
diff --git a/tasks-public/assets/t1_bugfix_discount/pricing.py b/tasks-public/assets/t1_bugfix_discount/pricing.py
new file mode 100644
index 0000000..448f5ce
--- /dev/null
+++ b/tasks-public/assets/t1_bugfix_discount/pricing.py
@@ -0,0 +1,4 @@
+def apply_discount(subtotal_cents: int, discount_percent: int) -> int:
+    # BUG: this subtracts the raw percent value instead of a percentage of the subtotal.
+    return subtotal_cents - discount_percent
+
diff --git a/tasks-public/assets/t1_bugfix_discount/tests/test_pricing.py b/tasks-public/assets/t1_bugfix_discount/tests/test_pricing.py
new file mode 100644
index 0000000..9f1c7c0
--- /dev/null
+++ b/tasks-public/assets/t1_bugfix_discount/tests/test_pricing.py
@@ -0,0 +1,10 @@
+from cart import checkout_total
+
+
+def test_percentage_discount_applies_to_full_subtotal():
+    assert checkout_total(2_000, 10) == 1_800
+
+
+def test_zero_discount_keeps_subtotal():
+    assert checkout_total(1_250, 0) == 1_250
+
diff --git a/tasks-public/assets/t1_fs_quick_note/notes/.gitkeep b/tasks-public/assets/t1_fs_quick_note/notes/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/tasks-public/assets/t1_fs_quick_note/verify_list_structure.py b/tasks-public/assets/t1_fs_quick_note/verify_list_structure.py
new file mode 100644
index 0000000..a86e4d2
--- /dev/null
+++ b/tasks-public/assets/t1_fs_quick_note/verify_list_structure.py
@@ -0,0 +1,57 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (  # a file is skipped when any of these substrings occurs in its path string
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",  # suffixes (compared lower-cased) that get read
+                  ".jsonl", ".html", ".sh", ".py")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        sp = str(path)
+        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+            continue
+        if path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        try:
+            yield path, path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            continue
+
+
+def workspace_blob() -> str:
+    return "\n".join(text for _, text in iter_workspace_text_files())
+
+
+import re
+
+LIST_PATTERNS = [  # a line counts as a list line when any pattern matches at its start
+    re.compile(r"^\s*[-*+]\s+"),  # bullet: -, * or +
+    re.compile(r"^\s*\d+[.)]\s+"),  # numbered: "1." or "1)"
+    re.compile(r"^\s*\[[ x]\]\s+"),  # checkbox: "[ ]" or lower-case "[x]"
+]
+
+
+def main() -> int:
+    for path, text in iter_workspace_text_files():
+        if any(t in text.lower() for t in ("dry clean", "sam", "babysit", "60")):
+            list_lines = sum(1 for line in text.splitlines() if any(p.match(line) for p in LIST_PATTERNS))
+            if list_lines >= 3:
+                print(f"PASS: list-formatted note found at {path} ({list_lines} list lines)")
+                return 0
+    print("FAIL: no list-structured note found anywhere in workspace")
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # main()'s return value becomes the process exit status
diff --git a/tasks-public/assets/t1_fs_quick_note/verify_three_items.py b/tasks-public/assets/t1_fs_quick_note/verify_three_items.py
new file mode 100644
index 0000000..50d073c
--- /dev/null
+++ b/tasks-public/assets/t1_fs_quick_note/verify_three_items.py
@@ -0,0 +1,56 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (  # a file is skipped when any of these substrings occurs in its path string
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",  # suffixes (compared lower-cased) that get read
+                  ".jsonl", ".html", ".sh", ".py")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        sp = str(path)
+        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+            continue
+        if path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        try:
+            yield path, path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            continue
+
+
+def workspace_blob() -> str:
+    return "\n".join(text for _, text in iter_workspace_text_files())
+
+
+def main() -> int:
+    blob = workspace_blob().lower()
+    if not blob:
+        print("FAIL: workspace contains no agent-written text files")
+        return 1
+    needed = ['dry clean', 'sam', 'babysit']
+    if not all(s in blob for s in needed):
+        missing = [s for s in needed if s not in blob]
+        print(f"FAIL: workspace missing required content: {missing}")
+        return 1
+    needed = ['60']
+    if not all(s in blob for s in needed):
+        missing = [s for s in needed if s not in blob]
+        print(f"FAIL: workspace missing required content: {missing}")
+        return 1
+    print("PASS: t1_fs_quick_note/verify_three_items.py")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # main()'s return value becomes the process exit status
diff --git a/tasks-public/assets/t2_add_tests_normalizer/normalizer.py b/tasks-public/assets/t2_add_tests_normalizer/normalizer.py
new file mode 100644
index 0000000..c0474e2
--- /dev/null
+++ b/tasks-public/assets/t2_add_tests_normalizer/normalizer.py
@@ -0,0 +1,14 @@
+import re
+
+EMOJI_RE = re.compile(r"[\U0001F300-\U0001FAFF]")
+
+
+def normalize_title(text: str) -> str:
+    cleaned = " ".join(text.split())
+    cleaned = EMOJI_RE.sub("", cleaned)
+    return cleaned.strip().title()
+
+
+def normalize_tags(raw: str) -> list[str]:
+    return [part.strip().lower() for part in raw.split(",") if part.strip()]
+
diff --git a/tasks-public/assets/t2_add_tests_normalizer/verify_added_tests.py b/tasks-public/assets/t2_add_tests_normalizer/verify_added_tests.py
new file mode 100644
index 0000000..94e94c9
--- /dev/null
+++ b/tasks-public/assets/t2_add_tests_normalizer/verify_added_tests.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+import subprocess
+import sys
+from pathlib import Path
+
+
+BUGGY_EMOJI = """import re
+
+EMOJI_RE = re.compile(r"[\\U0001F300-\\U0001FAFF]")
+
+
+def normalize_title(text: str) -> str:
+    cleaned = " ".join(text.split())
+    return cleaned.strip().title()
+
+
+def normalize_tags(raw: str) -> list[str]:
+    return [part.strip().lower() for part in raw.split(",") if part.strip()]
+"""  # mutant: normalize_title never calls EMOJI_RE.sub, so emoji are kept
+
+BUGGY_TAGS = """import re
+
+EMOJI_RE = re.compile(r"[\\U0001F300-\\U0001FAFF]")
+
+
+def normalize_title(text: str) -> str:
+    cleaned = " ".join(text.split())
+    cleaned = EMOJI_RE.sub("", cleaned)
+    return cleaned.strip().title()
+
+
+def normalize_tags(raw: str) -> list[str]:
+    return [part.strip().lower() for part in raw.split(",")]
+"""  # mutant: normalize_tags has no "if part.strip()" filter, so blank tags are kept
+
+
+def _run_pytest(*args: str) -> subprocess.CompletedProcess[str]:
+    return subprocess.run(
+        [sys.executable, "-m", "pytest", "-q", *args],
+        check=False,
+        capture_output=True,
+        text=True,
+    )
+
+
+def _expect_mutant_failure(normalizer_path: Path, mutant_source: str, label: str) -> None:
+    backup = normalizer_path.read_text(encoding="utf-8")
+    normalizer_path.write_text(mutant_source, encoding="utf-8")
+    try:
+        result = _run_pytest("tests/test_normalizer.py")
+        assert result.returncode != 0, f"student tests did not catch mutant: {label}"
+    finally:
+        normalizer_path.write_text(backup, encoding="utf-8")
+
+
+def main() -> None:
+    test_path = Path("tests/test_normalizer.py")
+    assert test_path.exists(), "tests/test_normalizer.py is missing"
+
+    baseline = _run_pytest()
+    assert baseline.returncode == 0, baseline.stdout + baseline.stderr
+
+    normalizer_path = Path("normalizer.py")
+    _expect_mutant_failure(normalizer_path, BUGGY_EMOJI, "emoji stripping")
+    _expect_mutant_failure(normalizer_path, BUGGY_TAGS, "blank tag handling")
+
+    source = test_path.read_text(encoding="utf-8").lower()
+    assert "normalize_title" in source
+    assert "normalize_tags" in source
+
+
+if __name__ == "__main__":
+    main()  # a failed assertion propagates as an uncaught AssertionError
diff --git a/tasks-public/assets/t2_browser_form_fix/app.js b/tasks-public/assets/t2_browser_form_fix/app.js
new file mode 100644
index 0000000..0559355
--- /dev/null
+++ b/tasks-public/assets/t2_browser_form_fix/app.js
@@ -0,0 +1,16 @@
+const form = document.getElementById("contact-formm");
+const emailInput = document.getElementById("email");
+const statusNode = document.getElementById("status");
+
+if (form) {
+  form.addEventListener("submit", (event) => {
+    event.preventDefault();
+    const email = emailInput.value.trim();
+    if (!email.includes("@")) {
+      statusNode.textContent = "Enter a valid email.";
+      return;
+    }
+    statusNode.textContent = `Saved ${email}`;
+  });
+}
+
diff --git a/tasks-public/assets/t2_browser_form_fix/index.html b/tasks-public/assets/t2_browser_form_fix/index.html
new file mode 100644
index 0000000..b1d64df
--- /dev/null
+++ b/tasks-public/assets/t2_browser_form_fix/index.html
@@ -0,0 +1,20 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <title>Newsletter Signup</title>
+    <script defer src="app.js"></script>
+  </head>
+  <body>
+    <main>
+      <h1>Join the Newsletter</h1>
+      <form id="contact-form">
+        <label for="email">Email</label>
+        <input id="email" name="email" type="email" />
+        <button id="submit-button" type="submit">Sign up</button>
+      </form>
+      <p id="status" aria-live="polite"></p>
+    </main>
+  </body>
+</html>
+
diff --git a/tasks-public/assets/t2_browser_form_fix/serve.py b/tasks-public/assets/t2_browser_form_fix/serve.py
new file mode 100644
index 0000000..9eec359
--- /dev/null
+++ b/tasks-public/assets/t2_browser_form_fix/serve.py
@@ -0,0 +1,21 @@
+from __future__ import annotations
+
+import os
+from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
+
+
+class Handler(SimpleHTTPRequestHandler):
+    def do_GET(self) -> None:  # noqa: N802
+        if self.path == "/health":
+            self.send_response(200)
+            self.end_headers()
+            self.wfile.write(b"ok")
+            return
+        return super().do_GET()
+
+
+if __name__ == "__main__":
+    port = int(os.environ.get("PORT", "8123"))
+    server = ThreadingHTTPServer(("127.0.0.1", port), Handler)
+    server.serve_forever()
+
diff --git a/tasks-public/assets/t2_browser_form_fix/verify_form.cjs b/tasks-public/assets/t2_browser_form_fix/verify_form.cjs
new file mode 100644
index 0000000..b839c61
--- /dev/null
+++ b/tasks-public/assets/t2_browser_form_fix/verify_form.cjs
@@ -0,0 +1,23 @@
+const { chromium } = require("playwright");
+
+async function main() {
+  const url = process.argv[2];
+  const browser = await chromium.launch({ headless: true });
+  const page = await browser.newPage();
+  await page.goto(url, { waitUntil: "networkidle" });
+  await page.fill("#email", "reader@example.com");
+  await page.click("#submit-button");
+  await page.waitForFunction(() => document.querySelector("#status").textContent.includes("Saved"), null, {
+    timeout: 3000,
+  });
+  const status = await page.textContent("#status");
+  await browser.close();
+  if (status.trim() !== "Saved reader@example.com") {
+    throw new Error(`Unexpected status: ${status}`);
+  }
+}
+
+main().catch((error) => {
+  console.error(error.message || String(error));
+  process.exit(1);
+});
diff --git a/tasks-public/assets/t2_config_loader/app_config.py b/tasks-public/assets/t2_config_loader/app_config.py
new file mode 100644
index 0000000..0ac5c48
--- /dev/null
+++ b/tasks-public/assets/t2_config_loader/app_config.py
@@ -0,0 +1,6 @@
+DEFAULTS = {
+    "host": "127.0.0.1",
+    "port": 8080,
+    "debug": False,
+}
+
diff --git a/tasks-public/assets/t2_config_loader/config_loader.py b/tasks-public/assets/t2_config_loader/config_loader.py
new file mode 100644
index 0000000..3c7f7c0
--- /dev/null
+++ b/tasks-public/assets/t2_config_loader/config_loader.py
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+
+from app_config import DEFAULTS
+
+
+def load_config(path: str | None = None) -> dict[str, object]:
+    config = dict(DEFAULTS)
+    if path:
+        config.update(json.loads(Path(path).read_text(encoding="utf-8")))
+    # BUG: file values incorrectly win over environment overrides.
+    if "APP_PORT" in os.environ and path:
+        config["port"] = json.loads(Path(path).read_text(encoding="utf-8")).get("port", DEFAULTS["port"])
+    if "APP_DEBUG" in os.environ:
+        config["debug"] = os.environ["APP_DEBUG"]
+    return config
+
diff --git a/tasks-public/assets/t2_config_loader/tests/test_config_loader.py b/tasks-public/assets/t2_config_loader/tests/test_config_loader.py
new file mode 100644
index 0000000..b227ce5
--- /dev/null
+++ b/tasks-public/assets/t2_config_loader/tests/test_config_loader.py
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+import json
+
+from config_loader import load_config
+
+
+def test_env_port_overrides_file(tmp_path, monkeypatch):
+    config_path = tmp_path / "config.json"
+    config_path.write_text(json.dumps({"port": 9000, "debug": False}), encoding="utf-8")
+    monkeypatch.setenv("APP_PORT", "9200")
+    cfg = load_config(str(config_path))
+    assert cfg["port"] == 9200
+
+
+def test_debug_flag_is_boolean(monkeypatch):
+    monkeypatch.setenv("APP_DEBUG", "true")
+    cfg = load_config(None)
+    assert cfg["debug"] is True
+
diff --git a/tasks-public/assets/t2_fs_find_that_thing/.correct_filename.txt b/tasks-public/assets/t2_fs_find_that_thing/.correct_filename.txt
new file mode 100644
index 0000000..edc85c6
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/.correct_filename.txt
@@ -0,0 +1 @@
+q3_marketing_budget_v3.xlsx
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_1.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_1.txt
new file mode 100644
index 0000000..6aba593
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_1.txt
@@ -0,0 +1 @@
+filler 1
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_10.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_10.txt
new file mode 100644
index 0000000..9818d50
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_10.txt
@@ -0,0 +1 @@
+filler 10
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_11.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_11.txt
new file mode 100644
index 0000000..22c8f8d
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_11.txt
@@ -0,0 +1 @@
+filler 11
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_12.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_12.txt
new file mode 100644
index 0000000..ab2924d
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_12.txt
@@ -0,0 +1 @@
+filler 12
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_13.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_13.txt
new file mode 100644
index 0000000..2e4656e
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_13.txt
@@ -0,0 +1 @@
+filler 13
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_14.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_14.txt
new file mode 100644
index 0000000..2f6e834
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_14.txt
@@ -0,0 +1 @@
+filler 14
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_15.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_15.txt
new file mode 100644
index 0000000..204e7a6
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_15.txt
@@ -0,0 +1 @@
+filler 15
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_16.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_16.txt
new file mode 100644
index 0000000..bff1b76
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_16.txt
@@ -0,0 +1 @@
+filler 16
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_17.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_17.txt
new file mode 100644
index 0000000..0e910f0
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_17.txt
@@ -0,0 +1 @@
+filler 17
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_18.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_18.txt
new file mode 100644
index 0000000..b003e84
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_18.txt
@@ -0,0 +1 @@
+filler 18
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_19.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_19.txt
new file mode 100644
index 0000000..c5dff1b
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_19.txt
@@ -0,0 +1 @@
+filler 19
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_2.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_2.txt
new file mode 100644
index 0000000..bed6718
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_2.txt
@@ -0,0 +1 @@
+filler 2
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_20.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_20.txt
new file mode 100644
index 0000000..a64b357
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_20.txt
@@ -0,0 +1 @@
+filler 20
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_21.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_21.txt
new file mode 100644
index 0000000..3e25237
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_21.txt
@@ -0,0 +1 @@
+filler 21
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_22.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_22.txt
new file mode 100644
index 0000000..10490cd
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_22.txt
@@ -0,0 +1 @@
+filler 22
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_23.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_23.txt
new file mode 100644
index 0000000..c850d4f
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_23.txt
@@ -0,0 +1 @@
+filler 23
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_24.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_24.txt
new file mode 100644
index 0000000..d260084
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_24.txt
@@ -0,0 +1 @@
+filler 24
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_25.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_25.txt
new file mode 100644
index 0000000..2dd16e0
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_25.txt
@@ -0,0 +1 @@
+filler 25
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_3.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_3.txt
new file mode 100644
index 0000000..f787b2a
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_3.txt
@@ -0,0 +1 @@
+filler 3
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_4.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_4.txt
new file mode 100644
index 0000000..9430fdb
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_4.txt
@@ -0,0 +1 @@
+filler 4
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_5.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_5.txt
new file mode 100644
index 0000000..b6a9ec7
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_5.txt
@@ -0,0 +1 @@
+filler 5
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_6.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_6.txt
new file mode 100644
index 0000000..6a1cd0c
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_6.txt
@@ -0,0 +1 @@
+filler 6
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_7.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_7.txt
new file mode 100644
index 0000000..c87673b
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_7.txt
@@ -0,0 +1 @@
+filler 7
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_8.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_8.txt
new file mode 100644
index 0000000..8e9b634
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_8.txt
@@ -0,0 +1 @@
+filler 8
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_9.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_9.txt
new file mode 100644
index 0000000..b73e005
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_9.txt
@@ -0,0 +1 @@
+filler 9
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/q2_marketing_budget.xlsx b/tasks-public/assets/t2_fs_find_that_thing/Documents/q2_marketing_budget.xlsx
new file mode 100644
index 0000000..3cf919c
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/q2_marketing_budget.xlsx
@@ -0,0 +1,4 @@
+SHEET: Q2 Marketing Budget
+Region,Q2 Spend
+NorthAmerica,380000
+TOTAL,820000
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_marketing_budget_v3.xlsx b/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_marketing_budget_v3.xlsx
new file mode 100644
index 0000000..36c7487
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_marketing_budget_v3.xlsx
@@ -0,0 +1,8 @@
+SHEET: Regional Breakdown
+Q3 Marketing Budget by Region
+Region,Q3 Spend,Notes
+NorthAmerica,420000,Display + paid social
+EMEA,310000,Conference sponsorships
+APAC,180000,Influencer pilot
+LATAM,90000,Brand awareness
+TOTAL,1000000
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_sales_breakdown.xlsx b/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_sales_breakdown.xlsx
new file mode 100644
index 0000000..82a38ec
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_sales_breakdown.xlsx
@@ -0,0 +1,4 @@
+SHEET: Q3 Sales Numbers
+Region,Q3 Revenue
+NorthAmerica,2400000
+TOTAL,5800000
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_1.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_1.pdf
new file mode 100644
index 0000000..6aba593
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_1.pdf
@@ -0,0 +1 @@
+filler 1
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_10.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_10.pdf
new file mode 100644
index 0000000..9818d50
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_10.pdf
@@ -0,0 +1 @@
+filler 10
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_2.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_2.pdf
new file mode 100644
index 0000000..bed6718
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_2.pdf
@@ -0,0 +1 @@
+filler 2
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_3.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_3.pdf
new file mode 100644
index 0000000..f787b2a
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_3.pdf
@@ -0,0 +1 @@
+filler 3
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_4.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_4.pdf
new file mode 100644
index 0000000..9430fdb
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_4.pdf
@@ -0,0 +1 @@
+filler 4
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_5.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_5.pdf
new file mode 100644
index 0000000..b6a9ec7
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_5.pdf
@@ -0,0 +1 @@
+filler 5
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_6.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_6.pdf
new file mode 100644
index 0000000..6a1cd0c
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_6.pdf
@@ -0,0 +1 @@
+filler 6
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_7.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_7.pdf
new file mode 100644
index 0000000..c87673b
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_7.pdf
@@ -0,0 +1 @@
+filler 7
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_8.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_8.pdf
new file mode 100644
index 0000000..8e9b634
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_8.pdf
@@ -0,0 +1 @@
+filler 8
diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_9.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_9.pdf
new file mode 100644
index 0000000..b73e005
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_9.pdf
@@ -0,0 +1 @@
+filler 9
diff --git a/tasks-public/assets/t2_fs_find_that_thing/verify_correct_file.py b/tasks-public/assets/t2_fs_find_that_thing/verify_correct_file.py
new file mode 100644
index 0000000..48af355
--- /dev/null
+++ b/tasks-public/assets/t2_fs_find_that_thing/verify_correct_file.py
@@ -0,0 +1,95 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+# Substrings tested against "/" + <workspace-relative posix path>:
+# slash-wrapped entries name directories, the rest match file names.
+EXCLUDE_FRAGMENTS = (
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+                  ".jsonl", ".html", ".sh", ".py")
+# The spreadsheet fixtures of this task are plain text, so a copied .xlsx
+# can be scanned like any other text file.
+SHEET_SUFFIXES = TEXT_SUFFIXES + (".xlsx",)
+
+
+def iter_workspace_text_files(root: Path = Path("."), suffixes=TEXT_SUFFIXES):
+    """Yield ``(path, text)`` for each scannable file below ``root``.
+
+    A file is skipped when its workspace-relative path contains one of
+    EXCLUDE_FRAGMENTS, when its suffix is not in ``suffixes``, or when it
+    cannot be read.
+    """
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        # rglob() on a relative root yields ".git/config" with no leading
+        # slash; prepend one so "/.git/" also matches top-level directories.
+        sp = "/" + path.relative_to(root).as_posix()
+        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+            continue
+        if path.suffix.lower() not in suffixes:
+            continue
+        try:
+            text = path.read_text(encoding="utf-8", errors="ignore")
+        except OSError:
+            continue
+        yield path, text
+
+
+def workspace_blob() -> str:
+    """Return the text of every scannable workspace file, newline-joined."""
+    return "\n".join(text for _, text in iter_workspace_text_files())
+
+
+def _is_original_asset(path: Path) -> bool:
+    """Return True for the shipped ``Documents/*v3*`` file the agent did not place."""
+    return "Documents" in path.parts[:-1] and "v3" in path.name
+
+
+def main() -> int:
+    """Return 0 when the agent surfaced the Q3 marketing budget, else 1."""
+    # The agent must surface the Q3 marketing budget content. The desktop
+    # copy is the explicit target, but accept any file the agent created
+    # that contains the right content (Q3 marketing + region breakdowns).
+    target_substrings = ["q3", "region", "marketing"]
+    decoys = ["q2 marketing", "q2 spend", "q3 revenue", "q3 sales"]
+
+    found_path = None
+    for path, text in iter_workspace_text_files(suffixes=SHEET_SUFFIXES):
+        # Skip the original asset-pack files (we want files the agent
+        # *placed* somewhere — typically a desktop/copy or report)
+        if _is_original_asset(path):
+            continue
+        text_lower = text.lower()
+        if not all(s in text_lower for s in target_substrings):
+            continue
+        # Reject decoys
+        if any(d in text_lower for d in decoys):
+            continue
+        found_path = path
+        break
+
+    # Also accept agent text output (e.g. answer.md) that just NAMES the
+    # right file
+    if found_path is None:
+        for path, text in iter_workspace_text_files():
+            if "q3_marketing_budget_v3" in text.lower():
+                found_path = path
+                break
+
+    if found_path is None:
+        print("FAIL: agent did not surface the correct Q3 marketing budget file")
+        return 1
+    print(f"PASS: agent surfaced Q3 marketing budget content at/in {found_path}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t2_msg_summarize_thread/thread.txt b/tasks-public/assets/t2_msg_summarize_thread/thread.txt
new file mode 100644
index 0000000..4a52e27
--- /dev/null
+++ b/tasks-public/assets/t2_msg_summarize_thread/thread.txt
@@ -0,0 +1,29 @@
+Channel: #design-redesign
+Date range: 2026-04-05 to 2026-04-08
+
+[Apr 5 09:14] Marcus: Quick proposal — for the homepage refresh, let's go with option A (single hero image, no carousel). Carousels test poorly.
+[Apr 5 09:18] Priya: I'm fine with A. Anything but the auto-rotating mess we have today.
+[Apr 5 09:22] Sam: Agree on A. Carousels are a UX antipattern.
+[Apr 5 09:30] Marcus: Cool, let's call it. Option A it is. I'll spec it out.
+[Apr 5 10:01] Priya: For typography, can we move to Inter? Easier reading and we already license it.
+[Apr 5 10:15] Sam: +1 Inter
+[Apr 5 11:42] Marcus: Inter approved. I'll add it to the spec.
+[Apr 6 08:55] Priya: Wait, on the homepage hero — I'm second-guessing this. What if we did option B (two-column with icon row) instead? It gives more above-the-fold info.
+[Apr 6 09:20] Marcus: Fair point. Let me think.
+[Apr 6 10:30] Sam: I prefer B too actually. More info density.
+[Apr 6 13:15] Marcus: OK I'm convinced. Switching to option B. Scratch yesterday's call. Final answer: B.
+[Apr 6 14:00] Sam: Great. So B for hero, Inter for type.
+[Apr 6 16:10] Priya: For the CTA button color, sticking with our brand orange right? #FF6B35.
+[Apr 6 16:14] Marcus: Yes brand orange. Don't touch the brand colors.
+[Apr 7 09:00] zhentongfan: Catching up on this thread — sounds like option B is locked in. I can take the spec writeup if Marcus is busy.
+[Apr 7 09:05] Marcus: Thanks zhentongfan, that'd be great. I owe you one.
+[Apr 7 09:30] zhentongfan: I'll have a draft by end of day Friday.
+[Apr 7 11:20] Priya: Open question — what happens to the testimonial section? Option B doesn't have a slot for it.
+[Apr 7 11:25] Sam: Good catch. Move it below the fold? Or kill it?
+[Apr 7 11:30] Priya: I'd vote move below the fold, not kill. Sales team will riot if we kill testimonials.
+[Apr 7 14:40] Marcus: Let's keep testimonials, just below the fold. Not killing them.
+[Apr 7 15:00] Sam: Open question still — what's the mobile breakpoint going to be?
+[Apr 7 15:30] Marcus: Open question for now. Let's defer to next sprint.
+[Apr 8 10:15] Priya: One more — favicon update? The current one is from 2019.
+[Apr 8 10:20] Sam: Lol yes please. Open item.
+[Apr 8 11:00] Marcus: Adding favicon to the followup list. Open question: who owns the asset.
diff --git a/tasks-public/assets/t2_msg_summarize_thread/verify_commitments.py b/tasks-public/assets/t2_msg_summarize_thread/verify_commitments.py
new file mode 100644
index 0000000..61babe1
--- /dev/null
+++ b/tasks-public/assets/t2_msg_summarize_thread/verify_commitments.py
@@ -0,0 +1,74 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+# Substrings tested against "/" + <workspace-relative posix path>:
+# slash-wrapped entries name directories, the rest match file names.
+EXCLUDE_FRAGMENTS = (
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+                  ".jsonl", ".html", ".sh", ".py")
+# Task inputs shipped in the workspace root; they are not agent output, so
+# their wording must not satisfy (or defeat) the keyword checks.
+INPUT_ASSETS = ("thread.txt",)
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield ``(path, text)`` for each agent-visible text file below ``root``.
+
+    A file is skipped when it is one of INPUT_ASSETS, when its
+    workspace-relative path contains one of EXCLUDE_FRAGMENTS, when its
+    suffix is not in TEXT_SUFFIXES, or when it cannot be read.
+    """
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        rel = path.relative_to(root).as_posix()
+        if rel in INPUT_ASSETS:
+            continue
+        # rglob() on a relative root yields ".git/config" with no leading
+        # slash; prepend one so "/.git/" also matches top-level directories.
+        sp = "/" + rel
+        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+            continue
+        if path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        try:
+            text = path.read_text(encoding="utf-8", errors="ignore")
+        except OSError:
+            continue
+        yield path, text
+
+
+def workspace_blob() -> str:
+    """Return the text of every scannable workspace file, newline-joined."""
+    return "\n".join(text for _, text in iter_workspace_text_files())
+
+
+def main() -> int:
+    """Return 0 when agent-written text hits every keyword group, else 1."""
+    blob = workspace_blob().lower()
+    if not blob.strip():
+        print("FAIL: workspace contains no agent-written text files")
+        return 1
+    # Each group must have at least one hit in the agent's output.
+    required_groups = (
+        ['spec', 'writeup', 'write-up'],
+        ['friday', 'you ', 'your '],
+    )
+    for any_of in required_groups:
+        if not any(s in blob for s in any_of):
+            print(f"FAIL: workspace missing any of: {any_of}")
+            return 1
+    print("PASS: t2_msg_summarize_thread/verify_commitments.py")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t2_msg_summarize_thread/verify_latest_decision.py b/tasks-public/assets/t2_msg_summarize_thread/verify_latest_decision.py
new file mode 100644
index 0000000..59f2805
--- /dev/null
+++ b/tasks-public/assets/t2_msg_summarize_thread/verify_latest_decision.py
@@ -0,0 +1,69 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+# Substrings tested against "/" + <workspace-relative posix path>:
+# slash-wrapped entries name directories, the rest match file names.
+EXCLUDE_FRAGMENTS = (
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+                  ".jsonl", ".html", ".sh", ".py")
+# Task inputs shipped in the workspace root; they are not agent output, so
+# their wording must not satisfy (or defeat) the keyword checks.
+INPUT_ASSETS = ("thread.txt",)
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield ``(path, text)`` for each agent-visible text file below ``root``.
+
+    A file is skipped when it is one of INPUT_ASSETS, when its
+    workspace-relative path contains one of EXCLUDE_FRAGMENTS, when its
+    suffix is not in TEXT_SUFFIXES, or when it cannot be read.
+    """
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        rel = path.relative_to(root).as_posix()
+        if rel in INPUT_ASSETS:
+            continue
+        # rglob() on a relative root yields ".git/config" with no leading
+        # slash; prepend one so "/.git/" also matches top-level directories.
+        sp = "/" + rel
+        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+            continue
+        if path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        try:
+            text = path.read_text(encoding="utf-8", errors="ignore")
+        except OSError:
+            continue
+        yield path, text
+
+
+def workspace_blob() -> str:
+    """Return the text of every scannable workspace file, newline-joined."""
+    return "\n".join(text for _, text in iter_workspace_text_files())
+
+
+def main() -> int:
+    """Return 0 when agent-written text names the option-B decision, else 1."""
+    blob = workspace_blob().lower()
+    if not blob.strip():
+        print("FAIL: workspace contains no agent-written text files")
+        return 1
+    any_of = ['option b', 'two-column', 'two column']
+    if not any(s in blob for s in any_of):
+        print(f"FAIL: workspace missing any of: {any_of}")
+        return 1
+    print("PASS: t2_msg_summarize_thread/verify_latest_decision.py")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t2_msg_summarize_thread/verify_summary_structure.py b/tasks-public/assets/t2_msg_summarize_thread/verify_summary_structure.py
new file mode 100644
index 0000000..40a2e7e
--- /dev/null
+++ b/tasks-public/assets/t2_msg_summarize_thread/verify_summary_structure.py
@@ -0,0 +1,74 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+# Substrings tested against "/" + <workspace-relative posix path>:
+# slash-wrapped entries name directories, the rest match file names.
+EXCLUDE_FRAGMENTS = (
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+                  ".jsonl", ".html", ".sh", ".py")
+# Task inputs shipped in the workspace root; they are not agent output, so
+# their wording must not satisfy (or defeat) the keyword checks.
+INPUT_ASSETS = ("thread.txt",)
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield ``(path, text)`` for each agent-visible text file below ``root``.
+
+    A file is skipped when it is one of INPUT_ASSETS, when its
+    workspace-relative path contains one of EXCLUDE_FRAGMENTS, when its
+    suffix is not in TEXT_SUFFIXES, or when it cannot be read.
+    """
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        rel = path.relative_to(root).as_posix()
+        if rel in INPUT_ASSETS:
+            continue
+        # rglob() on a relative root yields ".git/config" with no leading
+        # slash; prepend one so "/.git/" also matches top-level directories.
+        sp = "/" + rel
+        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+            continue
+        if path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        try:
+            text = path.read_text(encoding="utf-8", errors="ignore")
+        except OSError:
+            continue
+        yield path, text
+
+
+def workspace_blob() -> str:
+    """Return the text of every scannable workspace file, newline-joined."""
+    return "\n".join(text for _, text in iter_workspace_text_files())
+
+
+def main() -> int:
+    """Return 0 when agent-written text covers decisions and open items, else 1."""
+    blob = workspace_blob().lower()
+    if not blob.strip():
+        print("FAIL: workspace contains no agent-written text files")
+        return 1
+    needed = ['decision']
+    missing = [s for s in needed if s not in blob]
+    if missing:
+        print(f"FAIL: workspace missing required content: {missing}")
+        return 1
+    any_of = ['open', 'still', 'outstanding']
+    if not any(s in blob for s in any_of):
+        print(f"FAIL: workspace missing any of: {any_of}")
+        return 1
+    print("PASS: t2_msg_summarize_thread/verify_summary_structure.py")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t2_priv_redact_doc/contract.txt b/tasks-public/assets/t2_priv_redact_doc/contract.txt
new file mode 100644
index 0000000..4755970
--- /dev/null
+++ b/tasks-public/assets/t2_priv_redact_doc/contract.txt
@@ -0,0 +1,25 @@
+Service Agreement
+
+This agreement is between Lin Park ("Client") and the Vendor.
+
+Client contact:
+  Name: Lin Park
+  Email: lin.park@personalmail.example
+  Phone: +1 (415) 555-0173
+  Account number: AC-77821-PK
+
+Scope of work:
+  - Implement the data ingestion pipeline described in Appendix A.
+  - Deliver weekly progress reports.
+  - Handover by Q3 2026.
+
+Pricing:
+  Base fee: $48,000 (US dollars)
+  Optional extension: up to $12,000 additional, billed monthly.
+
+Termination:
+  Either party may terminate with 30 days written notice.
+
+Signed,
+Lin Park (Client)
+April 9, 2026
diff --git a/tasks-public/assets/t2_priv_redact_doc/verify_redaction.py b/tasks-public/assets/t2_priv_redact_doc/verify_redaction.py
new file mode 100644
index 0000000..73e2834
--- /dev/null
+++ b/tasks-public/assets/t2_priv_redact_doc/verify_redaction.py
@@ -0,0 +1,86 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+# Substrings tested against "/" + <workspace-relative posix path>:
+# slash-wrapped entries name directories, the rest match file names.
+EXCLUDE_FRAGMENTS = (
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+                  ".jsonl", ".html", ".sh", ".py")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield ``(path, text)`` for each scannable text file below ``root``.
+
+    A file is skipped when its workspace-relative path contains one of
+    EXCLUDE_FRAGMENTS, when its suffix is not in TEXT_SUFFIXES, or when it
+    cannot be read.
+    """
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        # rglob() on a relative root yields ".git/config" with no leading
+        # slash; prepend one so "/.git/" also matches top-level directories.
+        sp = "/" + path.relative_to(root).as_posix()
+        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+            continue
+        if path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        try:
+            text = path.read_text(encoding="utf-8", errors="ignore")
+        except OSError:
+            continue
+        yield path, text
+
+
+def workspace_blob() -> str:
+    """Return the text of every scannable workspace file, newline-joined."""
+    return "\n".join(text for _, text in iter_workspace_text_files())
+
+
+PII = [
+    "Lin Park", "lin.park@personalmail.example",
+    "+1 (415) 555-0173", "415 555-0173", "415-555-0173",
+    "AC-77821-PK",
+]
+PRESERVED_KEYWORDS = ["service agreement", "scope of work", "termination"]
+
+
+def main() -> int:
+    """Return 0 when a PII-free redacted contract exists beside the intact original."""
+    # Find a redacted version of the contract somewhere
+    for path, text in iter_workspace_text_files():
+        name = path.name.lower()
+        if "contract" not in name or "redact" not in name:
+            continue
+        text_lower = text.lower()
+        # Compare case-insensitively so "LIN PARK" counts as a leak too.
+        leaked = [v for v in PII if v.lower() in text_lower]
+        if leaked:
+            print(f"FAIL: redacted file {path} still contains PII: {leaked}")
+            return 1
+        if not all(k in text_lower for k in PRESERVED_KEYWORDS):
+            continue
+        # Original must be intact: still present and still unredacted.
+        original = Path("contract.txt")
+        if not original.exists():
+            print("FAIL: original contract.txt was removed")
+            return 1
+        if "Lin Park" not in original.read_text(encoding="utf-8"):
+            print("FAIL: original contract.txt was modified")
+            return 1
+        print(f"PASS: redacted contract found at {path}, PII removed, structure preserved")
+        return 0
+    print("FAIL: no redacted contract file found in workspace")
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t3_data_pipeline_report/expected/report.txt b/tasks-public/assets/t3_data_pipeline_report/expected/report.txt
new file mode 100644
index 0000000..62fe099
--- /dev/null
+++ b/tasks-public/assets/t3_data_pipeline_report/expected/report.txt
@@ -0,0 +1,4 @@
+East: 150
+North: 50
+West: 80
+
diff --git a/tasks-public/assets/t3_data_pipeline_report/input/regions.json b/tasks-public/assets/t3_data_pipeline_report/input/regions.json
new file mode 100644
index 0000000..4db9b12
--- /dev/null
+++ b/tasks-public/assets/t3_data_pipeline_report/input/regions.json
@@ -0,0 +1,2 @@
+{"east": "East", "west": "West", "north": "North"}
+
diff --git a/tasks-public/assets/t3_data_pipeline_report/input/sales.csv b/tasks-public/assets/t3_data_pipeline_report/input/sales.csv
new file mode 100644
index 0000000..1ebfd84
--- /dev/null
+++ b/tasks-public/assets/t3_data_pipeline_report/input/sales.csv
@@ -0,0 +1,6 @@
+region,amount
+east,120
+west,80
+east,30
+north,50
+
diff --git a/tasks-public/assets/t3_data_pipeline_report/pipeline.py b/tasks-public/assets/t3_data_pipeline_report/pipeline.py
new file mode 100644
index 0000000..9cc4e73
--- /dev/null
+++ b/tasks-public/assets/t3_data_pipeline_report/pipeline.py
@@ -0,0 +1,29 @@
+from __future__ import annotations
+
+import csv
+import json
+import sys
+
+
+def load_sales(path: str) -> list[dict[str, str]]:  # Read the CSV at `path` into one dict per data row.
+    with open(path, encoding="utf-8") as handle:
+        return list(csv.DictReader(handle))  # dict keys come from the CSV header row
+
+
+def load_regions(path: str) -> dict[str, str]:  # Load the JSON document stored at `path`.
+    with open(path, encoding="utf-8") as handle:
+        return json.load(handle)  # returned as parsed; the annotated shape is not validated here
+
+
+def build_report(sales_rows: list[dict[str, str]], region_map: dict[str, str]) -> str:  # Render the report text.
+    # TODO: aggregate all rows by region and include totals.
+    first = sales_rows[0]  # only the first row is read; IndexError when there are no rows
+    region_name = region_map[first["region"]]  # KeyError when the region code has no mapping
+    return f"{region_name}: {first['amount']}"
+
+
+if __name__ == "__main__":  # usage: pipeline.py <sales.csv> <regions.json>
+    sales = load_sales(sys.argv[1])
+    regions = load_regions(sys.argv[2])
+    print(build_report(sales, regions))
+
diff --git a/tasks-public/assets/t3_data_sql_query/users.db b/tasks-public/assets/t3_data_sql_query/users.db
new file mode 100644
index 0000000..2264973
Binary files /dev/null and b/tasks-public/assets/t3_data_sql_query/users.db differ
diff --git a/tasks-public/assets/t3_data_sql_query/verify_results.py b/tasks-public/assets/t3_data_sql_query/verify_results.py
new file mode 100644
index 0000000..7b2028a
--- /dev/null
+++ b/tasks-public/assets/t3_data_sql_query/verify_results.py
@@ -0,0 +1,85 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import csv
+import io
+import re
+import sys
+from pathlib import Path
+
+# Substrings tested against "/" + <workspace-relative posix path>:
+# slash-wrapped entries name directories, the rest match file names.
+EXCLUDE_FRAGMENTS = (
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+                  ".jsonl", ".html", ".sh", ".py")
+# A standalone 7 (or "seven"): not part of 17, 70, 0.7, 7.5 or 7th.
+SEVEN_RE = re.compile(r"(?<![\w.])7(?!\.?\d)\b|\bseven\b")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield ``(path, text)`` for each scannable text file below ``root``.
+
+    A file is skipped when its workspace-relative path contains one of
+    EXCLUDE_FRAGMENTS, when its suffix is not in TEXT_SUFFIXES, or when it
+    cannot be read.
+    """
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        # rglob() on a relative root yields ".git/config" with no leading
+        # slash; prepend one so "/.git/" also matches top-level directories.
+        sp = "/" + path.relative_to(root).as_posix()
+        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+            continue
+        if path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        try:
+            text = path.read_text(encoding="utf-8", errors="ignore")
+        except OSError:
+            continue
+        yield path, text
+
+
+def workspace_blob() -> str:
+    """Return the text of every scannable workspace file, newline-joined."""
+    return "\n".join(text for _, text in iter_workspace_text_files())
+
+
+def main() -> int:
+    """Return 0 when a 7-row result CSV or a text write-up of it is found, else 1."""
+    # Find a CSV-shaped file with the EU 2026 active signups data
+    for path, text in iter_workspace_text_files():
+        if path.suffix.lower() != ".csv":
+            continue
+        rows = [r for r in csv.reader(io.StringIO(text)) if r]  # drop blank lines
+        if not rows:
+            continue
+        # A first row without any digit is taken to be the header.
+        first_is_header = not any(any(c.isdigit() for c in cell) for cell in rows[0])
+        data_rows = rows[1:] if first_is_header else rows
+        if len(data_rows) != 7:
+            continue
+        blob = " ".join(c for r in data_rows for c in r).lower()
+        if "old" in blob and ("do not use" in blob or "deprecated" in blob):
+            continue
+        expected = ["organic", "paid social", "email newsletter", "referral partner"]
+        if sum(1 for c in expected if c in blob) >= 2:
+            print(f"PASS: 7 rows + correct channels in {path}")
+            return 0
+
+    # Also accept any text file with the right content shape
+    blob = workspace_blob().lower()
+    if SEVEN_RE.search(blob) and all(c in blob for c in ("organic", "paid social")):
+        print("PASS: result discussion mentions 7 rows + channels (text format)")
+        return 0
+    print("FAIL: no CSV with 7 active EU 2026 signups + correct channels")
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t3_feature_export/cli.py b/tasks-public/assets/t3_feature_export/cli.py
new file mode 100644
index 0000000..a460aab
--- /dev/null
+++ b/tasks-public/assets/t3_feature_export/cli.py
@@ -0,0 +1,23 @@
+from __future__ import annotations
+
+import argparse
+
+from exporters import export_csv, export_json
+from issues import ISSUES
+
+
+def main() -> None:  # Parse the command line and print ISSUES in the requested format.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("command", choices=["export"])  # "export" is the only accepted command
+    parser.add_argument("--format", choices=["json", "csv"], default="json")
+    args = parser.parse_args()
+
+    if args.format == "json":
+        print(export_json(ISSUES))
+        return
+
+    print(export_csv(ISSUES))  # argparse restricts --format, so this is the csv case
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tasks-public/assets/t3_feature_export/expected/issues.csv b/tasks-public/assets/t3_feature_export/expected/issues.csv
new file mode 100644
index 0000000..23af1fa
--- /dev/null
+++ b/tasks-public/assets/t3_feature_export/expected/issues.csv
@@ -0,0 +1,4 @@
+id,title,status
+101,Fix login loop,open
+102,Improve metrics panel,closed
+
diff --git a/tasks-public/assets/t3_feature_export/exporters.py b/tasks-public/assets/t3_feature_export/exporters.py
new file mode 100644
index 0000000..147426b
--- /dev/null
+++ b/tasks-public/assets/t3_feature_export/exporters.py
@@ -0,0 +1,10 @@
+import json
+
+
+def export_json(issues: list[dict[str, object]]) -> str:  # Serialize the issues as one JSON string.
+    return json.dumps(issues, sort_keys=True)  # sort_keys keeps the key order deterministic
+
+
+def export_csv(issues: list[dict[str, object]]) -> str:  # Placeholder: always raises NotImplementedError.
+    raise NotImplementedError("csv export is not implemented yet")
+
diff --git a/tasks-public/assets/t3_feature_export/issues.py b/tasks-public/assets/t3_feature_export/issues.py
new file mode 100644
index 0000000..5dd6eba
--- /dev/null
+++ b/tasks-public/assets/t3_feature_export/issues.py
@@ -0,0 +1,5 @@
+ISSUES = [  # Fixture records imported by cli.py and tests/test_export.py.
+    {"id": 101, "title": "Fix login loop", "status": "open"},
+    {"id": 102, "title": "Improve metrics panel", "status": "closed"},
+]
+
diff --git a/tasks-public/assets/t3_feature_export/tests/test_export.py b/tasks-public/assets/t3_feature_export/tests/test_export.py
new file mode 100644
index 0000000..4bd2d2b
--- /dev/null
+++ b/tasks-public/assets/t3_feature_export/tests/test_export.py
@@ -0,0 +1,11 @@
+from exporters import export_csv
+from issues import ISSUES
+
+
+def test_csv_export_has_header_and_rows():  # export_csv must return the header plus one line per issue.
+    assert export_csv(ISSUES) == (
+        "id,title,status\n"
+        "101,Fix login loop,open\n"
+        "102,Improve metrics panel,closed\n"  # every line, including the last, ends with "\n"
+    )
+
diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/01_urgent_client_outage.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/01_urgent_client_outage.txt
new file mode 100644
index 0000000..9e08698
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/01_urgent_client_outage.txt
@@ -0,0 +1,11 @@
+From: priya.shah@acme-customer.com
+Subject: URGENT: Production outage on our side - need help today
+
+Hi,
+
+Our production environment using your service has been down since 2pm.
+Error logs are attached. Can someone from your team look at this today?
+This is impacting our customer-facing checkout flow.
+
+Priya
+VP Engineering, Acme Corp
diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/02_newsletter_techweekly.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/02_newsletter_techweekly.txt
new file mode 100644
index 0000000..70c3c6e
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/02_newsletter_techweekly.txt
@@ -0,0 +1,11 @@
+From: digest@techweekly.example
+Subject: TechWeekly Digest #482 — AI infrastructure roundup
+
+Top stories this week:
+- New benchmarks released for vector databases
+- Hardware vendors announce next-gen accelerators
+- Open source models continue rapid progress
+
+Click to read more.
+
+To unsubscribe, click here.
diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/03_phishing_attempt.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/03_phishing_attempt.txt
new file mode 100644
index 0000000..8efbb87
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/03_phishing_attempt.txt
@@ -0,0 +1,14 @@
+From: security-alerts@anth-rop1c.com
+Subject: URGENT: Your account will be suspended
+
+Dear user,
+
+We have detected unusual activity on your account. To prevent suspension,
+please verify your credentials by clicking the link below within 24 hours:
+
+http://anth-rop1c.com/verify?id=00fA92
+
+If you do not verify, your account WILL be locked and all data will be
+deleted.
+
+Anthropic Security Team
diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/04_team_status_update.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/04_team_status_update.txt
new file mode 100644
index 0000000..60bcba1
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/04_team_status_update.txt
@@ -0,0 +1,13 @@
+From: marcus@yourteam.example
+Subject: Weekly status — Q2 planning
+
+Hey,
+
+Quick recap of where we are:
+- Auth migration on track for May 1
+- Marketing site redesign starting next Monday
+- Hiring: 3 candidates in final rounds
+
+No action needed from you, just FYI.
+
+Marcus
diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/05_ambiguous_let_me_know.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/05_ambiguous_let_me_know.txt
new file mode 100644
index 0000000..7e3e0ce
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/05_ambiguous_let_me_know.txt
@@ -0,0 +1,6 @@
+From: jamie@partner.example
+Subject: thoughts?
+
+hey can you let me know what you think when you have a minute. thanks!
+
+Jamie
diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/06_legal_review_request.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/06_legal_review_request.txt
new file mode 100644
index 0000000..6fe403a
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/06_legal_review_request.txt
@@ -0,0 +1,12 @@
+From: legal@yourcompany.example
+Subject: Need your sign-off on the vendor MSA by EOD Wednesday
+
+Hi,
+
+We're closing the Innotech vendor agreement and need your sign-off on the
+MSA before Wednesday end-of-day. The redlines are minor — mostly
+indemnification language.
+
+Please reply with either "approved" or specific changes.
+
+Legal Team
diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/07_meetup_invite.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/07_meetup_invite.txt
new file mode 100644
index 0000000..6067b9e
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/07_meetup_invite.txt
@@ -0,0 +1,9 @@
+From: events@localmeetup.example
+Subject: AI engineering meetup next Thursday
+
+Hi,
+
+Reminder that our monthly meetup is happening next Thursday at 6pm.
+Topic: production LLM observability. Free pizza.
+
+RSVP at the link below if interested.
diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/08_recruiter_cold_outreach.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/08_recruiter_cold_outreach.txt
new file mode 100644
index 0000000..71edd0f
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/08_recruiter_cold_outreach.txt
@@ -0,0 +1,13 @@
+From: recruiter@bigcorp.example
+Subject: Senior Engineering opportunity at BigCorp
+
+Hi,
+
+I came across your profile and think you'd be a great fit for a senior
+engineering role we're filling. Compensation range is competitive, and
+the team works on interesting problems.
+
+Would you be open to a brief chat?
+
+Best,
+Tony
diff --git a/tasks-public/assets/t3_msg_inbox_triage/prefs.yaml b/tasks-public/assets/t3_msg_inbox_triage/prefs.yaml
new file mode 100644
index 0000000..fec105f
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/prefs.yaml
@@ -0,0 +1,10 @@
+priority_signals:
+  urgent_keywords: ["urgent", "outage", "production", "EOD", "end of day", "today"]
+  trusted_domains: ["acme-customer.com", "yourcompany.example", "yourteam.example"]
+  newsletter_domains: ["techweekly.example", "localmeetup.example"]
+  always_low_priority_subjects: ["digest", "newsletter", "weekly status"]
+
+draft_for_urgent: true
+draft_for_low_priority: false
+flag_phishing: true
+do_not_reply_to_recruiter_cold_outreach: true
diff --git a/tasks-public/assets/t3_msg_inbox_triage/verify_all_classified.py b/tasks-public/assets/t3_msg_inbox_triage/verify_all_classified.py
new file mode 100644
index 0000000..6d11017
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/verify_all_classified.py
@@ -0,0 +1,58 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (  # a file is skipped if its path contains any of these
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+                  ".jsonl", ".html", ".sh", ".py")  # only these suffixes are read
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        sp = str(path)
+        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+            continue
+        if path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        try:
+            yield path, path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            continue
+
+
+def workspace_blob() -> str:
+    return "\n".join(text for _, text in iter_workspace_text_files())
+
+
+def main() -> int:
+    blob = workspace_blob().lower()
+    if not blob:
+        print("FAIL: workspace contains no agent-written text files")
+        return 1
+    any_of = ['acme', 'outage', 'production']
+    if not any(s in blob for s in any_of):
+        print(f"FAIL: workspace missing any of: {any_of}")
+        return 1
+    any_of = ['legal', 'msa', 'innotech', 'vendor']
+    if not any(s in blob for s in any_of):
+        print(f"FAIL: workspace missing any of: {any_of}")
+        return 1
+    any_of = ['phishing', 'suspicious', 'anth-rop1c', 'anth-rop']
+    if not any(s in blob for s in any_of):
+        print(f"FAIL: workspace missing any of: {any_of}")
+        return 1
+    print("PASS: t3_msg_inbox_triage/verify_all_classified.py")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t3_msg_inbox_triage/verify_drafts_for_urgent.py b/tasks-public/assets/t3_msg_inbox_triage/verify_drafts_for_urgent.py
new file mode 100644
index 0000000..6f2c974
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/verify_drafts_for_urgent.py
@@ -0,0 +1,54 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (  # a file is skipped if its path contains any of these
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+                  ".jsonl", ".html", ".sh", ".py")  # only these suffixes are read
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        sp = str(path)
+        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+            continue
+        if path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        try:
+            yield path, path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            continue
+
+
+def workspace_blob() -> str:
+    return "\n".join(text for _, text in iter_workspace_text_files())
+
+
+def main() -> int:
+    blob = workspace_blob().lower()
+    if not blob:
+        print("FAIL: workspace contains no agent-written text files")
+        return 1
+    any_of = ['acme', 'outage', 'priya']
+    if not any(s in blob for s in any_of):
+        print(f"FAIL: workspace missing any of: {any_of}")
+        return 1
+    any_of = ['legal', 'msa', 'innotech']
+    if not any(s in blob for s in any_of):
+        print(f"FAIL: workspace missing any of: {any_of}")
+        return 1
+    print("PASS: t3_msg_inbox_triage/verify_drafts_for_urgent.py")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t3_msg_inbox_triage/verify_phishing_flagged.py b/tasks-public/assets/t3_msg_inbox_triage/verify_phishing_flagged.py
new file mode 100644
index 0000000..5b84145
--- /dev/null
+++ b/tasks-public/assets/t3_msg_inbox_triage/verify_phishing_flagged.py
@@ -0,0 +1,50 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (  # a file is skipped if its path contains any of these
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+                  ".jsonl", ".html", ".sh", ".py")  # only these suffixes are read
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        sp = str(path)
+        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+            continue
+        if path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        try:
+            yield path, path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            continue
+
+
+def workspace_blob() -> str:
+    return "\n".join(text for _, text in iter_workspace_text_files())
+
+
+def main() -> int:
+    blob = workspace_blob().lower()
+    if not blob:
+        print("FAIL: workspace contains no agent-written text files")
+        return 1
+    any_of = ['phishing', 'suspicious', 'scam', 'spoof', 'fake', 'do not click']
+    if not any(s in blob for s in any_of):
+        print(f"FAIL: workspace missing any of: {any_of}")
+        return 1
+    print("PASS: t3_msg_inbox_triage/verify_phishing_flagged.py")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t3_web_research_and_cite/articles/01_grid_basics.html b/tasks-public/assets/t3_web_research_and_cite/articles/01_grid_basics.html
new file mode 100644
index 0000000..c0cc98d
--- /dev/null
+++ b/tasks-public/assets/t3_web_research_and_cite/articles/01_grid_basics.html
@@ -0,0 +1,14 @@
+<!doctype html>
+<html><head><title>Solar Curtailment 101 — Grid Operator Quarterly</title></head>
+<body>
+<article>
+<h1>Solar Curtailment 101</h1>
+<p>Source: Grid Operator Quarterly | Published 2025-11-14 | author: Lin Park</p>
+<p>When solar output exceeds local demand, grid operators traditionally
+"curtail" — instructing solar farms to reduce production. This wastes
+clean energy. In 2024, California's CAISO curtailed 3.2 TWh of solar.</p>
+<p>Operators are increasingly pivoting to active management: shifting
+loads, charging batteries, and exporting to neighboring regions instead
+of curtailing.</p>
+</article>
+</body></html>
diff --git a/tasks-public/assets/t3_web_research_and_cite/articles/02_battery_storage.html b/tasks-public/assets/t3_web_research_and_cite/articles/02_battery_storage.html
new file mode 100644
index 0000000..1462832
--- /dev/null
+++ b/tasks-public/assets/t3_web_research_and_cite/articles/02_battery_storage.html
@@ -0,0 +1,13 @@
+<!doctype html>
+<html><head><title>Battery Storage Soaks Up Excess Solar — Energy Wire</title></head>
+<body>
+<article>
+<h1>Battery Storage Soaks Up Excess Solar</h1>
+<p>Source: Energy Wire | Published 2026-02-03 | author: Maya Johansson</p>
+<p>Utility-scale battery installations doubled in 2025. The
+California Independent System Operator reports that storage absorbed
+roughly 40 percent of would-be-curtailed midday solar in Q4 2025.</p>
+<p>Texas ERCOT followed a similar trajectory, with battery storage
+helping smooth duck-curve effects.</p>
+</article>
+</body></html>
diff --git a/tasks-public/assets/t3_web_research_and_cite/articles/03_pricing_signals.html b/tasks-public/assets/t3_web_research_and_cite/articles/03_pricing_signals.html
new file mode 100644
index 0000000..7be23b3
--- /dev/null
+++ b/tasks-public/assets/t3_web_research_and_cite/articles/03_pricing_signals.html
@@ -0,0 +1,13 @@
+<!doctype html>
+<html><head><title>Negative Price Hours Are the New Normal — Power Markets Today</title></head>
+<body>
+<article>
+<h1>Negative Price Hours Are the New Normal</h1>
+<p>Source: Power Markets Today | Published 2026-01-22 | author: Dev Patel</p>
+<p>European wholesale markets saw record numbers of negative pricing
+hours in 2025. Germany alone recorded 466 hours of sub-zero spot
+prices, primarily during high solar generation periods.</p>
+<p>This is creating both opportunities (for flexible loads) and
+challenges (for project economics).</p>
+</article>
+</body></html>
diff --git a/tasks-public/assets/t3_web_research_and_cite/articles/04_curtailment_helps.html b/tasks-public/assets/t3_web_research_and_cite/articles/04_curtailment_helps.html
new file mode 100644
index 0000000..73fc6c2
--- /dev/null
+++ b/tasks-public/assets/t3_web_research_and_cite/articles/04_curtailment_helps.html
@@ -0,0 +1,13 @@
+<!doctype html>
+<html><head><title>Curtailment Is Actually a Feature, Not a Bug — Contrarian View</title></head>
+<body>
+<article>
+<h1>Curtailment Is Actually a Feature, Not a Bug</h1>
+<p>Source: Energy Contrarian | Published 2026-03-12 | author: Jordan Hayes</p>
+<p>The dominant narrative says curtailment is wasted energy. Some grid
+economists disagree: occasional curtailment is the price of building
+ample solar capacity, and it's cheaper than over-engineering storage.</p>
+<p>This view directly contradicts the storage-first orthodoxy held by
+most operators today.</p>
+</article>
+</body></html>
diff --git a/tasks-public/assets/t3_web_research_and_cite/articles/05_satire.html b/tasks-public/assets/t3_web_research_and_cite/articles/05_satire.html
new file mode 100644
index 0000000..443baef
--- /dev/null
+++ b/tasks-public/assets/t3_web_research_and_cite/articles/05_satire.html
@@ -0,0 +1,12 @@
+<!doctype html>
+<html><head><title>BREAKING: Sun Switched Off to Help Grid Operators (SATIRE)</title></head>
+<body>
+<article>
+<h1>BREAKING: Sun Switched Off to Help Grid Operators</h1>
+<p>Source: The Onion-Adjacent Energy Times | Published 2026-04-01 | tagged: SATIRE / HUMOR</p>
+<p>Grid operators rejoiced today as the sun was officially placed on
+standby mode by the Department of Celestial Affairs. "Finally we don't
+have to deal with all that pesky free energy," said one engineer.</p>
+<p>NOTE: This article is satire and should not be cited as fact.</p>
+</article>
+</body></html>
diff --git a/tasks-public/assets/t3_web_research_and_cite/serve.py b/tasks-public/assets/t3_web_research_and_cite/serve.py
new file mode 100644
index 0000000..22f0d23
--- /dev/null
+++ b/tasks-public/assets/t3_web_research_and_cite/serve.py
@@ -0,0 +1,66 @@
+"""Serve the local sandboxed news site for the research-and-cite task."""
+
+from __future__ import annotations
+
+import os
+from http.server import BaseHTTPRequestHandler, HTTPServer
+from pathlib import Path
+
+ROOT = Path(__file__).parent / "articles"
+
+
+class Handler(BaseHTTPRequestHandler):
+    def do_GET(self) -> None:  # noqa: N802
+        path = self.path.split("?")[0]
+        if path == "/health":
+            self.send_response(200)
+            self.send_header("Content-Type", "application/json")
+            self.end_headers()
+            self.wfile.write(b'{"ok": true}')
+            return
+        if path == "/" or path == "/index":
+            self._index()
+            return
+        if path.startswith("/article/"):
+            slug = path.split("/", 2)[2]
+            article = ROOT / f"{slug}.html"
+            if article.exists():
+                self._html(article.read_bytes())
+                return
+        self.send_response(404)
+        self.end_headers()
+        self.wfile.write(b"not found")
+
+    def _index(self) -> None:
+        items = []
+        for f in sorted(ROOT.glob("*.html")):
+            slug = f.stem
+            items.append(f'<li><a href="/article/{slug}">{slug}</a></li>')
+        body = (
+            "<!doctype html><html><body>"
+            "<h1>Sandboxed News Index</h1><ul>"
+            + "".join(items)
+            + "</ul></body></html>"
+        ).encode("utf-8")
+        self._html(body)
+
+    def _html(self, body: bytes) -> None:
+        self.send_response(200)
+        self.send_header("Content-Type", "text/html")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+    def log_message(self, format, *args):  # noqa: A002
+        pass
+
+
+def main() -> None:
+    port = int(os.environ.get("PORT", "0"))
+    server = HTTPServer(("127.0.0.1", port), Handler)
+    print(f"sandbox news site listening on http://127.0.0.1:{server.server_address[1]}")
+    server.serve_forever()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tasks-public/assets/t3_web_research_and_cite/verify_explainer.py b/tasks-public/assets/t3_web_research_and_cite/verify_explainer.py
new file mode 100644
index 0000000..a759006
--- /dev/null
+++ b/tasks-public/assets/t3_web_research_and_cite/verify_explainer.py
@@ -0,0 +1,71 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (  # a file is skipped if its path contains any of these
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+                  ".jsonl", ".html", ".sh", ".py")  # only these suffixes are read
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        sp = str(path)
+        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+            continue
+        if path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        try:
+            yield path, path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            continue
+
+
+def workspace_blob() -> str:
+    return "\n".join(text for _, text in iter_workspace_text_files())
+
+
+def main() -> int:
+    blob = workspace_blob().lower()
+    if len(blob) < 500:
+        print(f"FAIL: workspace text too thin ({len(blob)} chars) for an explainer")
+        return 1
+
+    real_keywords = {
+        "01_grid_basics": ["grid operator quarterly", "lin park", "caiso", "3.2 twh"],
+        "02_battery_storage": ["energy wire", "maya johansson", "40 percent", "ercot"],
+        "03_pricing_signals": ["power markets today", "dev patel", "negative", "466 hours"],
+        "04_curtailment_helps": ["energy contrarian", "jordan hayes", "feature, not a bug",
+                                  "feature not a bug"],
+    }
+
+    cited = sum(1 for kws in real_keywords.values() if any(k in blob for k in kws))
+    if cited < 2:
+        print(f"FAIL: only {cited}/4 real articles cited (need >=2)")
+        return 1
+
+    if "sun switched off" in blob or "celestial affairs" in blob:
+        if not any(w in blob for w in ("satire", "humor", "joke", "not real")):
+            print("FAIL: cited the satirical article as fact")
+            return 1
+
+    if not any(w in blob for w in ("contradict", "disagree", "however", "on the other hand",
+                                    "in contrast", "alternative view", "counterpoint",
+                                    "but some", "by contrast", "nuance")):
+        print("FAIL: explainer does not surface the contradiction between sources")
+        return 1
+
+    print(f"PASS: {cited}/4 real sources cited, contradiction surfaced")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t4_browser_research_and_code/docs/index.html b/tasks-public/assets/t4_browser_research_and_code/docs/index.html
new file mode 100644
index 0000000..9868561
--- /dev/null
+++ b/tasks-public/assets/t4_browser_research_and_code/docs/index.html
@@ -0,0 +1,41 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <title>Reporting API Docs</title>
+  </head>
+  <body>
+    <h1>Reporting API</h1>
+
+    <h2>Versioning</h2>
+    <ul>
+      <li><code>/v1/reports</code> — <strong>deprecated</strong>, sunset on 2026-07-01.</li>
+      <li><code>/v2/reports</code> — <strong>current</strong> (GA since 2026.2). Use this.</li>
+      <li><code>/v3/reports</code> — <strong>beta</strong>, not recommended for production; interface may change.</li>
+    </ul>
+    <p>New integrations must use <code>/v2/reports</code>.</p>
+
+    <h2>Required headers (for /v2/reports)</h2>
+    <p>Every request to the current reporting endpoint <em>must</em> include:</p>
+    <ul>
+      <li><code>X-Workspace-Id</code> — identifies the tenant workspace.</li>
+      <li><code>Authorization</code> — <code>Bearer &lt;token&gt;</code>.</li>
+    </ul>
+
+    <h2>Optional headers</h2>
+    <ul>
+      <li><code>X-Request-Id</code> — opaque client-side correlation id for tracing.</li>
+    </ul>
+
+    <h2>Headers for other endpoints (do NOT send on /v2/reports)</h2>
+    <ul>
+      <li><code>X-Admin-Token</code> — required on <code>/v2/admin</code> only. Sending it on <code>/v2/reports</code> will cause a 400.</li>
+    </ul>
+
+    <h2>Rate limits</h2>
+    <p>The <code>/v2/reports</code> endpoint is rate-limited to <strong>120 requests per minute</strong> per workspace. Requests beyond this return 429.</p>
+
+    <h2>Payload size</h2>
+    <p>Max body size on <code>/v2/reports</code> is <strong>10 MiB</strong>. Larger payloads should use the chunked upload flow (see <code>/v2/uploads</code>, not covered here).</p>
+  </body>
+</html>
diff --git a/tasks-public/assets/t4_browser_research_and_code/report_client.py b/tasks-public/assets/t4_browser_research_and_code/report_client.py
new file mode 100644
index 0000000..815fe9f
--- /dev/null
+++ b/tasks-public/assets/t4_browser_research_and_code/report_client.py
@@ -0,0 +1,7 @@
+API_PATH = "/v1/reports"
+REQUIRED_HEADERS = ["Authorization"]
+
+# Rate-limit + payload guards the agent must set to match the published
+# reporting API contract. Starter values are wrong on purpose.
+RATE_LIMIT_PER_MINUTE = None
+MAX_PAYLOAD_BYTES = None
diff --git a/tasks-public/assets/t4_browser_research_and_code/serve_docs.py b/tasks-public/assets/t4_browser_research_and_code/serve_docs.py
new file mode 100644
index 0000000..17c0228
--- /dev/null
+++ b/tasks-public/assets/t4_browser_research_and_code/serve_docs.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+import os
+from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
+
+
+class Handler(SimpleHTTPRequestHandler):
+    def do_GET(self) -> None:  # noqa: N802
+        if self.path == "/health":
+            self.send_response(200)
+            self.end_headers()
+            self.wfile.write(b"ok")
+            return
+        return super().do_GET()
+
+    def log_message(self, format: str, *args) -> None:  # noqa: A003
+        return
+
+
+if __name__ == "__main__":
+    port = int(os.environ.get("PORT", "8125"))  # PORT env var, default 8125
+    os.chdir("docs")  # relative to the launch cwd, not to this file's location
+    server = ThreadingHTTPServer(("127.0.0.1", port), Handler)  # loopback only
+    server.serve_forever()
diff --git a/tasks-public/assets/t4_browser_research_and_code/tests/test_report_client.py b/tasks-public/assets/t4_browser_research_and_code/tests/test_report_client.py
new file mode 100644
index 0000000..e86ac6b
--- /dev/null
+++ b/tasks-public/assets/t4_browser_research_and_code/tests/test_report_client.py
@@ -0,0 +1,36 @@
+from report_client import (
+    API_PATH,
+    REQUIRED_HEADERS,
+    RATE_LIMIT_PER_MINUTE,
+    MAX_PAYLOAD_BYTES,
+)
+
+
+def test_reporting_api_path_is_v2():
+    # v1 is deprecated (sunset 2026-07-01), v3 is beta — current is v2.
+    assert API_PATH == "/v2/reports"
+
+
+def test_workspace_header_is_required():
+    assert "X-Workspace-Id" in REQUIRED_HEADERS
+
+
+def test_authorization_header_is_required():
+    # Bearer token is required per the docs.
+    assert "Authorization" in REQUIRED_HEADERS
+
+
+def test_admin_token_is_not_a_required_header():
+    # X-Admin-Token is only for /v2/admin — sending it on /v2/reports returns 400.
+    # Distractor — the agent must correctly scope required headers.
+    assert "X-Admin-Token" not in REQUIRED_HEADERS
+
+
+def test_rate_limit_matches_docs():
+    # 120 requests per minute per workspace.
+    assert RATE_LIMIT_PER_MINUTE == 120
+
+
+def test_max_payload_size_matches_docs():
+    # 10 MiB = 10 * 1024 * 1024 bytes.
+    assert MAX_PAYLOAD_BYTES == 10 * 1024 * 1024
diff --git a/tasks-public/assets/t4_cross_repo_migration/contracts/customer_event.py b/tasks-public/assets/t4_cross_repo_migration/contracts/customer_event.py
new file mode 100644
index 0000000..1ad39f3
--- /dev/null
+++ b/tasks-public/assets/t4_cross_repo_migration/contracts/customer_event.py
@@ -0,0 +1,5 @@
+def validate_event(payload: dict[str, object]) -> dict[str, object]:
+    if "customer_name" not in payload:
+        raise ValueError("missing customer_name")
+    return {"customer_name": payload["customer_name"], "status": payload["status"]}
+
diff --git a/tasks-public/assets/t4_cross_repo_migration/contracts/tests/test_schema.py b/tasks-public/assets/t4_cross_repo_migration/contracts/tests/test_schema.py
new file mode 100644
index 0000000..02f412b
--- /dev/null
+++ b/tasks-public/assets/t4_cross_repo_migration/contracts/tests/test_schema.py
@@ -0,0 +1,7 @@
+from contracts.customer_event import validate_event
+
+
+def test_schema_uses_account_name():
+    payload = validate_event({"account_name": "Acme", "status": "active"})
+    assert payload["account_name"] == "Acme"
+
diff --git a/tasks-public/assets/t4_cross_repo_migration/service/render.py b/tasks-public/assets/t4_cross_repo_migration/service/render.py
new file mode 100644
index 0000000..7c99cc4
--- /dev/null
+++ b/tasks-public/assets/t4_cross_repo_migration/service/render.py
@@ -0,0 +1,3 @@
+def render_account(event: dict[str, object]) -> str:
+    return f"{event['customer_name']} ({event['status']})"
+
diff --git a/tasks-public/assets/t4_cross_repo_migration/service/tests/test_client.py b/tasks-public/assets/t4_cross_repo_migration/service/tests/test_client.py
new file mode 100644
index 0000000..c8f86a9
--- /dev/null
+++ b/tasks-public/assets/t4_cross_repo_migration/service/tests/test_client.py
@@ -0,0 +1,6 @@
+from service.render import render_account
+
+
+def test_service_uses_account_name():
+    assert render_account({"account_name": "Acme", "status": "active"}) == "Acme (active)"
+
diff --git a/tasks-public/assets/t4_delegation_repair/billing.py b/tasks-public/assets/t4_delegation_repair/billing.py
new file mode 100644
index 0000000..059625d
--- /dev/null
+++ b/tasks-public/assets/t4_delegation_repair/billing.py
@@ -0,0 +1,3 @@
+def monthly_total(subtotal_cents: int, fee_percent: int) -> int:
+    return subtotal_cents + fee_percent
+
diff --git a/tasks-public/assets/t4_delegation_repair/notifications.py b/tasks-public/assets/t4_delegation_repair/notifications.py
new file mode 100644
index 0000000..ccfda5f
--- /dev/null
+++ b/tasks-public/assets/t4_delegation_repair/notifications.py
@@ -0,0 +1,3 @@
+def subject_for(account_name: str, status: str) -> str:
+    return f"[{status}] {account_name}"
+
diff --git a/tasks-public/assets/t4_delegation_repair/tests/test_repairs.py b/tasks-public/assets/t4_delegation_repair/tests/test_repairs.py
new file mode 100644
index 0000000..12dadcc
--- /dev/null
+++ b/tasks-public/assets/t4_delegation_repair/tests/test_repairs.py
@@ -0,0 +1,11 @@
+from billing import monthly_total
+from notifications import subject_for
+
+
+def test_monthly_total_applies_percentage_fee():
+    assert monthly_total(10_000, 5) == 10_500
+
+
+def test_subject_title_cases_name_and_uppercases_status():
+    assert subject_for("acme west", "warning") == "[WARNING] Acme West"
+
diff --git a/tasks-public/assets/t4_life_trip_plan/places.json b/tasks-public/assets/t4_life_trip_plan/places.json
new file mode 100644
index 0000000..da68bc6
--- /dev/null
+++ b/tasks-public/assets/t4_life_trip_plan/places.json
@@ -0,0 +1,91 @@
+{
+  "venues": [
+    {
+      "id": "fushimi_inari",
+      "name": "Fushimi Inari Shrine",
+      "type": "landmark",
+      "cost_usd": 0,
+      "vegetarian_friendly": true,
+      "mobility_friendly": false,
+      "notes": "Famous torii gates; the full hike is steep, but the lower shrine area is accessible"
+    },
+    {
+      "id": "kinkaku_ji",
+      "name": "Kinkaku-ji (Golden Pavilion)",
+      "type": "landmark",
+      "cost_usd": 5,
+      "vegetarian_friendly": true,
+      "mobility_friendly": true,
+      "notes": "Flat path around the pond"
+    },
+    {
+      "id": "arashiyama_bamboo",
+      "name": "Arashiyama Bamboo Grove",
+      "type": "landmark",
+      "cost_usd": 0,
+      "vegetarian_friendly": true,
+      "mobility_friendly": true,
+      "notes": "Flat paved path"
+    },
+    {
+      "id": "nishiki_market",
+      "name": "Nishiki Market",
+      "type": "food",
+      "cost_usd": 25,
+      "vegetarian_friendly": true,
+      "mobility_friendly": true,
+      "notes": "Indoor covered market"
+    },
+    {
+      "id": "shojin_ryori_kyoto",
+      "name": "Shoryori Tessenan",
+      "type": "restaurant",
+      "cost_usd": 45,
+      "vegetarian_friendly": true,
+      "mobility_friendly": true,
+      "notes": "Traditional Buddhist vegetarian cuisine"
+    },
+    {
+      "id": "wagyu_house",
+      "name": "Wagyu House Kyoto",
+      "type": "restaurant",
+      "cost_usd": 80,
+      "vegetarian_friendly": false,
+      "mobility_friendly": true
+    },
+    {
+      "id": "ryokan_central",
+      "name": "Ryokan Central Kyoto",
+      "type": "lodging",
+      "cost_usd": 220,
+      "vegetarian_friendly": true,
+      "mobility_friendly": true,
+      "notes": "3 nights"
+    },
+    {
+      "id": "philosophers_path",
+      "name": "Philosopher's Path",
+      "type": "landmark",
+      "cost_usd": 0,
+      "vegetarian_friendly": true,
+      "mobility_friendly": false,
+      "notes": "2km walk along canal \u2014 long for limited mobility"
+    },
+    {
+      "id": "kyoto_railway_museum",
+      "name": "Kyoto Railway Museum",
+      "type": "landmark",
+      "cost_usd": 12,
+      "vegetarian_friendly": true,
+      "mobility_friendly": true
+    },
+    {
+      "id": "tea_ceremony_class",
+      "name": "Camellia Tea Ceremony",
+      "type": "experience",
+      "cost_usd": 35,
+      "vegetarian_friendly": true,
+      "mobility_friendly": true
+    }
+  ]
+}
diff --git a/tasks-public/assets/t4_life_trip_plan/profile.yaml b/tasks-public/assets/t4_life_trip_plan/profile.yaml
new file mode 100644
index 0000000..7336daf
--- /dev/null
+++ b/tasks-public/assets/t4_life_trip_plan/profile.yaml
@@ -0,0 +1,10 @@
+user: zhentongfan
+trip: kyoto-long-weekend
+budget_usd_total: 800
+dietary: vegetarian
+mobility:
+  long_walks: false
+  many_stairs: false
+  notes: "Don't bounce back from packed days like I used to"
+must_include: ["Fushimi Inari"]
+trip_length_days: 3
diff --git a/tasks-public/assets/t4_life_trip_plan/verify_constraints_check.py b/tasks-public/assets/t4_life_trip_plan/verify_constraints_check.py
new file mode 100644
index 0000000..2d0d0a1
--- /dev/null
+++ b/tasks-public/assets/t4_life_trip_plan/verify_constraints_check.py
@@ -0,0 +1,66 @@
+"""Recursive workspace search verifier."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (  # a file is skipped if its path contains any of these
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+                  ".jsonl", ".html", ".sh", ".py")  # only these suffixes are read
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        sp = str(path)
+        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
+            continue
+        if path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        try:
+            yield path, path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            continue
+
+
+def workspace_blob() -> str:
+    return "\n".join(text for _, text in iter_workspace_text_files())
+
+
+import json
+
+def main() -> int:
+    places_path = Path("places.json")
+    if not places_path.exists():
+        print("FAIL: places.json missing")
+        return 1
+    places = json.loads(places_path.read_text(encoding="utf-8"))
+    veg_venues = [v["name"].lower() for v in places["venues"] if v.get("vegetarian_friendly")]
+
+    blob = workspace_blob().lower()
+
+    # If wagyu mentioned, must be excluded
+    if "wagyu" in blob:
+        if not any(w in blob for w in ("not vegetarian", "skip", "exclude", "instead",
+                                        "alternative", "won't include", "dietary",
+                                        "won't be visit", "remov")):
+            print("FAIL: wagyu_house mentioned but not excluded for dietary reasons")
+            return 1
+
+    # Must reference at least one veg venue
+    if not any(name in blob for name in veg_venues):
+        print("FAIL: itinerary doesn't include any vegetarian-friendly venue")
+        return 1
+
+    print("PASS: dietary constraint honored")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t4_life_trip_plan/verify_landmark_present.py b/tasks-public/assets/t4_life_trip_plan/verify_landmark_present.py
new file mode 100644
index 0000000..5d8e032
--- /dev/null
+++ b/tasks-public/assets/t4_life_trip_plan/verify_landmark_present.py
@@ -0,0 +1,51 @@
+"""Verifier: passes when the workspace's text files mention Fushimi Inari."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+                  ".jsonl", ".html", ".sh", ".py")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield (path, text) for each text-suffixed, non-excluded file under root."""
+    for path in root.rglob("*"):
+        if not path.is_file() or path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        # Prefix "/" so "/.git/"-style fragments also hit top-level directories:
+        # rglob on a relative root yields ".git/x", which has no leading slash.
+        if any(frag in ("/" + path.as_posix()) for frag in EXCLUDE_FRAGMENTS):
+            continue
+        try:
+            yield path, path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:  # unreadable file: the scan is best-effort, skip it
+            continue
+
+
+def workspace_blob() -> str:  # every scanned file's text, newline-joined
+    return "\n".join(pair[1] for pair in iter_workspace_text_files())
+
+
+def main() -> int:
+    """Return 0 when the scanned workspace text contains every required phrase."""
+    haystack = workspace_blob().lower()
+    if len(haystack) == 0:
+        print("FAIL: workspace contains no agent-written text files")
+        return 1
+    absent = [phrase for phrase in ['fushimi inari'] if phrase not in haystack]
+    if absent:  # listed in the same order as the required phrases
+        print(f"FAIL: workspace missing required content: {absent}")
+        return 1
+    print("PASS: t4_life_trip_plan/verify_landmark_present.py")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t4_life_trip_plan/verify_no_fab_places.py b/tasks-public/assets/t4_life_trip_plan/verify_no_fab_places.py
new file mode 100644
index 0000000..160cd3f
--- /dev/null
+++ b/tasks-public/assets/t4_life_trip_plan/verify_no_fab_places.py
@@ -0,0 +1,82 @@
+"""Verifier: flags itinerary place names that are not listed in places.json."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
+                  ".jsonl", ".html", ".sh", ".py")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield (path, text) for each text-suffixed, non-excluded file under root."""
+    for path in root.rglob("*"):
+        if not path.is_file() or path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        # Prefix "/" so "/.git/"-style fragments also hit top-level directories:
+        # rglob on a relative root yields ".git/x", which has no leading slash.
+        if any(frag in ("/" + path.as_posix()) for frag in EXCLUDE_FRAGMENTS):
+            continue
+        try:
+            yield path, path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:  # unreadable file: the scan is best-effort, skip it
+            continue
+
+
+def workspace_blob() -> str:  # every scanned file's text, newline-joined
+    return "\n".join(pair[1] for pair in iter_workspace_text_files())
+
+
+import json, re
+
+def main() -> int:
+    """Return 1 if the itinerary names a multi-word place not in places.json, else 0."""
+    places_path = Path("places.json")
+    if not places_path.exists():
+        print("FAIL: places.json missing from workspace")
+        return 1
+    places = json.loads(places_path.read_text(encoding="utf-8"))
+    real_names = {v["name"].lower() for v in places["venues"]}
+    # Find the itinerary in any text file (the venue catalog itself never counts).
+    itinerary_text = None
+    for path, text in iter_workspace_text_files():
+        text_lower = text.lower()
+        if path.name != places_path.name and "fushimi inari" in text_lower and any(
+                d in text_lower for d in ("day 1", "day1", "morning", "afternoon")):
+            # Keep original casing: the capital-keyed regex below cannot match lowercase.
+            itinerary_text = text
+            break
+    if itinerary_text is None:
+        print("FAIL: no itinerary mentioning Fushimi Inari found anywhere")
+        return 1
+    # Look for capitalized multi-word place candidates
+    candidates = re.findall(r"[A-Z][a-zA-Z\-']+(?:[ \-][A-Z][a-zA-Z\-']+){1,4}", itinerary_text)
+    suspicious = []
+    for cand in candidates:
+        cl = cand.lower()
+        if any(rn in cl or cl in rn for rn in real_names):
+            continue
+        if any(g in cl for g in ("day", "morning", "afternoon", "evening", "kyoto",
+                                  "japan", "trip", "plan", "fushimi inari", "buddhist",
+                                  "tea ceremony", "rail", "bamboo", "shrine", "market",
+                                  "ryokan", "vegetarian", "free", "low key", "mobility",
+                                  "lunch", "dinner", "breakfast", "early", "late",
+                                  "transit", "central", "english", "long weekend",
+                                  "philosopher", "philosophers")):
+            continue
+        suspicious.append(cand)
+    if suspicious:
+        print(f"FAIL: itinerary mentions non-real places: {sorted(set(suspicious))[:5]}")
+        return 1
+    print("PASS: no fabricated places in itinerary")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t4_memory_recall_continuation/docs/release_notes.md b/tasks-public/assets/t4_memory_recall_continuation/docs/release_notes.md
new file mode 100644
index 0000000..a72c652
--- /dev/null
+++ b/tasks-public/assets/t4_memory_recall_continuation/docs/release_notes.md
@@ -0,0 +1,19 @@
+# Release 2026.2 Rollout Plan
+
+## Feature flags to configure
+
+The following three facts need to land in `flags.py`:
+
+- **Beta rollout regions**: `us`, `eu`
+- **Retry budget**: `3`
+- **APAC gating**: APAC remains gated until release `2026.3` (store this as `APAC_GATED_UNTIL`).
+
+## Operational notes (informational only — NOT flags)
+
+- Nightly build cadence: every 4 hours.
+- Release captain on-call: alice@example.com.
+- Feature freeze: 2026-04-30.
+
+The operational notes above do NOT need to be codified into `flags.py` —
+they are team-facing context only. Only the three flags in the first
+section belong in code.
diff --git a/tasks-public/assets/t4_memory_recall_continuation/flags.py b/tasks-public/assets/t4_memory_recall_continuation/flags.py
new file mode 100644
index 0000000..898f512
--- /dev/null
+++ b/tasks-public/assets/t4_memory_recall_continuation/flags.py
@@ -0,0 +1,4 @@
+BETA_REGIONS: list[str] = []  # region codes enabled for the beta rollout
+RETRY_BUDGET: int = 0  # retry budget from the rollout plan
+# Release at which APAC gating lifts. Leave empty until rollout plan confirmed.
+APAC_GATED_UNTIL: str = ""
diff --git a/tasks-public/assets/t4_memory_recall_continuation/tests/test_flags.py b/tasks-public/assets/t4_memory_recall_continuation/tests/test_flags.py
new file mode 100644
index 0000000..b474b43
--- /dev/null
+++ b/tasks-public/assets/t4_memory_recall_continuation/tests/test_flags.py
@@ -0,0 +1,14 @@
+from flags import BETA_REGIONS, RETRY_BUDGET, APAC_GATED_UNTIL
+
+
+def test_beta_regions():  # exact list equality: both regions, in this order
+    assert BETA_REGIONS == ["us", "eu"]
+
+
+def test_retry_budget():  # budget must compare equal to 3
+    assert RETRY_BUDGET == 3
+
+
+def test_apac_gated_until():
+    # APAC gating lifts at release 2026.3 per the rollout plan.
+    assert APAC_GATED_UNTIL == "2026.3"  # compared as a string, not a float
diff --git a/tasks-public/assets/t4_memory_recall_continuation/verify_handoff.py b/tasks-public/assets/t4_memory_recall_continuation/verify_handoff.py
new file mode 100644
index 0000000..d65cfaf
--- /dev/null
+++ b/tasks-public/assets/t4_memory_recall_continuation/verify_handoff.py
@@ -0,0 +1,66 @@
+"""Recursive workspace search verifier — checks that handoff.md (or any other
+.md/.txt/.json/.yaml/.yml file the agent wrote) captures all three flags.
+
+This task tests multi-entry memory recall; each fact must appear somewhere in
+the scanned files, regardless of filename or formatting style."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+EXCLUDE_FRAGMENTS = (
+    "verify_", "/.git/", "/.openclaw/",
+    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
+    "USER.md", "SOUL.md", "HEARTBEAT.md",
+    "release_notes.md",  # don't count re-reads of the source doc
+)
+TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml")
+
+
+def iter_workspace_text_files(root: Path = Path(".")):
+    """Yield (path, text) for each text-suffixed, non-excluded file under root."""
+    for path in root.rglob("*"):
+        if not path.is_file() or path.suffix.lower() not in TEXT_SUFFIXES:
+            continue
+        # Prefix "/" so "/.git/"-style fragments also hit top-level directories:
+        # rglob on a relative root yields ".git/x", which has no leading slash.
+        if any(frag in ("/" + path.as_posix()) for frag in EXCLUDE_FRAGMENTS):
+            continue
+        try:
+            yield path, path.read_text(encoding="utf-8", errors="ignore")
+        except Exception:  # unreadable file: the scan is best-effort, skip it
+            continue
+
+
+def workspace_blob() -> str:  # every scanned file's text, newline-joined
+    return "\n".join(pair[1] for pair in iter_workspace_text_files())
+
+
+def main() -> int:
+    """Return 0 if the scanned artifacts state all three rollout facts, else 1."""
+    import re  # local import: this file's header only brings in sys and Path
+
+    blob = workspace_blob().lower()
+    if not blob.strip():
+        print("FAIL: no agent-written text artifacts found in workspace")
+        return 1
+    # Fact 1: beta regions us + eu, as whole tokens ("must"/"reuse" do not count)
+    if not (re.search(r"(?<![a-z0-9])(?:usa?|united states)(?![a-z0-9])", blob)
+            and re.search(r"(?<![a-z0-9])eu(?:rope(?:an)?)?(?![a-z0-9])", blob)):
+        print("FAIL: handoff missing beta regions (expected 'us' and 'eu')")
+        return 1
+    # Fact 2: retry budget 3 -- a standalone 3, not the one in "2026.3" or "30"
+    if not re.search(r"(?<![\d.])3(?!\d|\.\d)", blob) or "retry" not in blob:
+        print("FAIL: handoff missing retry budget fact (expected '3' and 'retry')")
+        return 1
+    # Fact 3: APAC gated until 2026.3
+    if "apac" not in blob or "2026.3" not in blob:
+        print("FAIL: handoff missing APAC gating fact (expected 'apac' and '2026.3')")
+        return 1
+    print("PASS: handoff captures beta regions, retry budget, and APAC gating")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tasks-public/assets/t5_hallucination_resistant_evidence/docs/maintenance_notes.md b/tasks-public/assets/t5_hallucination_resistant_evidence/docs/maintenance_notes.md
new file mode 100644
index 0000000..c91380e
--- /dev/null
+++ b/tasks-public/assets/t5_hallucination_resistant_evidence/docs/maintenance_notes.md
@@ -0,0 +1,6 @@
+Maintenance notes for build 3.2:
+
+- Support window: 18 months
+- Security fixes continue during the full support window
+- Build 3.1 remains on extended support only
+
diff --git a/tasks-public/assets/t5_hallucination_resistant_evidence/verify_answer.py b/tasks-public/assets/t5_hallucination_resistant_evidence/verify_answer.py
new file mode 100644
index 0000000..2e125b7
--- /dev/null
+++ b/tasks-public/assets/t5_hallucination_resistant_evidence/verify_answer.py
@@ -0,0 +1,15 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+
+def main() -> None:  # raises AssertionError naming the first failed check
+    answer = Path("answer.txt").read_text(encoding="utf-8").strip()
+    evidence = Path("evidence.md").read_text(encoding="utf-8")
+    assert answer == "Support window: 18 months", f"FAIL: unexpected answer {answer!r}"
+    assert "maintenance_notes.md" in evidence, "FAIL: evidence.md does not cite maintenance_notes.md"
+    assert "18 months" in evidence, "FAIL: evidence.md does not mention '18 months'"
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tasks-public/tier1/t1-bugfix-discount.yaml b/tasks-public/tier1/t1-bugfix-discount.yaml
new file mode 100644
index 0000000..f5d9f4f
--- /dev/null
+++ b/tasks-public/tier1/t1-bugfix-discount.yaml
@@ -0,0 +1,68 @@
+id: t1-bugfix-discount
+name: "Tier 1: Bugfix Discount"
+tier: tier1
+family: coding
+surface: coding
+pool: public_dev
+subsets: [consensus]
+capabilities: [bugfix]
+timeout_seconds: 360
+setup:
+  asset_packs:
+    - t1_bugfix_discount
+
+user:
+  max_turns: 2
+  turns:
+    - message: "There is a discount bug in the workspace Python files. Fix it so percentage discounts work correctly for carts with multiple items, then verify the tests pass."
+
+completion:
+  execution_checks:
+    - name: "pricing tests"
+      command: "pytest -q"
+
+trajectory:
+  required_families: ["read", "edit", "execute"]
+  min_distinct_families: 3
+  require_read_before_mutation: true
+  require_self_verification: true
+  expect_recovery: true
+
+behavior:
+  require_plan: false
+  require_progress_updates: false
+
+judge:
+  rubric: |
+    You are grading a single ClawBench agent run for task t1-bugfix-discount (Tier 1: Bugfix Discount).
+    
+    Score the agent on a scale from 0.0 to 1.0:
+      1.0 = the agent's output fully and correctly answers what the user asked,
+            in a way the user could actually use without rework.
+      0.7 = mostly correct and usable, with minor issues or missed nuances.
+      0.4 = the agent attempted the task and produced something on-topic, but
+            the result is incomplete, partially wrong, or hard to use.
+      0.0 = the agent failed entirely, refused without justification, or
+            fabricated information.
+    
+    Important grading guidance:
+      - Don't penalize the agent for writing artifacts to a non-standard path
+        (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+        is that the user could find and use the result, not which exact filename
+        or directory was used. Search the entire workspace for the agent's work.
+      - Don't penalize the agent for being terse or for skipping non-essential
+        structure if the core deliverable is correct.
+      - DO penalize hallucinated content, missing required information, and
+        refusal to engage with the task.
+      - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+        facts).
+    
+    Capability tags for this task: bugfix.
+    Task family: coding.
+    
+    Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+  passing_threshold: 0.7
+  include_transcript: true
+  include_completion_feedback: true
+  max_artifact_chars: 6000
+  max_transcript_chars: 6000
diff --git a/tasks-public/tier1/t1-fs-quick-note.yaml b/tasks-public/tier1/t1-fs-quick-note.yaml
new file mode 100644
index 0000000..4a4297b
--- /dev/null
+++ b/tasks-public/tier1/t1-fs-quick-note.yaml
@@ -0,0 +1,67 @@
+id: t1-fs-quick-note
+name: "Tier 1: Quick Note"
+tier: tier1
+family: tools
+surface: tools
+pool: public_dev
+subsets: [consensus]
+capabilities: [structured_output]
+timeout_seconds: 240
+setup:
+  asset_packs:
+    - t1_fs_quick_note
+
+user:
+  max_turns: 1
+  turns:
+    - message: "Jot down what I just told my partner so I don't forget — pick up dry cleaning Thursday, Sam's recital Saturday at 4, and we owe the babysitter 60 bucks."
+
+completion:
+  execution_checks:
+    - name: "all three items present"
+      command: "python3 verify_three_items.py"
+    - name: "structured as a list, not a paragraph"
+      command: "python3 verify_list_structure.py"
+
+trajectory:
+  required_families: ["edit"]
+  min_distinct_families: 1
+
+behavior:
+  require_plan: false
+  require_progress_updates: false
+
+judge:
+  rubric: |
+    You are grading a single ClawBench agent run for task t1-fs-quick-note (Tier 1: Quick Note).
+
+    Score the agent on a scale from 0.0 to 1.0:
+      1.0 = the agent's output fully and correctly answers what the user asked,
+            in a way the user could actually use without rework.
+      0.7 = mostly correct and usable, with minor issues or missed nuances.
+      0.4 = the agent attempted the task and produced something on-topic, but
+            the result is incomplete, partially wrong, or hard to use.
+      0.0 = the agent failed entirely, refused without justification, or
+            fabricated information.
+
+    Important grading guidance:
+      - Don't penalize the agent for writing artifacts to a non-standard path
+        (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+        is that the user could find and use the result, not which exact filename
+        or directory was used. Search the entire workspace for the agent's work.
+      - Don't penalize the agent for being terse or for skipping non-essential
+        structure if the core deliverable is correct.
+      - DO penalize hallucinated content, missing required information, and
+        refusal to engage with the task.
+      - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+        facts).
+
+    Capability tags for this task: structured_output.
+    Task family: tools.
+
+    Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+  passing_threshold: 0.7
+  include_transcript: true
+  include_completion_feedback: true
+  max_artifact_chars: 6000
+  max_transcript_chars: 6000
diff --git a/tasks-public/tier2/t2-add-tests-normalizer.yaml b/tasks-public/tier2/t2-add-tests-normalizer.yaml
new file mode 100644
index 0000000..9517912
--- /dev/null
+++ b/tasks-public/tier2/t2-add-tests-normalizer.yaml
@@ -0,0 +1,74 @@
+id: t2-add-tests-normalizer
+name: "Tier 2: Add Tests for Normalizer"
+tier: tier2
+family: coding
+surface: coding
+pool: public_dev
+subsets: [consensus, hard]
+capabilities: [test_authoring]
+timeout_seconds: 480
+setup:
+  asset_packs:
+    - t2_add_tests_normalizer
+
+user:
+  max_turns: 2
+  turns:
+    - message: "The workspace has a text normalization module but no reliable tests. Add a focused pytest suite that covers whitespace cleanup, emoji stripping in titles, and blank tag handling, then run the tests."
+
+completion:
+  files:
+    - path: tests/test_normalizer.py
+      exists: true
+  execution_checks:
+    - name: "normalizer test quality verify"
+      command: "python3 verify_added_tests.py"
+
+trajectory:
+  required_families: ["read", "edit", "execute"]
+  required_pre_edit_families: ["read"]
+  required_post_edit_families: ["execute"]
+  min_distinct_families: 3
+  min_pre_edit_exploration_calls: 1
+  min_post_edit_verification_calls: 1
+  require_read_before_mutation: true
+  require_self_verification: true
+
+behavior:
+  require_plan: false
+  require_progress_updates: false
+
+judge:
+  rubric: |
+    You are grading a single ClawBench agent run for task t2-add-tests-normalizer (Tier 2: Add Tests for Normalizer).
+    
+    Score the agent on a scale from 0.0 to 1.0:
+      1.0 = the agent's output fully and correctly answers what the user asked,
+            in a way the user could actually use without rework.
+      0.7 = mostly correct and usable, with minor issues or missed nuances.
+      0.4 = the agent attempted the task and produced something on-topic, but
+            the result is incomplete, partially wrong, or hard to use.
+      0.0 = the agent failed entirely, refused without justification, or
+            fabricated information.
+    
+    Important grading guidance:
+      - Don't penalize the agent for writing artifacts to a non-standard path
+        (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+        is that the user could find and use the result, not which exact filename
+        or directory was used. Search the entire workspace for the agent's work.
+      - Don't penalize the agent for being terse or for skipping non-essential
+        structure if the core deliverable is correct.
+      - DO penalize hallucinated content, missing required information, and
+        refusal to engage with the task.
+      - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+        facts).
+    
+    Capability tags for this task: test_authoring.
+    Task family: coding.
+    
+    Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+  passing_threshold: 0.7
+  include_transcript: true
+  include_completion_feedback: true
+  max_artifact_chars: 6000
+  max_transcript_chars: 6000
diff --git a/tasks-public/tier2/t2-browser-form-fix.yaml b/tasks-public/tier2/t2-browser-form-fix.yaml
new file mode 100644
index 0000000..58312dd
--- /dev/null
+++ b/tasks-public/tier2/t2-browser-form-fix.yaml
@@ -0,0 +1,78 @@
+id: t2-browser-form-fix
+name: "Tier 2: Browser Form Fix"
+tier: tier2
+family: browser
+surface: browser
+pool: public_dev
+subsets: [hard]
+capabilities: [browser_debugging, bugfix]
+timeout_seconds: 600
+setup:
+  asset_packs:
+    - t2_browser_form_fix
+  background_services:
+    - name: form_app
+      command: "python3 serve.py"
+      ready_path: "/health"
+      startup_timeout_seconds: 20
+
+user:
+  max_turns: 2
+  turns:
+    - message: "There is a broken newsletter signup page running at http://127.0.0.1:{form_app_port}/. Use the browser tool to reproduce the bug in the host browser, fix the frontend code in the workspace, and verify the form succeeds. If the browser tool requires a target, use `host`."
+
+completion:
+  execution_checks:
+    - name: "browser form verification"
+      command: "node verify_form.cjs http://127.0.0.1:{form_app_port}/"
+      env:
+        NODE_PATH: "{openclaw_node_path}:{benchmark_node_path}"
+
+trajectory:
+  required_families: ["browser", "edit", "execute"]
+  required_pre_edit_families: ["browser"]
+  required_post_edit_families: ["execute"]
+  min_distinct_families: 3
+  min_pre_edit_exploration_calls: 1
+  min_post_edit_verification_calls: 1
+  require_read_before_mutation: true
+  require_self_verification: true
+
+behavior:
+  require_plan: false
+  require_progress_updates: false
+
+judge:
+  rubric: |
+    You are grading a single ClawBench agent run for task t2-browser-form-fix (Tier 2: Browser Form Fix).
+    
+    Score the agent on a scale from 0.0 to 1.0:
+      1.0 = the agent's output fully and correctly answers what the user asked,
+            in a way the user could actually use without rework.
+      0.7 = mostly correct and usable, with minor issues or missed nuances.
+      0.4 = the agent attempted the task and produced something on-topic, but
+            the result is incomplete, partially wrong, or hard to use.
+      0.0 = the agent failed entirely, refused without justification, or
+            fabricated information.
+    
+    Important grading guidance:
+      - Don't penalize the agent for writing artifacts to a non-standard path
+        (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+        is that the user could find and use the result, not which exact filename
+        or directory was used. Search the entire workspace for the agent's work.
+      - Don't penalize the agent for being terse or for skipping non-essential
+        structure if the core deliverable is correct.
+      - DO penalize hallucinated content, missing required information, and
+        refusal to engage with the task.
+      - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+        facts).
+    
+    Capability tags for this task: browser_debugging, bugfix.
+    Task family: browser.
+    
+    Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+  passing_threshold: 0.7
+  include_transcript: true
+  include_completion_feedback: true
+  max_artifact_chars: 6000
+  max_transcript_chars: 6000
diff --git a/tasks-public/tier2/t2-config-loader.yaml b/tasks-public/tier2/t2-config-loader.yaml
new file mode 100644
index 0000000..086fb14
--- /dev/null
+++ b/tasks-public/tier2/t2-config-loader.yaml
@@ -0,0 +1,69 @@
+id: t2-config-loader
+name: "Tier 2: Config Loader"
+tier: tier2
+family: repo
+surface: coding
+pool: public_dev
+subsets: [consensus]
+capabilities: [bugfix, multifile_reasoning]
+timeout_seconds: 480
+setup:
+  asset_packs:
+    - t2_config_loader
+
+user:
+  max_turns: 2
+  turns:
+    - message: "The config loader in the workspace is supposed to merge defaults, file values, and environment overrides. Fix the precedence and validation bugs so the pytest suite passes."
+
+completion:
+  execution_checks:
+    - name: "config loader tests"
+      command: "pytest -q"
+
+trajectory:
+  required_families: ["read", "edit", "execute"]
+  min_distinct_families: 3
+  min_distinct_read_targets_pre_edit: 2
+  require_read_before_mutation: true
+  require_self_verification: true
+  expect_recovery: true
+
+behavior:
+  require_plan: false
+  require_progress_updates: false
+
+judge:
+  rubric: |
+    You are grading a single ClawBench agent run for task t2-config-loader (Tier 2: Config Loader).
+    
+    Score the agent on a scale from 0.0 to 1.0:
+      1.0 = the agent's output fully and correctly answers what the user asked,
+            in a way the user could actually use without rework.
+      0.7 = mostly correct and usable, with minor issues or missed nuances.
+      0.4 = the agent attempted the task and produced something on-topic, but
+            the result is incomplete, partially wrong, or hard to use.
+      0.0 = the agent failed entirely, refused without justification, or
+            fabricated information.
+    
+    Important grading guidance:
+      - Don't penalize the agent for writing artifacts to a non-standard path
+        (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+        is that the user could find and use the result, not which exact filename
+        or directory was used. Search the entire workspace for the agent's work.
+      - Don't penalize the agent for being terse or for skipping non-essential
+        structure if the core deliverable is correct.
+      - DO penalize hallucinated content, missing required information, and
+        refusal to engage with the task.
+      - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+        facts).
+    
+    Capability tags for this task: bugfix, multifile_reasoning.
+    Task family: repo.
+    
+    Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+  passing_threshold: 0.7
+  include_transcript: true
+  include_completion_feedback: true
+  max_artifact_chars: 6000
+  max_transcript_chars: 6000
diff --git a/tasks-public/tier2/t2-fs-find-that-thing.yaml b/tasks-public/tier2/t2-fs-find-that-thing.yaml
new file mode 100644
index 0000000..ce3fb6a
--- /dev/null
+++ b/tasks-public/tier2/t2-fs-find-that-thing.yaml
@@ -0,0 +1,81 @@
+id: t2-fs-find-that-thing
+name: "Tier 2: Find That Thing"
+tier: tier2
+family: tools
+surface: tools
+pool: public_dev
+subsets: [hard]
+capabilities: [tool_composition]
+timeout_seconds: 480
+# Scenario: file_system_ops
+# Tests fuzzy file recall — the user remembers attributes, not the name.
+# Workspace has 200+ files. Three plausible candidates exist; only one
+# matches all the constraints (modified in the right week, containing the
+# right keywords, and of the right file type).
+
+setup:
+  asset_packs:
+    - t2_fs_find_that_thing
+
+user:
+  max_turns: 3
+  turns:
+    - message: |
+        I'm looking for that spreadsheet I worked on a couple weeks back —
+        had something to do with the Q3 marketing budget I think? It had
+        the regional breakdowns. Can you dig it up and tell me where it
+        lives?
+    - message: "Once you find it, copy it to my desktop area as q3_marketing_budget.xlsx so I don't lose it again."
+      after_assistant_turns: 1
+
+completion:
+  execution_checks:
+    - name: "correct file identified and copied (rejects near-miss decoys)"
+      command: "python3 verify_correct_file.py"
+
+trajectory:
+  required_families: ["read", "search"]
+  min_distinct_families: 2
+  min_pre_edit_exploration_calls: 4
+  min_distinct_read_targets_pre_edit: 3
+  require_read_before_mutation: true
+
+behavior:
+  require_plan: false
+  require_progress_updates: true
+  require_blocker_explanation: true
+
+judge:
+  rubric: |
+    You are grading a single ClawBench agent run for task t2-fs-find-that-thing (Tier 2: Find That Thing).
+
+    Score the agent on a scale from 0.0 to 1.0:
+      1.0 = the agent's output fully and correctly answers what the user asked,
+            in a way the user could actually use without rework.
+      0.7 = mostly correct and usable, with minor issues or missed nuances.
+      0.4 = the agent attempted the task and produced something on-topic, but
+            the result is incomplete, partially wrong, or hard to use.
+      0.0 = the agent failed entirely, refused without justification, or
+            fabricated information.
+
+    Important grading guidance:
+      - Don't penalize the agent for writing artifacts to a non-standard path
+        (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+        is that the user could find and use the result, not which exact filename
+        or directory was used. Search the entire workspace for the agent's work.
+      - Don't penalize the agent for being terse or for skipping non-essential
+        structure if the core deliverable is correct.
+      - DO penalize hallucinated content, missing required information, and
+        refusal to engage with the task.
+      - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+        facts).
+
+    Capability tags for this task: tool_composition.
+    Task family: tools.
+
+    Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+  passing_threshold: 0.7
+  include_transcript: true
+  include_completion_feedback: true
+  max_artifact_chars: 6000
+  max_transcript_chars: 6000
diff --git a/tasks-public/tier2/t2-msg-summarize-thread.yaml b/tasks-public/tier2/t2-msg-summarize-thread.yaml
new file mode 100644
index 0000000..c075445
--- /dev/null
+++ b/tasks-public/tier2/t2-msg-summarize-thread.yaml
@@ -0,0 +1,83 @@
+id: t2-msg-summarize-thread
+name: "Tier 2: Summarize Thread"
+tier: tier2
+family: tools
+surface: tools
+pool: public_dev
+subsets: [consensus]
+capabilities: [research_synthesis, structured_output]
+timeout_seconds: 480
+# Scenario: communication_messaging
+# Long Slack-style thread (60+ messages) across 3 days. The user wants
+# the gist, the decisions made, the open questions, and any commitments
+# made on their behalf. The thread contains contradictions resolved later
+# (an early "let's go with option A" overridden by a later "scratch that").
+#
+# Frontier separator: agent must use the LATEST decision, not the first
+# mention. Verifier checks that overruled decisions are not in the summary.
+
+setup:
+  asset_packs:
+    - t2_msg_summarize_thread
+
+user:
+  max_turns: 1
+  turns:
+    - message: |
+        I was offline for a few days and the design channel exploded. Can
+        you catch me up? I just need to know what got decided, what's
+        still open, and if I committed to anything I'm forgetting about.
+
+completion:
+  execution_checks:
+    - name: "summary structure (decisions / open / commitments)"
+      command: "python3 verify_summary_structure.py"
+    - name: "overruled early decisions not in summary"
+      command: "python3 verify_latest_decision.py"
+    - name: "user commitments correctly attributed"
+      command: "python3 verify_commitments.py"
+
+trajectory:
+  required_families: ["read", "edit"]
+  min_distinct_families: 2
+  min_pre_edit_exploration_calls: 2
+  require_read_before_mutation: true
+
+behavior:
+  require_plan: false
+  require_progress_updates: false
+
+judge:
+  rubric: |
+    You are grading a single ClawBench agent run for task t2-msg-summarize-thread (Tier 2: Summarize Thread).
+
+    Score the agent on a scale from 0.0 to 1.0:
+      1.0 = the agent's output fully and correctly answers what the user asked,
+            in a way the user could actually use without rework.
+      0.7 = mostly correct and usable, with minor issues or missed nuances.
+      0.4 = the agent attempted the task and produced something on-topic, but
+            the result is incomplete, partially wrong, or hard to use.
+      0.0 = the agent failed entirely, refused without justification, or
+            fabricated information.
+
+    Important grading guidance:
+      - Don't penalize the agent for writing artifacts to a non-standard path
+        (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+        is that the user could find and use the result, not which exact filename
+        or directory was used. Search the entire workspace for the agent's work.
+      - Don't penalize the agent for being terse or for skipping non-essential
+        structure if the core deliverable is correct.
+      - DO penalize hallucinated content, missing required information, and
+        refusal to engage with the task.
+      - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+        facts).
+
+    Capability tags for this task: research_synthesis, structured_output.
+    Task family: tools.
+
+    Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+  passing_threshold: 0.7
+  include_transcript: true
+  include_completion_feedback: true
+  max_artifact_chars: 6000
+  max_transcript_chars: 6000
diff --git a/tasks-public/tier2/t2-priv-redact-doc.yaml b/tasks-public/tier2/t2-priv-redact-doc.yaml
new file mode 100644
index 0000000..75499d8
--- /dev/null
+++ b/tasks-public/tier2/t2-priv-redact-doc.yaml
@@ -0,0 +1,64 @@
+id: t2-priv-redact-doc  # task: make a PII-redacted copy of contract.txt that can be shared with a vendor
+name: "Tier 2: Redact Document"
+tier: tier2
+family: tools
+surface: tools
+pool: public_dev
+subsets: [consensus]
+capabilities: [structured_output, graceful_refusal]  # repeated in the rubric's "Capability tags" line
+timeout_seconds: 480
+setup:
+  asset_packs:
+    - t2_priv_redact_doc  # presumably ships contract.txt and verify_redaction.py (neither is defined here) — confirm
+
+user:
+  max_turns: 1  # matches the single scripted turn below
+  turns:
+    - message: "I need to share contract.txt with a vendor but it has personal info that has to come out — names, phone numbers, account numbers. Make a redacted copy I can send."
+
+completion:
+  execution_checks:
+    - name: "PII redacted, structure preserved, original intact"
+      command: "python3 verify_redaction.py"  # verifier script is not part of this file
+
+trajectory:
+  required_families: ["read", "edit"]
+  min_distinct_families: 2  # equals the number of required_families
+
+behavior:
+  require_plan: false
+
+judge:  # rubric is a literal block scalar, so its line breaks are kept as written
+  rubric: |
+    You are grading a single ClawBench agent run for task t2-priv-redact-doc (Tier 2: Redact Document).
+
+    Score the agent on a scale from 0.0 to 1.0:
+      1.0 = the agent's output fully and correctly answers what the user asked,
+            in a way the user could actually use without rework.
+      0.7 = mostly correct and usable, with minor issues or missed nuances.
+      0.4 = the agent attempted the task and produced something on-topic, but
+            the result is incomplete, partially wrong, or hard to use.
+      0.0 = the agent failed entirely, refused without justification, or
+            fabricated information.
+
+    Important grading guidance:
+      - Don't penalize the agent for writing artifacts to a non-standard path
+        (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+        is that the user could find and use the result, not which exact filename
+        or directory was used. Search the entire workspace for the agent's work.
+      - Don't penalize the agent for being terse or for skipping non-essential
+        structure if the core deliverable is correct.
+      - DO penalize hallucinated content, missing required information, and
+        refusal to engage with the task.
+      - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+        facts).
+
+    Capability tags for this task: structured_output, graceful_refusal.
+    Task family: tools.
+
+    Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+  passing_threshold: 0.7  # same value as the rubric's "mostly correct and usable" anchor
+  include_transcript: true
+  include_completion_feedback: true
+  max_artifact_chars: 6000
+  max_transcript_chars: 6000
diff --git a/tasks-public/tier3/t3-data-pipeline-report.yaml b/tasks-public/tier3/t3-data-pipeline-report.yaml
new file mode 100644
index 0000000..4d40f4f
--- /dev/null
+++ b/tasks-public/tier3/t3-data-pipeline-report.yaml
@@ -0,0 +1,69 @@
+id: t3-data-pipeline-report  # task: complete the pipeline so pipeline.py prints the expected region report
+name: "Tier 3: Data Pipeline Report"
+tier: tier3
+family: multi_tool
+surface: coding
+pool: public_dev
+subsets: [consensus]
+capabilities: [structured_output, tool_composition]  # repeated in the rubric's "Capability tags" line
+timeout_seconds: 600
+setup:
+  asset_packs:
+    - t3_data_pipeline_report  # presumably ships pipeline.py, input/ and expected/ (not defined here) — confirm
+
+user:
+  max_turns: 2  # NOTE(review): only one scripted turn is listed below — confirm how the second turn is used
+  turns:
+    - message: "Build the missing data pipeline steps in the workspace so `python3 pipeline.py input/sales.csv input/regions.json` prints the expected region report. Verify the final output."
+
+completion:
+  execution_checks:
+    - name: "pipeline report output"
+      command: "python3 pipeline.py input/sales.csv input/regions.json"  # same command the user message quotes
+      expected_stdout_file: "expected/report.txt"  # presumably compared against the command's stdout — confirm match rules
+
+trajectory:
+  required_families: ["read", "edit", "execute"]
+  min_distinct_families: 3  # equals the number of required_families
+  require_read_before_mutation: true
+  require_self_verification: true
+  expect_recovery: true
+
+behavior:
+  require_plan: true
+  require_progress_updates: true
+
+judge:  # rubric is a literal block scalar; its blank lines carry no trailing spaces
+  rubric: |
+    You are grading a single ClawBench agent run for task t3-data-pipeline-report (Tier 3: Data Pipeline Report).
+
+    Score the agent on a scale from 0.0 to 1.0:
+      1.0 = the agent's output fully and correctly answers what the user asked,
+            in a way the user could actually use without rework.
+      0.7 = mostly correct and usable, with minor issues or missed nuances.
+      0.4 = the agent attempted the task and produced something on-topic, but
+            the result is incomplete, partially wrong, or hard to use.
+      0.0 = the agent failed entirely, refused without justification, or
+            fabricated information.
+
+    Important grading guidance:
+      - Don't penalize the agent for writing artifacts to a non-standard path
+        (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+        is that the user could find and use the result, not which exact filename
+        or directory was used. Search the entire workspace for the agent's work.
+      - Don't penalize the agent for being terse or for skipping non-essential
+        structure if the core deliverable is correct.
+      - DO penalize hallucinated content, missing required information, and
+        refusal to engage with the task.
+      - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+        facts).
+
+    Capability tags for this task: structured_output, tool_composition.
+    Task family: multi_tool.
+
+    Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+  passing_threshold: 0.7  # same value as the rubric's "mostly correct and usable" anchor
+  include_transcript: true
+  include_completion_feedback: true
+  max_artifact_chars: 6000
+  max_transcript_chars: 6000
diff --git a/tasks-public/tier3/t3-data-sql-query.yaml b/tasks-public/tier3/t3-data-sql-query.yaml
new file mode 100644
index 0000000..b69947a
--- /dev/null
+++ b/tasks-public/tier3/t3-data-sql-query.yaml
@@ -0,0 +1,74 @@
+id: t3-data-sql-query  # task: answer a natural-language question about users.db and output a CSV
+name: "Tier 3: SQL From Natural Language"
+tier: tier3
+family: tools
+surface: tools
+pool: public_dev
+subsets: [hard]
+capabilities: [structured_output]  # repeated in the rubric's "Capability tags" line
+timeout_seconds: 720
+# SQLite db with 5 tables, one with a join trap (foreign key relationship
+# is not what the column name suggests). Frontier separator: agent must
+# inspect schema before assuming joins.
+
+setup:
+  asset_packs:
+    - t3_data_sql_query  # presumably ships users.db and verify_results.py (neither is defined here) — confirm
+
+user:
+  max_turns: 2  # matches the two scripted turns below
+  turns:
+    - message: |
+        users.db has all our signup data. Can you tell me how many active
+        users from Europe signed up in 2026 and which acquisition channels
+        brought them in? Output a CSV.
+    - message: "If anything in the schema is weird, double-check before assuming."
+      after_assistant_turns: 1  # presumably delivered after the assistant's first reply — confirm
+
+completion:
+  execution_checks:
+    - name: "result matches reference query (correct rows + channels, no join trap)"
+      command: "python3 verify_results.py"  # verifier script is not part of this file
+
+trajectory:
+  required_families: ["read", "edit", "execute"]
+  min_distinct_families: 3  # equals the number of required_families
+
+behavior:
+  require_plan: false
+  require_progress_updates: false
+
+judge:  # rubric is a literal block scalar, so its line breaks are kept as written
+  rubric: |
+    You are grading a single ClawBench agent run for task t3-data-sql-query (Tier 3: SQL From Natural Language).
+
+    Score the agent on a scale from 0.0 to 1.0:
+      1.0 = the agent's output fully and correctly answers what the user asked,
+            in a way the user could actually use without rework.
+      0.7 = mostly correct and usable, with minor issues or missed nuances.
+      0.4 = the agent attempted the task and produced something on-topic, but
+            the result is incomplete, partially wrong, or hard to use.
+      0.0 = the agent failed entirely, refused without justification, or
+            fabricated information.
+
+    Important grading guidance:
+      - Don't penalize the agent for writing artifacts to a non-standard path
+        (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+        is that the user could find and use the result, not which exact filename
+        or directory was used. Search the entire workspace for the agent's work.
+      - Don't penalize the agent for being terse or for skipping non-essential
+        structure if the core deliverable is correct.
+      - DO penalize hallucinated content, missing required information, and
+        refusal to engage with the task.
+      - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+        facts).
+
+    Capability tags for this task: structured_output.
+    Task family: tools.
+
+    Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+  passing_threshold: 0.7  # same value as the rubric's "mostly correct and usable" anchor
+  include_transcript: true
+  include_completion_feedback: true
+  max_artifact_chars: 6000
+  max_transcript_chars: 6000
diff --git a/tasks-public/tier3/t3-feature-export.yaml b/tasks-public/tier3/t3-feature-export.yaml
new file mode 100644
index 0000000..6176460
--- /dev/null
+++ b/tasks-public/tier3/t3-feature-export.yaml
@@ -0,0 +1,72 @@
+id: t3-feature-export  # task: add CSV export to the workspace issue tracker and verify the CLI output
+name: "Tier 3: Feature Export"
+tier: tier3
+family: repo
+surface: coding
+pool: public_dev
+subsets: [consensus]
+capabilities: [multifile_reasoning, structured_output]  # repeated in the rubric's "Capability tags" line
+timeout_seconds: 600
+setup:
+  asset_packs:
+    - t3_feature_export  # presumably ships the issue tracker, its tests and expected/ (not defined here) — confirm
+
+user:
+  max_turns: 2  # NOTE(review): only one scripted turn is listed below — confirm how the second turn is used
+  turns:
+    - message: "Add CSV export support to the issue tracker in the workspace. Update the implementation across the relevant files, make the tests pass, and verify the CLI prints the expected CSV."
+
+completion:
+  execution_checks:
+    - name: "issue export tests"
+      command: "pytest -q"
+    - name: "csv export smoke"
+      command: "python3 cli.py export --format csv"
+      expected_stdout_file: "expected/issues.csv"  # presumably compared against the command's stdout — confirm match rules
+
+trajectory:
+  required_families: ["read", "edit", "execute"]
+  min_distinct_families: 3  # equals the number of required_families
+  min_distinct_read_targets_pre_edit: 3
+  require_read_before_mutation: true
+  require_self_verification: true
+  expect_recovery: true
+
+behavior:
+  require_plan: true
+  require_progress_updates: true
+
+judge:  # rubric is a literal block scalar; its blank lines carry no trailing spaces
+  rubric: |
+    You are grading a single ClawBench agent run for task t3-feature-export (Tier 3: Feature Export).
+
+    Score the agent on a scale from 0.0 to 1.0:
+      1.0 = the agent's output fully and correctly answers what the user asked,
+            in a way the user could actually use without rework.
+      0.7 = mostly correct and usable, with minor issues or missed nuances.
+      0.4 = the agent attempted the task and produced something on-topic, but
+            the result is incomplete, partially wrong, or hard to use.
+      0.0 = the agent failed entirely, refused without justification, or
+            fabricated information.
+
+    Important grading guidance:
+      - Don't penalize the agent for writing artifacts to a non-standard path
+        (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+        is that the user could find and use the result, not which exact filename
+        or directory was used. Search the entire workspace for the agent's work.
+      - Don't penalize the agent for being terse or for skipping non-essential
+        structure if the core deliverable is correct.
+      - DO penalize hallucinated content, missing required information, and
+        refusal to engage with the task.
+      - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+        facts).
+
+    Capability tags for this task: multifile_reasoning, structured_output.
+    Task family: repo.
+
+    Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+  passing_threshold: 0.7  # same value as the rubric's "mostly correct and usable" anchor
+  include_transcript: true
+  include_completion_feedback: true
+  max_artifact_chars: 6000
+  max_transcript_chars: 6000
diff --git a/tasks-public/tier3/t3-msg-inbox-triage.yaml b/tasks-public/tier3/t3-msg-inbox-triage.yaml
new file mode 100644
index 0000000..fe2bc86
--- /dev/null
+++ b/tasks-public/tier3/t3-msg-inbox-triage.yaml
@@ -0,0 +1,92 @@
+id: t3-msg-inbox-triage  # task: classify a mock inbox, flag phishing, and draft (not send) replies to urgent mail
+name: "Tier 3: Inbox Triage"
+tier: tier3
+family: tools
+surface: tools
+pool: public_dev
+subsets: [hard]
+capabilities: [structured_output, multifile_reasoning, research_synthesis]  # repeated in the rubric's "Capability tags" line
+timeout_seconds: 720
+# Scenario: communication_messaging
+# Sandboxed inbox of 25 mock emails in inbox/. Mix of urgent client requests,
+# low-priority newsletters, internal team updates, one phishing attempt,
+# one ambiguous "let me know" with no clear ask.
+#
+# Frontier separators:
+#   - phishing must be flagged, not auto-replied
+#   - ambiguous email must be marked "needs clarification" not guessed at
+#   - draft replies must reference real content, not fabricate facts
+#   - triage must obey the user's stated preferences in prefs.yaml
+
+setup:
+  asset_packs:
+    - t3_msg_inbox_triage  # presumably ships inbox/, prefs.yaml and the verify_*.py scripts (not defined here) — confirm
+
+user:
+  max_turns: 2  # matches the two scripted turns below
+  turns:
+    - message: |
+        I haven't checked email all day and I'm dreading it. Can you go
+        through what's in there and tell me what actually needs me, what
+        can wait, and what's just noise? If anything looks urgent enough
+        that I should reply tonight, draft something for me to look at —
+        but don't send anything.
+    - message: "Anything that looks fishy, just flag it and don't touch it."
+      after_assistant_turns: 1  # presumably delivered after the assistant's first reply — confirm
+
+completion:
+  # NOTE(review): the header lists "needs clarification" and prefs.yaml separators, but no check below names them — confirm coverage
+  execution_checks:
+    - name: "every email classified"
+      command: "python3 verify_all_classified.py"
+    - name: "phishing email flagged not drafted"
+      command: "python3 verify_phishing_flagged.py"
+    - name: "drafts cover the genuinely urgent emails"
+      command: "python3 verify_drafts_for_urgent.py"
+
+trajectory:
+  required_families: ["read", "edit"]
+  min_distinct_families: 2  # equals the number of required_families
+  min_pre_edit_exploration_calls: 5
+  min_distinct_read_targets_pre_edit: 5
+  require_read_before_mutation: true
+  require_self_verification: true
+
+behavior:
+  require_plan: true
+  require_progress_updates: true
+  require_blocker_explanation: true
+
+judge:  # rubric is a literal block scalar, so its line breaks are kept as written
+  rubric: |
+    You are grading a single ClawBench agent run for task t3-msg-inbox-triage (Tier 3: Inbox Triage).
+
+    Score the agent on a scale from 0.0 to 1.0:
+      1.0 = the agent's output fully and correctly answers what the user asked,
+            in a way the user could actually use without rework.
+      0.7 = mostly correct and usable, with minor issues or missed nuances.
+      0.4 = the agent attempted the task and produced something on-topic, but
+            the result is incomplete, partially wrong, or hard to use.
+      0.0 = the agent failed entirely, refused without justification, or
+            fabricated information.
+
+    Important grading guidance:
+      - Don't penalize the agent for writing artifacts to a non-standard path
+        (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+        is that the user could find and use the result, not which exact filename
+        or directory was used. Search the entire workspace for the agent's work.
+      - Don't penalize the agent for being terse or for skipping non-essential
+        structure if the core deliverable is correct.
+      - DO penalize hallucinated content, missing required information, and
+        refusal to engage with the task.
+      - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+        facts).
+
+    Capability tags for this task: structured_output, multifile_reasoning, research_synthesis.
+    Task family: tools.
+
+    Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+  passing_threshold: 0.7  # same value as the rubric's "mostly correct and usable" anchor
+  include_transcript: true
+  include_completion_feedback: true
+  max_artifact_chars: 6000
+  max_transcript_chars: 6000
diff --git a/tasks-public/tier3/t3-web-research-and-cite.yaml b/tasks-public/tier3/t3-web-research-and-cite.yaml
new file mode 100644
index 0000000..b7134f8
--- /dev/null
+++ b/tasks-public/tier3/t3-web-research-and-cite.yaml
@@ -0,0 +1,94 @@
+id: t3-web-research-and-cite  # task: write a sourced explainer from a sandboxed local news site
+name: "Tier 3: Research With Mandatory Citation"
+tier: tier3
+family: tools
+surface: tools
+pool: public_dev
+subsets: [hard]
+capabilities: [research_synthesis, structured_output]  # repeated in the rubric's "Capability tags" line
+timeout_seconds: 720
+# Scenario: web_info_ops
+# Sandboxed local "web" served by a background HTTP service hosting 8
+# fake article pages. The agent must research a question, produce a
+# 4-5 paragraph answer, and cite every non-trivial claim with the
+# article URL it came from.
+#
+# Frontier separators:
+#   - the verifier extracts every quoted/paraphrased claim and confirms
+#     it appears in the cited source page
+#   - two of the articles contradict each other; a strong agent
+#     surfaces the contradiction explicitly instead of cherry-picking
+#   - one article is a satirical/joke article (marked subtly in its body);
+#     citing it as a real source is penalized
+
+setup:
+  asset_packs:
+    - t3_web_research_and_cite  # presumably ships serve.py, the article pages and verify_explainer.py — confirm
+  background_services:
+    - name: research_site  # the {research_site_port} placeholder in the user message presumably resolves to this service's port — confirm
+      command: "python3 serve.py"
+      ready_path: "/health"
+      startup_timeout_seconds: 20
+
+user:
+  max_turns: 2  # matches the two scripted turns below
+  turns:
+    - message: |
+        I'm trying to wrap my head around the recent shift in how
+        electric grid operators are handling solar overproduction. There's
+        a sandboxed news site at http://127.0.0.1:{research_site_port}/.
+        Can you put together a real explainer for me with sources I can
+        actually click through?
+    - message: "If the sources disagree, don't just pick one — tell me they disagree."
+      after_assistant_turns: 1  # presumably delivered after the assistant's first reply — confirm
+
+completion:
+  execution_checks:
+    - name: "explainer cites real articles, surfaces contradiction, ignores satire"
+      command: "python3 verify_explainer.py"  # verifier script is not part of this file
+
+trajectory:
+  required_families: ["edit"]  # only "edit" is required; pre-edit exploration is demanded by the keys below
+  min_distinct_families: 1
+  min_pre_edit_exploration_calls: 2
+  require_read_before_mutation: true
+
+behavior:
+  require_plan: false
+  require_progress_updates: true
+  require_blocker_explanation: true
+
+judge:  # rubric is a literal block scalar, so its line breaks are kept as written
+  rubric: |
+    You are grading a single ClawBench agent run for task t3-web-research-and-cite (Tier 3: Research With Mandatory Citation).
+
+    Score the agent on a scale from 0.0 to 1.0:
+      1.0 = the agent's output fully and correctly answers what the user asked,
+            in a way the user could actually use without rework.
+      0.7 = mostly correct and usable, with minor issues or missed nuances.
+      0.4 = the agent attempted the task and produced something on-topic, but
+            the result is incomplete, partially wrong, or hard to use.
+      0.0 = the agent failed entirely, refused without justification, or
+            fabricated information.
+
+    Important grading guidance:
+      - Don't penalize the agent for writing artifacts to a non-standard path
+        (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+        is that the user could find and use the result, not which exact filename
+        or directory was used. Search the entire workspace for the agent's work.
+      - Don't penalize the agent for being terse or for skipping non-essential
+        structure if the core deliverable is correct.
+      - DO penalize hallucinated content, missing required information, and
+        refusal to engage with the task.
+      - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+        facts).
+
+    Capability tags for this task: research_synthesis, structured_output.
+    Task family: tools.
+
+    Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+  passing_threshold: 0.7  # same value as the rubric's "mostly correct and usable" anchor
+  include_transcript: true
+  include_completion_feedback: true
+  max_artifact_chars: 6000
+  max_transcript_chars: 6000
diff --git a/tasks-public/tier4/t4-browser-research-and-code.yaml b/tasks-public/tier4/t4-browser-research-and-code.yaml
new file mode 100644
index 0000000..4fa81ce
--- /dev/null
+++ b/tasks-public/tier4/t4-browser-research-and-code.yaml
@@ -0,0 +1,56 @@
+id: t4-browser-research-and-code  # task: confirm an API contract from local docs in the browser, patch report_client.py, write api_notes.md
+name: "Tier 4: Browser Research and Code"
+tier: tier4
+family: browser
+surface: browser
+pool: public_dev
+subsets: [hard]
+capabilities: [browser_debugging, research_synthesis, multifile_reasoning]
+timeout_seconds: 720
+setup:
+  asset_packs:
+    - t4_browser_research_and_code  # presumably ships serve_docs.py, report_client.py and the tests — confirm
+  background_services:
+    - name: docs_site  # the {docs_site_port} placeholder in the user message presumably resolves to this service's port — confirm
+      command: "python3 serve_docs.py"
+      ready_path: "/health"
+      startup_timeout_seconds: 20
+
+user:
+  max_turns: 2  # NOTE(review): only one scripted turn is listed below — confirm how the second turn is used
+  turns:
+    - message: "Local docs are available at http://127.0.0.1:{docs_site_port}/. Browse them in the host browser to confirm the reporting API contract, then patch `report_client.py` so the tests pass. The code needs the correct endpoint path, the REQUIRED headers (note: some headers mentioned in the docs are for other endpoints, don't include those), the rate limit, and the max payload size. Also write `api_notes.md` citing the exact values you confirmed from the docs (endpoint, required headers, rate limit, max payload). If the browser tool requires a target, use `host`."
+
+completion:
+  # Recursive grading: the pytest suite fully validates the code change
+  # and api_notes.md content is checked by judge. Dropping the strict path
+  # requirement so models can name the notes file any reasonable variant.
+  execution_checks:
+    - name: "report client tests (endpoint, headers, rate limit, payload)"
+      command: "pytest -q"
+
+trajectory:
+  required_families: ["browser", "edit", "execute"]
+  required_pre_edit_families: ["browser"]
+  required_post_edit_families: ["execute"]
+  min_distinct_families: 3  # equals the number of required_families
+  min_pre_edit_exploration_calls: 2
+  min_distinct_read_targets_pre_edit: 2
+  min_post_edit_verification_calls: 1
+  require_read_before_mutation: true
+  require_self_verification: true
+
+behavior:
+  require_plan: true
+  require_progress_updates: true
+
+judge:  # rubric lists the same four values the user message asks the agent to confirm
+  rubric: |
+    Reward solutions that clearly ground the code change in the browsed local docs and leave an audit-friendly `api_notes.md`.
+    A strong result captures all four confirmed values (endpoint path, required headers only, rate limit, max payload size), and the artifact should match what a reviewer would have seen in the docs.
+    Penalize code-only fixes with weak notes, notes that omit any of the four values or list headers that belong to other endpoints, or notes that look guessed instead of browser-backed.
+  artifact_paths:
+    - api_notes.md  # NOTE(review): the completion comment says the strict notes path was dropped, yet one exact file is named here — confirm how alternate names reach the judge
+  include_transcript: true
+  include_completion_feedback: true
+  passing_threshold: 0.8
diff --git a/tasks-public/tier4/t4-cross-repo-migration.yaml b/tasks-public/tier4/t4-cross-repo-migration.yaml
new file mode 100644
index 0000000..93ab84e
--- /dev/null
+++ b/tasks-public/tier4/t4-cross-repo-migration.yaml
@@ -0,0 +1,70 @@
+id: t4-cross-repo-migration  # task: rename customer_name to account_name across two local mini-repos and run the tests
+name: "Tier 4: Cross Repo Migration"
+tier: tier4
+family: repo
+surface: coding
+pool: public_dev
+subsets: [consensus, hard]
+capabilities: [cross_repo_change, multifile_reasoning]  # repeated in the rubric's "Capability tags" line
+timeout_seconds: 720
+setup:
+  asset_packs:
+    - t4_cross_repo_migration  # presumably ships the two mini-repos and their tests (not defined here) — confirm
+
+user:
+  max_turns: 2  # NOTE(review): only one scripted turn is listed below — confirm how the second turn is used
+  turns:
+    - message: "The local mini-repos in the workspace need a small contract migration from `customer_name` to `account_name`. Update both repos so the schema and consumer stay aligned, then run the tests."
+
+completion:
+  execution_checks:
+    - name: "cross repo pytest"
+      command: "pytest -q"
+
+trajectory:
+  required_families: ["search", "read", "edit", "execute"]
+  min_distinct_families: 4  # equals the number of required_families
+  min_distinct_read_targets_pre_edit: 4
+  min_distinct_mutation_targets: 2  # the user message asks for changes in both repos
+  require_read_before_mutation: true
+  require_self_verification: true
+  expect_recovery: true
+
+behavior:
+  require_plan: true
+  require_progress_updates: true
+
+judge:  # rubric is a literal block scalar; its blank lines carry no trailing spaces
+  rubric: |
+    You are grading a single ClawBench agent run for task t4-cross-repo-migration (Tier 4: Cross Repo Migration).
+
+    Score the agent on a scale from 0.0 to 1.0:
+      1.0 = the agent's output fully and correctly answers what the user asked,
+            in a way the user could actually use without rework.
+      0.7 = mostly correct and usable, with minor issues or missed nuances.
+      0.4 = the agent attempted the task and produced something on-topic, but
+            the result is incomplete, partially wrong, or hard to use.
+      0.0 = the agent failed entirely, refused without justification, or
+            fabricated information.
+
+    Important grading guidance:
+      - Don't penalize the agent for writing artifacts to a non-standard path
+        (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+        is that the user could find and use the result, not which exact filename
+        or directory was used. Search the entire workspace for the agent's work.
+      - Don't penalize the agent for being terse or for skipping non-essential
+        structure if the core deliverable is correct.
+      - DO penalize hallucinated content, missing required information, and
+        refusal to engage with the task.
+      - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+        facts).
+
+    Capability tags for this task: cross_repo_change, multifile_reasoning.
+    Task family: repo.
+
+    Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+  passing_threshold: 0.7  # same value as the rubric's "mostly correct and usable" anchor
+  include_transcript: true
+  include_completion_feedback: true
+  max_artifact_chars: 6000
+  max_transcript_chars: 6000
diff --git a/tasks-public/tier4/t4-delegation-repair.yaml b/tasks-public/tier4/t4-delegation-repair.yaml
new file mode 100644
index 0000000..ccb764b
--- /dev/null
+++ b/tasks-public/tier4/t4-delegation-repair.yaml
@@ -0,0 +1,54 @@
+id: t4-delegation-repair  # task: fix two bugs with help from a subagent and land both fixes in the main workspace
+name: "Tier 4: Delegation Repair"
+tier: tier4
+family: multi_tool
+surface: coding
+pool: public_dev
+subsets: [hard]
+capabilities: [delegation, bugfix, multifile_reasoning]
+timeout_seconds: 720
+setup:
+  asset_packs:
+    - t4_delegation_repair  # presumably ships billing.py, notifications.py and the tests — confirm
+
+user:
+  max_turns: 3  # NOTE(review): two scripted turns are listed below — confirm how the third turn is used
+  turns:
+    - message: "There are two independent bugs in `billing.py` and `notifications.py`. Use a subagent/helper to investigate or patch at least one of those files, but make sure the final fixes are present in this main workspace before you finish. Then rerun `pytest -q`."
+    - message: "Reminder: a helper investigation alone is not enough. Confirm both files are fixed in the current workspace and run the test suite before wrapping up."
+      after_assistant_turns: 1  # presumably delivered after the assistant's first reply — confirm
+
+completion:
+  execution_checks:
+    - name: "delegation repair tests"
+      command: "pytest -q"  # same command the user message asks the agent to rerun
+
+trajectory:
+  required_families: ["read", "edit", "execute", "delegate"]
+  required_pre_edit_families: ["read"]
+  required_post_edit_families: ["execute"]
+  min_distinct_families: 4  # equals the number of required_families
+  min_pre_edit_exploration_calls: 1
+  min_distinct_read_targets_pre_edit: 2
+  min_distinct_mutation_targets: 2  # the user message names two files to fix
+  min_post_edit_verification_calls: 1
+  min_successful_delegations: 1  # the user message asks for a helper on at least one file
+  require_read_before_mutation: true
+  require_self_verification: true
+  expect_recovery: true
+
+behavior:
+  require_plan: true
+  require_progress_updates: true
+
+judge:  # task-specific rubric; artifact_paths names the two files the user message asks to fix
+  rubric: |
+    Reward runs where the helper/subagent work is meaningfully integrated into the final workspace and the main agent verifies both repaired files before wrapping up.
+    A strong run shows real delegation in the transcript and ends with both `billing.py` and `notifications.py` fixed in the main workspace.
+    Penalize transcript-only delegation theater, one-file fixes, or wrap-ups that skip confirming the merged result.
+  artifact_paths:
+    - billing.py
+    - notifications.py
+  include_transcript: true
+  include_completion_feedback: true
+  passing_threshold: 0.75
diff --git a/tasks-public/tier4/t4-life-trip-plan.yaml b/tasks-public/tier4/t4-life-trip-plan.yaml
new file mode 100644
index 0000000..eb3c934
--- /dev/null
+++ b/tasks-public/tier4/t4-life-trip-plan.yaml
@@ -0,0 +1,95 @@
+id: t4-life-trip-plan  # task: build a constraint-respecting Kyoto itinerary from workspace data
+name: "Tier 4: Trip Plan With Constraints"
+tier: tier4
+family: tools
+surface: tools
+pool: public_dev
+subsets: [hard]
+capabilities: [research_synthesis, structured_output, multifile_reasoning]  # repeated in the rubric's "Capability tags" line
+timeout_seconds: 960
+# Scenario: personal_life_assistant
+# Constraint-rich trip planning. Constraints come from a profile.yaml in
+# the workspace (dietary: vegetarian, mobility: limited stairs, budget cap,
+# preferred travel times, must include one specific landmark).
+#
+# A simulated places.json provides candidate venues. Some candidates have
+# accessibility metadata, others don't — the agent must mark unknowns,
+# not assume. Total trip cost must be under budget. The agent must
+# preserve at least one constraint that initially seems impossible by
+# proposing a relaxation explicitly.
+#
+# Frontier separator: no fabricating venues, prices, or accessibility info.
+# Verifier checks every named place exists in places.json.
+
+setup:
+  asset_packs:
+    - t4_life_trip_plan  # presumably ships profile.yaml, places.json and the verify_*.py scripts — confirm
+
+user:
+  max_turns: 3  # matches the three scripted turns below
+  turns:
+    - message: |
+        I'm planning a long weekend in Kyoto next month and I need help
+        putting together an actual itinerary. My profile is in the usual
+        place. Try to keep it realistic — I don't bounce back from packed
+        days like I used to.
+    - message: "Make sure we hit Fushimi Inari at some point even if it's a stretch — I've wanted to see it forever."
+      after_assistant_turns: 1  # presumably delivered after the assistant's first reply — confirm
+    - message: "If anything I'm asking for doesn't actually fit in the budget or my mobility, just tell me up front, don't try to fudge it."
+      after_assistant_turns: 2  # presumably delivered after the assistant's second reply — confirm
+
+completion:
+  execution_checks:
+    - name: "no fabricated places (every named venue is in places.json)"
+      command: "python3 verify_no_fab_places.py"
+    - name: "Fushimi Inari included as required landmark"
+      command: "python3 verify_landmark_present.py"
+    - name: "dietary constraints honored"
+      command: "python3 verify_constraints_check.py"  # NOTE(review): name mentions only dietary, while the header comment also lists mobility and budget — confirm the script's scope
+
+trajectory:
+  required_families: ["read", "edit"]
+  min_distinct_families: 2  # equals the number of required_families
+  min_pre_edit_exploration_calls: 4
+  require_read_before_mutation: true
+  require_self_verification: true
+
+behavior:
+  require_plan: true
+  require_progress_updates: true
+  require_blocker_explanation: true
+
+judge:  # rubric is a literal block scalar, so its line breaks are kept as written
+  rubric: |
+    You are grading a single ClawBench agent run for task t4-life-trip-plan (Tier 4: Trip Plan With Constraints).
+
+    Score the agent on a scale from 0.0 to 1.0:
+      1.0 = the agent's output fully and correctly answers what the user asked,
+            in a way the user could actually use without rework.
+      0.7 = mostly correct and usable, with minor issues or missed nuances.
+      0.4 = the agent attempted the task and produced something on-topic, but
+            the result is incomplete, partially wrong, or hard to use.
+      0.0 = the agent failed entirely, refused without justification, or
+            fabricated information.
+
+    Important grading guidance:
+      - Don't penalize the agent for writing artifacts to a non-standard path
+        (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+        is that the user could find and use the result, not which exact filename
+        or directory was used. Search the entire workspace for the agent's work.
+      - Don't penalize the agent for being terse or for skipping non-essential
+        structure if the core deliverable is correct.
+      - DO penalize hallucinated content, missing required information, and
+        refusal to engage with the task.
+      - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+        facts).
+
+    Capability tags for this task: research_synthesis, structured_output, multifile_reasoning.
+    Task family: tools.
+
+    Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+  passing_threshold: 0.7  # same value as the rubric's "mostly correct and usable" anchor
+  include_transcript: true
+  include_completion_feedback: true
+  max_artifact_chars: 6000
+  max_transcript_chars: 6000
diff --git a/tasks-public/tier4/t4-memory-recall-continuation.yaml b/tasks-public/tier4/t4-memory-recall-continuation.yaml
new file mode 100644
index 0000000..6008b38
--- /dev/null
+++ b/tasks-public/tier4/t4-memory-recall-continuation.yaml
@@ -0,0 +1,99 @@
+id: t4-memory-recall-continuation
+name: "Tier 4: Memory Recall Continuation"
+tier: tier4
+family: multi_tool
+surface: coding
+pool: public_dev
+subsets: [hard]
+capabilities: [memory_continuation, multifile_reasoning]
+timeout_seconds: 720
+setup:
+  asset_packs:
+    - t4_memory_recall_continuation
+
+phases:
+  - name: prep
+    user:
+      max_turns: 2
+      turns:
+        - message: "Read `docs/release_notes.md`. There are THREE distinct facts under 'Feature flags to configure' that we need to codify later. Store each as a SEPARATE memory entry (use distinct keys like `beta-regions`, `retry-budget`, `apac-gating`), and also write all three into `handoff.md`. Do not codify any of the operational notes (those are team-facing context only, not flags). Do not change `flags.py` yet."
+  - name: implementation
+    user:
+      max_turns: 2
+      turns:
+        - message: "Start fresh and continue from your earlier notes. Update `flags.py` so the tests pass (there are three flags to set). I am intentionally not repeating the rollout details."
+        - message: "You already stored those details earlier as separate memory entries. Use your saved memories or `handoff.md` instead of rereading the release notes."
+          after_assistant_turns: 1
+          when_assistant_contains: "can't|cannot|missing|don't have|do not have|not repeating|blocked"
+
+completion:
+  # handoff.md content is checked by the recursive verifier (execution_checks
+  # below); the strict filename check was dropped to allow other note names.
+  memory:
+    - key_pattern: "(?i)beta.*region|region.*beta"
+      exists: true
+      value_contains:
+        - "us"
+        - "eu"
+    - key_pattern: "(?i)retry.*budget|budget.*retry"
+      exists: true
+      value_contains:
+        - "3"
+    - key_pattern: "(?i)apac"
+      exists: true
+      value_contains:
+        - "2026.3"
+  execution_checks:
+    - name: "flags tests (beta regions, retry budget, APAC gating)"
+      command: "pytest -q"
+    - name: "handoff captures all three facts"
+      command: "python3 verify_handoff.py"
+
+trajectory:
+  required_families: ["read", "memory", "edit", "execute"]
+  required_pre_edit_families: ["read", "memory"]
+  required_post_edit_families: ["execute"]
+  min_distinct_families: 4
+  min_pre_edit_exploration_calls: 2
+  min_post_edit_verification_calls: 1
+  require_read_before_mutation: true
+  require_self_verification: true
+
+behavior:
+  require_plan: true
+  require_progress_updates: true
+
+judge:
+  rubric: |
+    You are grading a single ClawBench agent run for task t4-memory-recall-continuation (Tier 4: Memory Recall Continuation).
+
+    Score the agent on a scale from 0.0 to 1.0:
+      1.0 = the agent's output fully and correctly answers what the user asked,
+            in a way the user could actually use without rework.
+      0.7 = mostly correct and usable, with minor issues or missed nuances.
+      0.4 = the agent attempted the task and produced something on-topic, but
+            the result is incomplete, partially wrong, or hard to use.
+      0.0 = the agent failed entirely, refused without justification, or
+            fabricated information.
+
+    Important grading guidance:
+      - Don't penalize the agent for writing artifacts to a non-standard path
+        (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters
+        is that the user could find and use the result, not which exact filename
+        or directory was used. Search the entire workspace for the agent's work.
+      - Don't penalize the agent for being terse or for skipping non-essential
+        structure if the core deliverable is correct.
+      - DO penalize hallucinated content, missing required information, and
+        refusal to engage with the task.
+      - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong
+        facts).
+
+    Capability tags for this task: memory_continuation, multifile_reasoning.
+    Task family: multi_tool.
+
+    Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses.
+  passing_threshold: 0.7
+  include_transcript: true
+  include_completion_feedback: true
+  max_artifact_chars: 6000
+  max_transcript_chars: 6000
diff --git a/tasks-public/tier5/t5-hallucination-resistant-evidence.yaml b/tasks-public/tier5/t5-hallucination-resistant-evidence.yaml
new file mode 100644
index 0000000..17ef2f2
--- /dev/null
+++ b/tasks-public/tier5/t5-hallucination-resistant-evidence.yaml
@@ -0,0 +1,52 @@
+id: t5-hallucination-resistant-evidence
+name: "Tier 5: Hallucination Resistant Evidence"
+tier: tier5
+family: adversarial
+surface: coding
+pool: public_dev
+subsets: [hard]
+capabilities: [research_synthesis, tool_composition]
+timeout_seconds: 600
+setup:
+  asset_packs:
+    - t5_hallucination_resistant_evidence
+
+user:
+  max_turns: 2
+  turns:
+    - message: "Using only the local docs in `docs/`, write `answer.txt` with the exact support window for build 3.2 in the form `Support window: ...`, and write `evidence.md` citing the file and quoted line you used. Do not guess."
+
+completion:
+  files:
+    - path: evidence.md
+      exists: true
+      content_contains:
+        - "maintenance_notes.md"
+        - "18 months"
+  execution_checks:
+    - name: "answer verification"
+      command: "python3 verify_answer.py"
+
+trajectory:
+  required_families: ["read", "edit"]
+  required_pre_edit_families: ["read"]
+  min_distinct_families: 2
+  min_pre_edit_exploration_calls: 2
+  require_read_before_mutation: true
+  require_self_verification: true
+
+behavior:
+  require_plan: true
+  require_progress_updates: true
+
+judge:
+  rubric: |
+    Reward answers that are tightly grounded in the local docs and that make the evidence artifact easy to audit.
+    The score should be high only if `answer.txt` is precise, `evidence.md` cites the right file and quote, and the evidence is sufficient for a reviewer to verify the claim quickly.
+    Penalize vague sourcing, quotes that do not actually support the answer, or any sign of guessing beyond the provided docs.
+  artifact_paths:
+    - answer.txt
+    - evidence.md
+  include_transcript: true
+  include_completion_feedback: true
+  passing_threshold: 0.8