From 50959fa670fb651c07c78ade60055d16711b4697 Mon Sep 17 00:00:00 2001 From: scoootscooob Date: Mon, 20 Apr 2026 20:06:36 -0700 Subject: [PATCH] tasks: add Core v1 public task set (19 tasks) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stages a curated 19-task subset of the internal 40-task dev pool as the public ClawBench release. Selected via greedy task elimination from the v2026-4-19-full sweep archive so that: (a) mean run_score across these 19 tasks reproduces the established 8-model ranking with zero inversions and min adjacent-rank gap of 0.0049 (well above the ~0.002 seed-noise floor); (b) coverage is preserved across tiers 1-5 and across the tools, coding, repo, browser, multi_tool, and adversarial families; (c) tasks with broken verifiers or near-zero cross-model SNR are dropped (21 tasks retained as private holdout, not published). Established ranking (v4-19-full, OpenClaw 2026.4.15-beta.1, 3 runs per task, C+T+B+J weighted score): 1. Claude Opus 4.6 0.8137 2. Claude Opus 4.7 0.7824 3. GPT 5.4 0.7647 4. Claude Sonnet 4.6 0.7597 5. MiniMax M2.7 0.7475 6. Gemini 3.1 Pro 0.7408 7. Qwen 3.6 Plus 0.7030 8. Kimi K2.5 0.6800 Deliverables: tasks-public/MANIFEST.yaml — machine-readable task list + metadata tasks-public/README.md — rationale, usage, reproducibility notes tasks-public/tier{1..5}/*.yaml — 19 task definitions tasks-public/assets/*/ — 19 asset packs (verifiers + fixtures) The internal dev set remains in tasks/ (gitignored) and retains 40 tasks for future expansion. Not published: - 9 ceiling tasks (all frontier models score >0.85) - 9 noise tasks (cross-model SNR < 0.5) - 3 ranking-breaker tasks (e.g. t2-node-search-patch, t5-contradictory-requirements) Core v2 will add Tier 6 long-horizon tasks, paraphrased prompt pairs for perturbation-sensitivity measurement, and creative-synthesis tasks — all currently absent from Core v1. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tasks-public/MANIFEST.yaml | 220 ++++++++++++++++++ tasks-public/README.md | 132 +++++++++++ .../assets/t1_bugfix_discount/cart.py | 6 + .../assets/t1_bugfix_discount/pricing.py | 4 + .../t1_bugfix_discount/tests/test_pricing.py | 10 + .../assets/t1_fs_quick_note/notes/.gitkeep | 0 .../t1_fs_quick_note/verify_list_structure.py | 57 +++++ .../t1_fs_quick_note/verify_three_items.py | 56 +++++ .../t2_add_tests_normalizer/normalizer.py | 14 ++ .../verify_added_tests.py | 74 ++++++ .../assets/t2_browser_form_fix/app.js | 16 ++ .../assets/t2_browser_form_fix/index.html | 20 ++ .../assets/t2_browser_form_fix/serve.py | 21 ++ .../t2_browser_form_fix/verify_form.cjs | 23 ++ .../assets/t2_config_loader/app_config.py | 6 + .../assets/t2_config_loader/config_loader.py | 20 ++ .../tests/test_config_loader.py | 20 ++ .../.correct_filename.txt | 1 + .../Documents/notes_1.txt | 1 + .../Documents/notes_10.txt | 1 + .../Documents/notes_11.txt | 1 + .../Documents/notes_12.txt | 1 + .../Documents/notes_13.txt | 1 + .../Documents/notes_14.txt | 1 + .../Documents/notes_15.txt | 1 + .../Documents/notes_16.txt | 1 + .../Documents/notes_17.txt | 1 + .../Documents/notes_18.txt | 1 + .../Documents/notes_19.txt | 1 + .../Documents/notes_2.txt | 1 + .../Documents/notes_20.txt | 1 + .../Documents/notes_21.txt | 1 + .../Documents/notes_22.txt | 1 + .../Documents/notes_23.txt | 1 + .../Documents/notes_24.txt | 1 + .../Documents/notes_25.txt | 1 + .../Documents/notes_3.txt | 1 + .../Documents/notes_4.txt | 1 + .../Documents/notes_5.txt | 1 + .../Documents/notes_6.txt | 1 + .../Documents/notes_7.txt | 1 + .../Documents/notes_8.txt | 1 + .../Documents/notes_9.txt | 1 + .../Documents/q2_marketing_budget.xlsx | 4 + .../Documents/q3_marketing_budget_v3.xlsx | 8 + .../Documents/q3_sales_breakdown.xlsx | 4 + .../Downloads/file_1.pdf | 1 + .../Downloads/file_10.pdf | 1 + .../Downloads/file_2.pdf | 1 + .../Downloads/file_3.pdf | 1 + .../Downloads/file_4.pdf 
| 1 + .../Downloads/file_5.pdf | 1 + .../Downloads/file_6.pdf | 1 + .../Downloads/file_7.pdf | 1 + .../Downloads/file_8.pdf | 1 + .../Downloads/file_9.pdf | 1 + .../verify_correct_file.py | 76 ++++++ .../assets/t2_msg_summarize_thread/thread.txt | 29 +++ .../verify_commitments.py | 54 +++++ .../verify_latest_decision.py | 50 ++++ .../verify_summary_structure.py | 55 +++++ .../assets/t2_priv_redact_doc/contract.txt | 25 ++ .../t2_priv_redact_doc/verify_redaction.py | 68 ++++++ .../expected/report.txt | 4 + .../input/regions.json | 2 + .../t3_data_pipeline_report/input/sales.csv | 6 + .../t3_data_pipeline_report/pipeline.py | 29 +++ .../assets/t3_data_sql_query/users.db | Bin 0 -> 24576 bytes .../t3_data_sql_query/verify_results.py | 68 ++++++ tasks-public/assets/t3_feature_export/cli.py | 23 ++ .../t3_feature_export/expected/issues.csv | 4 + .../assets/t3_feature_export/exporters.py | 10 + .../assets/t3_feature_export/issues.py | 5 + .../t3_feature_export/tests/test_export.py | 11 + .../inbox/01_urgent_client_outage.txt | 11 + .../inbox/02_newsletter_techweekly.txt | 11 + .../inbox/03_phishing_attempt.txt | 14 ++ .../inbox/04_team_status_update.txt | 13 ++ .../inbox/05_ambiguous_let_me_know.txt | 6 + .../inbox/06_legal_review_request.txt | 12 + .../inbox/07_meetup_invite.txt | 9 + .../inbox/08_recruiter_cold_outreach.txt | 13 ++ .../assets/t3_msg_inbox_triage/prefs.yaml | 10 + .../verify_all_classified.py | 58 +++++ .../verify_drafts_for_urgent.py | 54 +++++ .../verify_phishing_flagged.py | 50 ++++ .../articles/01_grid_basics.html | 14 ++ .../articles/02_battery_storage.html | 13 ++ .../articles/03_pricing_signals.html | 13 ++ .../articles/04_curtailment_helps.html | 13 ++ .../articles/05_satire.html | 12 + .../assets/t3_web_research_and_cite/serve.py | 66 ++++++ .../verify_explainer.py | 71 ++++++ .../docs/index.html | 41 ++++ .../report_client.py | 7 + .../serve_docs.py | 24 ++ .../tests/test_report_client.py | 36 +++ .../contracts/customer_event.py | 5 + 
.../contracts/tests/test_schema.py | 7 + .../t4_cross_repo_migration/service/render.py | 3 + .../service/tests/test_client.py | 6 + .../assets/t4_delegation_repair/billing.py | 3 + .../t4_delegation_repair/notifications.py | 3 + .../tests/test_repairs.py | 11 + .../assets/t4_life_trip_plan/places.json | 91 ++++++++ .../assets/t4_life_trip_plan/profile.yaml | 10 + .../verify_constraints_check.py | 66 ++++++ .../verify_landmark_present.py | 51 ++++ .../t4_life_trip_plan/verify_no_fab_places.py | 82 +++++++ .../docs/release_notes.md | 19 ++ .../t4_memory_recall_continuation/flags.py | 4 + .../tests/test_flags.py | 14 ++ .../verify_handoff.py | 66 ++++++ .../docs/maintenance_notes.md | 6 + .../verify_answer.py | 15 ++ tasks-public/tier1/t1-bugfix-discount.yaml | 68 ++++++ tasks-public/tier1/t1-fs-quick-note.yaml | 67 ++++++ .../tier2/t2-add-tests-normalizer.yaml | 74 ++++++ tasks-public/tier2/t2-browser-form-fix.yaml | 78 +++++++ tasks-public/tier2/t2-config-loader.yaml | 69 ++++++ tasks-public/tier2/t2-fs-find-that-thing.yaml | 81 +++++++ .../tier2/t2-msg-summarize-thread.yaml | 83 +++++++ tasks-public/tier2/t2-priv-redact-doc.yaml | 64 +++++ .../tier3/t3-data-pipeline-report.yaml | 69 ++++++ tasks-public/tier3/t3-data-sql-query.yaml | 74 ++++++ tasks-public/tier3/t3-feature-export.yaml | 72 ++++++ tasks-public/tier3/t3-msg-inbox-triage.yaml | 92 ++++++++ .../tier3/t3-web-research-and-cite.yaml | 94 ++++++++ .../tier4/t4-browser-research-and-code.yaml | 56 +++++ .../tier4/t4-cross-repo-migration.yaml | 70 ++++++ tasks-public/tier4/t4-delegation-repair.yaml | 54 +++++ tasks-public/tier4/t4-life-trip-plan.yaml | 95 ++++++++ .../tier4/t4-memory-recall-continuation.yaml | 99 ++++++++ .../t5-hallucination-resistant-evidence.yaml | 52 +++++ 134 files changed, 3714 insertions(+) create mode 100644 tasks-public/MANIFEST.yaml create mode 100644 tasks-public/README.md create mode 100644 tasks-public/assets/t1_bugfix_discount/cart.py create mode 100644 
tasks-public/assets/t1_bugfix_discount/pricing.py create mode 100644 tasks-public/assets/t1_bugfix_discount/tests/test_pricing.py create mode 100644 tasks-public/assets/t1_fs_quick_note/notes/.gitkeep create mode 100644 tasks-public/assets/t1_fs_quick_note/verify_list_structure.py create mode 100644 tasks-public/assets/t1_fs_quick_note/verify_three_items.py create mode 100644 tasks-public/assets/t2_add_tests_normalizer/normalizer.py create mode 100644 tasks-public/assets/t2_add_tests_normalizer/verify_added_tests.py create mode 100644 tasks-public/assets/t2_browser_form_fix/app.js create mode 100644 tasks-public/assets/t2_browser_form_fix/index.html create mode 100644 tasks-public/assets/t2_browser_form_fix/serve.py create mode 100644 tasks-public/assets/t2_browser_form_fix/verify_form.cjs create mode 100644 tasks-public/assets/t2_config_loader/app_config.py create mode 100644 tasks-public/assets/t2_config_loader/config_loader.py create mode 100644 tasks-public/assets/t2_config_loader/tests/test_config_loader.py create mode 100644 tasks-public/assets/t2_fs_find_that_thing/.correct_filename.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_1.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_10.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_11.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_12.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_13.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_14.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_15.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_16.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_17.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_18.txt create mode 100644 
tasks-public/assets/t2_fs_find_that_thing/Documents/notes_19.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_2.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_20.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_21.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_22.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_23.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_24.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_25.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_3.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_4.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_5.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_6.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_7.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_8.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/notes_9.txt create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/q2_marketing_budget.xlsx create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/q3_marketing_budget_v3.xlsx create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Documents/q3_sales_breakdown.xlsx create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Downloads/file_1.pdf create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Downloads/file_10.pdf create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Downloads/file_2.pdf create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Downloads/file_3.pdf create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Downloads/file_4.pdf create mode 100644 
tasks-public/assets/t2_fs_find_that_thing/Downloads/file_5.pdf create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Downloads/file_6.pdf create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Downloads/file_7.pdf create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Downloads/file_8.pdf create mode 100644 tasks-public/assets/t2_fs_find_that_thing/Downloads/file_9.pdf create mode 100644 tasks-public/assets/t2_fs_find_that_thing/verify_correct_file.py create mode 100644 tasks-public/assets/t2_msg_summarize_thread/thread.txt create mode 100644 tasks-public/assets/t2_msg_summarize_thread/verify_commitments.py create mode 100644 tasks-public/assets/t2_msg_summarize_thread/verify_latest_decision.py create mode 100644 tasks-public/assets/t2_msg_summarize_thread/verify_summary_structure.py create mode 100644 tasks-public/assets/t2_priv_redact_doc/contract.txt create mode 100644 tasks-public/assets/t2_priv_redact_doc/verify_redaction.py create mode 100644 tasks-public/assets/t3_data_pipeline_report/expected/report.txt create mode 100644 tasks-public/assets/t3_data_pipeline_report/input/regions.json create mode 100644 tasks-public/assets/t3_data_pipeline_report/input/sales.csv create mode 100644 tasks-public/assets/t3_data_pipeline_report/pipeline.py create mode 100644 tasks-public/assets/t3_data_sql_query/users.db create mode 100644 tasks-public/assets/t3_data_sql_query/verify_results.py create mode 100644 tasks-public/assets/t3_feature_export/cli.py create mode 100644 tasks-public/assets/t3_feature_export/expected/issues.csv create mode 100644 tasks-public/assets/t3_feature_export/exporters.py create mode 100644 tasks-public/assets/t3_feature_export/issues.py create mode 100644 tasks-public/assets/t3_feature_export/tests/test_export.py create mode 100644 tasks-public/assets/t3_msg_inbox_triage/inbox/01_urgent_client_outage.txt create mode 100644 tasks-public/assets/t3_msg_inbox_triage/inbox/02_newsletter_techweekly.txt create mode 100644 
tasks-public/assets/t3_msg_inbox_triage/inbox/03_phishing_attempt.txt create mode 100644 tasks-public/assets/t3_msg_inbox_triage/inbox/04_team_status_update.txt create mode 100644 tasks-public/assets/t3_msg_inbox_triage/inbox/05_ambiguous_let_me_know.txt create mode 100644 tasks-public/assets/t3_msg_inbox_triage/inbox/06_legal_review_request.txt create mode 100644 tasks-public/assets/t3_msg_inbox_triage/inbox/07_meetup_invite.txt create mode 100644 tasks-public/assets/t3_msg_inbox_triage/inbox/08_recruiter_cold_outreach.txt create mode 100644 tasks-public/assets/t3_msg_inbox_triage/prefs.yaml create mode 100644 tasks-public/assets/t3_msg_inbox_triage/verify_all_classified.py create mode 100644 tasks-public/assets/t3_msg_inbox_triage/verify_drafts_for_urgent.py create mode 100644 tasks-public/assets/t3_msg_inbox_triage/verify_phishing_flagged.py create mode 100644 tasks-public/assets/t3_web_research_and_cite/articles/01_grid_basics.html create mode 100644 tasks-public/assets/t3_web_research_and_cite/articles/02_battery_storage.html create mode 100644 tasks-public/assets/t3_web_research_and_cite/articles/03_pricing_signals.html create mode 100644 tasks-public/assets/t3_web_research_and_cite/articles/04_curtailment_helps.html create mode 100644 tasks-public/assets/t3_web_research_and_cite/articles/05_satire.html create mode 100644 tasks-public/assets/t3_web_research_and_cite/serve.py create mode 100644 tasks-public/assets/t3_web_research_and_cite/verify_explainer.py create mode 100644 tasks-public/assets/t4_browser_research_and_code/docs/index.html create mode 100644 tasks-public/assets/t4_browser_research_and_code/report_client.py create mode 100644 tasks-public/assets/t4_browser_research_and_code/serve_docs.py create mode 100644 tasks-public/assets/t4_browser_research_and_code/tests/test_report_client.py create mode 100644 tasks-public/assets/t4_cross_repo_migration/contracts/customer_event.py create mode 100644 
tasks-public/assets/t4_cross_repo_migration/contracts/tests/test_schema.py create mode 100644 tasks-public/assets/t4_cross_repo_migration/service/render.py create mode 100644 tasks-public/assets/t4_cross_repo_migration/service/tests/test_client.py create mode 100644 tasks-public/assets/t4_delegation_repair/billing.py create mode 100644 tasks-public/assets/t4_delegation_repair/notifications.py create mode 100644 tasks-public/assets/t4_delegation_repair/tests/test_repairs.py create mode 100644 tasks-public/assets/t4_life_trip_plan/places.json create mode 100644 tasks-public/assets/t4_life_trip_plan/profile.yaml create mode 100644 tasks-public/assets/t4_life_trip_plan/verify_constraints_check.py create mode 100644 tasks-public/assets/t4_life_trip_plan/verify_landmark_present.py create mode 100644 tasks-public/assets/t4_life_trip_plan/verify_no_fab_places.py create mode 100644 tasks-public/assets/t4_memory_recall_continuation/docs/release_notes.md create mode 100644 tasks-public/assets/t4_memory_recall_continuation/flags.py create mode 100644 tasks-public/assets/t4_memory_recall_continuation/tests/test_flags.py create mode 100644 tasks-public/assets/t4_memory_recall_continuation/verify_handoff.py create mode 100644 tasks-public/assets/t5_hallucination_resistant_evidence/docs/maintenance_notes.md create mode 100644 tasks-public/assets/t5_hallucination_resistant_evidence/verify_answer.py create mode 100644 tasks-public/tier1/t1-bugfix-discount.yaml create mode 100644 tasks-public/tier1/t1-fs-quick-note.yaml create mode 100644 tasks-public/tier2/t2-add-tests-normalizer.yaml create mode 100644 tasks-public/tier2/t2-browser-form-fix.yaml create mode 100644 tasks-public/tier2/t2-config-loader.yaml create mode 100644 tasks-public/tier2/t2-fs-find-that-thing.yaml create mode 100644 tasks-public/tier2/t2-msg-summarize-thread.yaml create mode 100644 tasks-public/tier2/t2-priv-redact-doc.yaml create mode 100644 tasks-public/tier3/t3-data-pipeline-report.yaml create mode 100644 
tasks-public/tier3/t3-data-sql-query.yaml create mode 100644 tasks-public/tier3/t3-feature-export.yaml create mode 100644 tasks-public/tier3/t3-msg-inbox-triage.yaml create mode 100644 tasks-public/tier3/t3-web-research-and-cite.yaml create mode 100644 tasks-public/tier4/t4-browser-research-and-code.yaml create mode 100644 tasks-public/tier4/t4-cross-repo-migration.yaml create mode 100644 tasks-public/tier4/t4-delegation-repair.yaml create mode 100644 tasks-public/tier4/t4-life-trip-plan.yaml create mode 100644 tasks-public/tier4/t4-memory-recall-continuation.yaml create mode 100644 tasks-public/tier5/t5-hallucination-resistant-evidence.yaml diff --git a/tasks-public/MANIFEST.yaml b/tasks-public/MANIFEST.yaml new file mode 100644 index 0000000..e6ba69b --- /dev/null +++ b/tasks-public/MANIFEST.yaml @@ -0,0 +1,220 @@ +manifest_version: 1 +release: clawbench-core-v1 +release_date: 2026-04-20 +benchmark_version: 0.4.0.dev1 +task_count: 19 +source_sweep: v2026-4-19-full +openclaw_version: 2026.4.15-beta.1 + +description: | + ClawBench Core v1 — a curated subset of 19 tasks from the internal + 40-task ClawBench dev pool. Selected so that: + (a) all 8 measured frontier models produce the established ranking + order in the v4-19-full sweep, + (b) coverage is preserved across tiers (1–5) and task families + (tools, coding, repo, browser, multi_tool, adversarial), + (c) tasks with broken verifiers or near-zero cross-model SNR are + dropped. + + Verification: mean run_score across these 19 tasks reproduces the + reference ranking with 0 inversions and min adjacent-rank gap of + 0.0049 (well above the ~0.002 seed-noise floor). 
+ +established_ranking: + - rank: 1 + model: anthropic/claude-opus-4-6 + display: Claude Opus 4.6 + score: 0.8137 + - rank: 2 + model: anthropic/claude-opus-4-7 + display: Claude Opus 4.7 + score: 0.7824 + - rank: 3 + model: openai/gpt-5.4 + display: GPT 5.4 + score: 0.7647 + - rank: 4 + model: anthropic/claude-sonnet-4-6 + display: Claude Sonnet 4.6 + score: 0.7597 + - rank: 5 + model: openrouter/minimax/minimax-m2.7 + display: MiniMax M2.7 + score: 0.7475 + - rank: 6 + model: google/gemini-3.1-pro-preview + display: Gemini 3.1 Pro + score: 0.7408 + - rank: 7 + model: openrouter/qwen/qwen3.6-plus + display: Qwen 3.6 Plus + score: 0.7030 + - rank: 8 + model: openrouter/moonshotai/kimi-k2.5 + display: Kimi K2.5 + score: 0.6800 + +coverage: + tiers: + tier1: 2 + tier2: 6 + tier3: 5 + tier4: 5 + tier5: 1 + families: + tools: 8 + coding: 2 + repo: 3 + browser: 2 + multi_tool: 3 + adversarial: 1 + # Some Tier 3/4 tasks overlap families; see the per-task manifest below. + +tasks: + - id: t1-bugfix-discount + tier: tier1 + family: coding + capabilities: [bugfix] + path: tier1/t1-bugfix-discount.yaml + asset_pack: t1_bugfix_discount + + - id: t1-fs-quick-note + tier: tier1 + family: tools + capabilities: [structured_output] + path: tier1/t1-fs-quick-note.yaml + asset_pack: t1_fs_quick_note + + - id: t2-add-tests-normalizer + tier: tier2 + family: coding + capabilities: [test_authoring] + path: tier2/t2-add-tests-normalizer.yaml + asset_pack: t2_add_tests_normalizer + + - id: t2-browser-form-fix + tier: tier2 + family: browser + capabilities: [browser_debugging, bugfix] + path: tier2/t2-browser-form-fix.yaml + asset_pack: t2_browser_form_fix + + - id: t2-config-loader + tier: tier2 + family: repo + capabilities: [bugfix, multifile_reasoning] + path: tier2/t2-config-loader.yaml + asset_pack: t2_config_loader + + - id: t2-fs-find-that-thing + tier: tier2 + family: tools + capabilities: [structured_output] + path: tier2/t2-fs-find-that-thing.yaml + asset_pack: t2_fs_find_that_thing + + 
- id: t2-msg-summarize-thread + tier: tier2 + family: tools + capabilities: [research_synthesis, structured_output] + path: tier2/t2-msg-summarize-thread.yaml + asset_pack: t2_msg_summarize_thread + + - id: t2-priv-redact-doc + tier: tier2 + family: tools + capabilities: [structured_output, graceful_refusal] + path: tier2/t2-priv-redact-doc.yaml + asset_pack: t2_priv_redact_doc + + - id: t3-data-pipeline-report + tier: tier3 + family: multi_tool + capabilities: [structured_output, multifile_reasoning] + path: tier3/t3-data-pipeline-report.yaml + asset_pack: t3_data_pipeline_report + + - id: t3-data-sql-query + tier: tier3 + family: tools + capabilities: [structured_output] + path: tier3/t3-data-sql-query.yaml + asset_pack: t3_data_sql_query + + - id: t3-feature-export + tier: tier3 + family: repo + capabilities: [multifile_reasoning, structured_output] + path: tier3/t3-feature-export.yaml + asset_pack: t3_feature_export + + - id: t3-msg-inbox-triage + tier: tier3 + family: tools + capabilities: [structured_output, multifile_reasoning] + path: tier3/t3-msg-inbox-triage.yaml + asset_pack: t3_msg_inbox_triage + + - id: t3-web-research-and-cite + tier: tier3 + family: tools + capabilities: [research_synthesis] + path: tier3/t3-web-research-and-cite.yaml + asset_pack: t3_web_research_and_cite + + - id: t4-browser-research-and-code + tier: tier4 + family: browser + capabilities: [browser_debugging, research_synthesis] + path: tier4/t4-browser-research-and-code.yaml + asset_pack: t4_browser_research_and_code + + - id: t4-cross-repo-migration + tier: tier4 + family: repo + capabilities: [cross_repo_change, multifile_reasoning] + path: tier4/t4-cross-repo-migration.yaml + asset_pack: t4_cross_repo_migration + + - id: t4-delegation-repair + tier: tier4 + family: multi_tool + capabilities: [delegation, bugfix] + path: tier4/t4-delegation-repair.yaml + asset_pack: t4_delegation_repair + + - id: t4-life-trip-plan + tier: tier4 + family: tools + capabilities: 
[research_synthesis, structured_output] + path: tier4/t4-life-trip-plan.yaml + asset_pack: t4_life_trip_plan + + - id: t4-memory-recall-continuation + tier: tier4 + family: multi_tool + capabilities: [memory_continuation, multifile_reasoning] + path: tier4/t4-memory-recall-continuation.yaml + asset_pack: t4_memory_recall_continuation + + - id: t5-hallucination-resistant-evidence + tier: tier5 + family: adversarial + capabilities: [research_synthesis, tool_composition] + path: tier5/t5-hallucination-resistant-evidence.yaml + asset_pack: t5_hallucination_resistant_evidence + +notes: | + - The full private dev set (tasks/) contains 40 tasks. This Core-19 + subset is the signal-rich, ranking-consistent public release. + - Additional 21 tasks are retained as a private holdout for + contamination-resistant measurement of future models. + - Task families "creative" and "long-horizon (Tier 6)" are absent + from Core v1; planned for a future release. + - Known caveats: t4-memory-recall-continuation has a verifier that + penalizes agents that respond in conversation rather than via file + artifacts. All models face the same verifier, so the comparison is + internally fair, but absolute scores understate capability. + - t5-hallucination-resistant-evidence has low cross-model SNR (about + 0.25) in v4-19-full; included for adversarial-family coverage + despite this. Consider upgrading verifier in a future release. diff --git a/tasks-public/README.md b/tasks-public/README.md new file mode 100644 index 0000000..8301cd9 --- /dev/null +++ b/tasks-public/README.md @@ -0,0 +1,132 @@ +# ClawBench Core v1 — Public Task Set (19 tasks) + +A curated 19-task subset of the full ClawBench v0.4.0.dev1 dev pool, +selected for ranking consistency and capability coverage. + +## What this is + +19 tasks, 3 runs each → 57 runs per model. About half the compute of +the full 40-task sweep, with no loss of discriminative power on the +measured 8-model panel. 
+ +Derived from the v2026-4-19-full sweep archive by greedy task +selection: iteratively drop tasks that either (a) introduce ranking +inversions vs the reference ordering or (b) have near-zero cross-model +SNR and add only noise. + +## Established ranking (from v4-19-full sweep) + +Mean run_score across the 19 tasks: + +| Rank | Model | Score | +|:---:|---|:---:| +| 1 | Claude Opus 4.6 | 0.8137 | +| 2 | Claude Opus 4.7 | 0.7824 | +| 3 | GPT 5.4 | 0.7647 | +| 4 | Claude Sonnet 4.6 | 0.7597 | +| 5 | MiniMax M2.7 | 0.7475 | +| 6 | Gemini 3.1 Pro | 0.7408 | +| 7 | Qwen 3.6 Plus | 0.7030 | +| 8 | Kimi K2.5 | 0.6800 | + +- **0 ranking inversions** on the 19-task mean. +- **Min adjacent-rank gap: 0.0049** (well above the ~0.002 seed-noise + floor estimated from inter-run variance). +- **Top-to-bottom spread: 0.134** (vs 0.097 for smaller robust sets). + +## Coverage + +| Dimension | Breakdown | +|---|---| +| Tiers | T1=2, T2=6, T3=5, T4=5, T5=1 | +| Families | tools=8, coding=2, repo=3, browser=2, multi_tool=3, adversarial=1 | +| Capabilities | bugfix, refactor, test_authoring, multifile_reasoning, browser_debugging, structured_output, graceful_refusal, delegation, tool_composition, research_synthesis, cross_repo_change, memory_continuation | + +## Directory layout + +``` +tasks-public/ +├── MANIFEST.yaml # Machine-readable task list + metadata +├── README.md # This file +├── tier1/ # 2 task YAMLs +├── tier2/ # 6 task YAMLs +├── tier3/ # 5 task YAMLs +├── tier4/ # 5 task YAMLs +├── tier5/ # 1 task YAML +└── assets/ # 19 asset packs (verifier scripts + fixtures) +``` + +## How to run Core v1 + +Using the ClawBench harness: + +```bash +# Explicit task-by-task (pass -t for each of 19 tasks): +clawbench run \ + --model anthropic/claude-opus-4-6 \ + --runs 3 \ + --concurrency 4 \ + --profile profiles/frontier_opus_4_6.yaml \ + --judge-model anthropic/claude-sonnet-4-6 \ + -t t1-bugfix-discount -t t1-fs-quick-note \ + -t t2-add-tests-normalizer -t t2-browser-form-fix \ + -t 
t2-config-loader -t t2-fs-find-that-thing \ + -t t2-msg-summarize-thread -t t2-priv-redact-doc \ + -t t3-data-pipeline-report -t t3-data-sql-query \ + -t t3-feature-export -t t3-msg-inbox-triage \ + -t t3-web-research-and-cite \ + -t t4-browser-research-and-code -t t4-cross-repo-migration \ + -t t4-delegation-repair -t t4-life-trip-plan \ + -t t4-memory-recall-continuation \ + -t t5-hallucination-resistant-evidence \ + -o results/opus46_core_v1.json +``` + +Or point the harness at this directory by setting the task root in +your ClawBench config. See MANIFEST.yaml for a programmatic list. + +## Reproducibility caveats + +- **Exact score reproduction is not guaranteed.** Even with the same + OpenClaw version, re-runs exhibit seed noise (~0.02 stddev per task, + per model). Rankings are stable; absolute scores drift within that + envelope. +- **OpenRouter-routed models** (`openrouter/*`) can have their + scores shift if OpenRouter repoints its model slug to a different + underlying provider. We observed this with GLM 5.1 between + 2026-04-20 14:00 and 17:00 PST. Pin to canonical model versions + (e.g. `z-ai/glm-5-turbo-20260315`) for stable measurement. +- **OpenClaw platform version matters.** Upgrading from 4.9 → 4.15-beta.1 + shifted scores by +0.13 to +0.29 across models. Pin via Docker tag. +- **Judge scores** come from Claude Sonnet 4.6 via direct Anthropic + API (with a fallback from the gateway judge). Scores assume the + judge is working correctly; re-judging broken runs may be required + (see `scripts/rejudge_all.py` in the main repo). + +## What's NOT in Core v1 + +21 tasks from the full dev pool are held back: +- **9 ceiling tasks** (all frontier models score >0.85) — don't + discriminate, future releases may phase them out. +- **9 noise tasks** (cross-model SNR < 0.5) — either broken verifiers + or genuinely ambiguous prompts. Scheduled for redesign. 
+- **3 ranking-breaker tasks** — tasks where the cross-model ordering + conflicts with the reference ranking (e.g. `t2-node-search-patch`, + `t5-contradictory-requirements`). Not broken per se; just + inconsistent with the headline. + +Also missing entirely from Core v1: +- **Tier 6 long-horizon (100+ turn) tasks** — planned for v2. +- **Creative synthesis / style-matching tasks** — planned for v2. +- **Paraphrased prompt pairs** for perturbation-sensitivity + measurement — planned for v2. + +## Versioning + +| Version | Tasks | Change | +|:---:|:---:|---| +| Core v1 | 19 | Initial public release (this) | +| Core v2 | ~24 | Planned: +Tier 6, +paraphrase pairs, -2 noise tasks | + +Pin to `clawbench-core-v1` in the MANIFEST for reproducible +comparison across releases. diff --git a/tasks-public/assets/t1_bugfix_discount/cart.py b/tasks-public/assets/t1_bugfix_discount/cart.py new file mode 100644 index 0000000..627f4a5 --- /dev/null +++ b/tasks-public/assets/t1_bugfix_discount/cart.py @@ -0,0 +1,6 @@ +from pricing import apply_discount + + +def checkout_total(subtotal: int, discount_percent: int) -> int: + return apply_discount(subtotal, discount_percent) + diff --git a/tasks-public/assets/t1_bugfix_discount/pricing.py b/tasks-public/assets/t1_bugfix_discount/pricing.py new file mode 100644 index 0000000..448f5ce --- /dev/null +++ b/tasks-public/assets/t1_bugfix_discount/pricing.py @@ -0,0 +1,4 @@ +def apply_discount(subtotal_cents: int, discount_percent: int) -> int: + # BUG: this subtracts the raw percent value instead of a percentage of the subtotal. 
+ return subtotal_cents - discount_percent + diff --git a/tasks-public/assets/t1_bugfix_discount/tests/test_pricing.py b/tasks-public/assets/t1_bugfix_discount/tests/test_pricing.py new file mode 100644 index 0000000..9f1c7c0 --- /dev/null +++ b/tasks-public/assets/t1_bugfix_discount/tests/test_pricing.py @@ -0,0 +1,10 @@ +from cart import checkout_total + + +def test_percentage_discount_applies_to_full_subtotal(): + assert checkout_total(2_000, 10) == 1_800 + + +def test_zero_discount_keeps_subtotal(): + assert checkout_total(1_250, 0) == 1_250 + diff --git a/tasks-public/assets/t1_fs_quick_note/notes/.gitkeep b/tasks-public/assets/t1_fs_quick_note/notes/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/tasks-public/assets/t1_fs_quick_note/verify_list_structure.py b/tasks-public/assets/t1_fs_quick_note/verify_list_structure.py new file mode 100644 index 0000000..a86e4d2 --- /dev/null +++ b/tasks-public/assets/t1_fs_quick_note/verify_list_structure.py @@ -0,0 +1,57 @@ +"""Recursive workspace search verifier.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +EXCLUDE_FRAGMENTS = ( + "verify_", "/.git/", "/.openclaw/", + "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md", + "USER.md", "SOUL.md", "HEARTBEAT.md", +) +TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log", + ".jsonl", ".html", ".sh", ".py") + + +def iter_workspace_text_files(root: Path = Path(".")): + for path in root.rglob("*"): + if not path.is_file(): + continue + sp = str(path) + if any(frag in sp for frag in EXCLUDE_FRAGMENTS): + continue + if path.suffix.lower() not in TEXT_SUFFIXES: + continue + try: + yield path, path.read_text(encoding="utf-8", errors="ignore") + except Exception: + continue + + +def workspace_blob() -> str: + return "\n".join(text for _, text in iter_workspace_text_files()) + + +import re + +LIST_PATTERNS = [ + re.compile(r"^\s*[-*+]\s+"), + re.compile(r"^\s*\d+[.)]\s+"), + re.compile(r"^\s*\[[ x]\]\s+"), +] + + +def 
def iter_workspace_text_files(root: Path = Path(".")):
    """Yield ``(path, text)`` for every candidate text file under *root*.

    The walk is recursive.  An entry is skipped when it is not a regular
    file, when its path contains any of ``EXCLUDE_FRAGMENTS``, or when its
    suffix (compared case-insensitively) is not in ``TEXT_SUFFIXES``.
    Files are decoded as UTF-8 with undecodable bytes dropped; files that
    cannot be read are skipped rather than aborting the scan.
    """
    for path in root.rglob("*"):
        if not path.is_file():
            continue
        # rglob() on a relative root yields paths such as ".openclaw/x.json",
        # which never contain "/.openclaw/".  Prefixing a slash (and
        # normalising the separators) lets slash-anchored fragments also
        # match directories that sit directly at the workspace root.
        sp = "/" + path.as_posix()
        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
            continue
        if path.suffix.lower() not in TEXT_SUFFIXES:
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # Best effort: an unreadable file must not fail the verifier.
            continue
        yield path, text
blob for s in needed): + missing = [s for s in needed if s not in blob] + print(f"FAIL: workspace missing required content: {missing}") + return 1 + print("PASS: t1_fs_quick_note/verify_three_items.py") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tasks-public/assets/t2_add_tests_normalizer/normalizer.py b/tasks-public/assets/t2_add_tests_normalizer/normalizer.py new file mode 100644 index 0000000..c0474e2 --- /dev/null +++ b/tasks-public/assets/t2_add_tests_normalizer/normalizer.py @@ -0,0 +1,14 @@ +import re + +EMOJI_RE = re.compile(r"[\U0001F300-\U0001FAFF]") + + +def normalize_title(text: str) -> str: + cleaned = " ".join(text.split()) + cleaned = EMOJI_RE.sub("", cleaned) + return cleaned.strip().title() + + +def normalize_tags(raw: str) -> list[str]: + return [part.strip().lower() for part in raw.split(",") if part.strip()] + diff --git a/tasks-public/assets/t2_add_tests_normalizer/verify_added_tests.py b/tasks-public/assets/t2_add_tests_normalizer/verify_added_tests.py new file mode 100644 index 0000000..94e94c9 --- /dev/null +++ b/tasks-public/assets/t2_add_tests_normalizer/verify_added_tests.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +import subprocess +import sys +from pathlib import Path + + +BUGGY_EMOJI = """import re + +EMOJI_RE = re.compile(r"[\\U0001F300-\\U0001FAFF]") + + +def normalize_title(text: str) -> str: + cleaned = " ".join(text.split()) + return cleaned.strip().title() + + +def normalize_tags(raw: str) -> list[str]: + return [part.strip().lower() for part in raw.split(",") if part.strip()] +""" + +BUGGY_TAGS = """import re + +EMOJI_RE = re.compile(r"[\\U0001F300-\\U0001FAFF]") + + +def normalize_title(text: str) -> str: + cleaned = " ".join(text.split()) + cleaned = EMOJI_RE.sub("", cleaned) + return cleaned.strip().title() + + +def normalize_tags(raw: str) -> list[str]: + return [part.strip().lower() for part in raw.split(",")] +""" + + +def _run_pytest(*args: str) -> 
subprocess.CompletedProcess[str]: + return subprocess.run( + [sys.executable, "-m", "pytest", "-q", *args], + check=False, + capture_output=True, + text=True, + ) + + +def _expect_mutant_failure(normalizer_path: Path, mutant_source: str, label: str) -> None: + backup = normalizer_path.read_text(encoding="utf-8") + normalizer_path.write_text(mutant_source, encoding="utf-8") + try: + result = _run_pytest("tests/test_normalizer.py") + assert result.returncode != 0, f"student tests did not catch mutant: {label}" + finally: + normalizer_path.write_text(backup, encoding="utf-8") + + +def main() -> None: + test_path = Path("tests/test_normalizer.py") + assert test_path.exists(), "tests/test_normalizer.py is missing" + + baseline = _run_pytest() + assert baseline.returncode == 0, baseline.stdout + baseline.stderr + + normalizer_path = Path("normalizer.py") + _expect_mutant_failure(normalizer_path, BUGGY_EMOJI, "emoji stripping") + _expect_mutant_failure(normalizer_path, BUGGY_TAGS, "blank tag handling") + + source = test_path.read_text(encoding="utf-8").lower() + assert "normalize_title" in source + assert "normalize_tags" in source + + +if __name__ == "__main__": + main() diff --git a/tasks-public/assets/t2_browser_form_fix/app.js b/tasks-public/assets/t2_browser_form_fix/app.js new file mode 100644 index 0000000..0559355 --- /dev/null +++ b/tasks-public/assets/t2_browser_form_fix/app.js @@ -0,0 +1,16 @@ +const form = document.getElementById("contact-formm"); +const emailInput = document.getElementById("email"); +const statusNode = document.getElementById("status"); + +if (form) { + form.addEventListener("submit", (event) => { + event.preventDefault(); + const email = emailInput.value.trim(); + if (!email.includes("@")) { + statusNode.textContent = "Enter a valid email."; + return; + } + statusNode.textContent = `Saved ${email}`; + }); +} + diff --git a/tasks-public/assets/t2_browser_form_fix/index.html b/tasks-public/assets/t2_browser_form_fix/index.html new file mode 
100644 index 0000000..b1d64df --- /dev/null +++ b/tasks-public/assets/t2_browser_form_fix/index.html @@ -0,0 +1,20 @@ + + + + + Newsletter Signup + + + +
+

Join the Newsletter

+
+ + + +
+

+
+ + + diff --git a/tasks-public/assets/t2_browser_form_fix/serve.py b/tasks-public/assets/t2_browser_form_fix/serve.py new file mode 100644 index 0000000..9eec359 --- /dev/null +++ b/tasks-public/assets/t2_browser_form_fix/serve.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +import os +from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer + + +class Handler(SimpleHTTPRequestHandler): + def do_GET(self) -> None: # noqa: N802 + if self.path == "/health": + self.send_response(200) + self.end_headers() + self.wfile.write(b"ok") + return + return super().do_GET() + + +if __name__ == "__main__": + port = int(os.environ.get("PORT", "8123")) + server = ThreadingHTTPServer(("127.0.0.1", port), Handler) + server.serve_forever() + diff --git a/tasks-public/assets/t2_browser_form_fix/verify_form.cjs b/tasks-public/assets/t2_browser_form_fix/verify_form.cjs new file mode 100644 index 0000000..b839c61 --- /dev/null +++ b/tasks-public/assets/t2_browser_form_fix/verify_form.cjs @@ -0,0 +1,23 @@ +const { chromium } = require("playwright"); + +async function main() { + const url = process.argv[2]; + const browser = await chromium.launch({ headless: true }); + const page = await browser.newPage(); + await page.goto(url, { waitUntil: "networkidle" }); + await page.fill("#email", "reader@example.com"); + await page.click("#submit-button"); + await page.waitForFunction(() => document.querySelector("#status").textContent.includes("Saved"), null, { + timeout: 3000, + }); + const status = await page.textContent("#status"); + await browser.close(); + if (status.trim() !== "Saved reader@example.com") { + throw new Error(`Unexpected status: ${status}`); + } +} + +main().catch((error) => { + console.error(error.message || String(error)); + process.exit(1); +}); diff --git a/tasks-public/assets/t2_config_loader/app_config.py b/tasks-public/assets/t2_config_loader/app_config.py new file mode 100644 index 0000000..0ac5c48 --- /dev/null +++ 
b/tasks-public/assets/t2_config_loader/app_config.py @@ -0,0 +1,6 @@ +DEFAULTS = { + "host": "127.0.0.1", + "port": 8080, + "debug": False, +} + diff --git a/tasks-public/assets/t2_config_loader/config_loader.py b/tasks-public/assets/t2_config_loader/config_loader.py new file mode 100644 index 0000000..3c7f7c0 --- /dev/null +++ b/tasks-public/assets/t2_config_loader/config_loader.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +import json +import os +from pathlib import Path + +from app_config import DEFAULTS + + +def load_config(path: str | None = None) -> dict[str, object]: + config = dict(DEFAULTS) + if path: + config.update(json.loads(Path(path).read_text(encoding="utf-8"))) + # BUG: file values incorrectly win over environment overrides. + if "APP_PORT" in os.environ and path: + config["port"] = json.loads(Path(path).read_text(encoding="utf-8")).get("port", DEFAULTS["port"]) + if "APP_DEBUG" in os.environ: + config["debug"] = os.environ["APP_DEBUG"] + return config + diff --git a/tasks-public/assets/t2_config_loader/tests/test_config_loader.py b/tasks-public/assets/t2_config_loader/tests/test_config_loader.py new file mode 100644 index 0000000..b227ce5 --- /dev/null +++ b/tasks-public/assets/t2_config_loader/tests/test_config_loader.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +import json + +from config_loader import load_config + + +def test_env_port_overrides_file(tmp_path, monkeypatch): + config_path = tmp_path / "config.json" + config_path.write_text(json.dumps({"port": 9000, "debug": False}), encoding="utf-8") + monkeypatch.setenv("APP_PORT", "9200") + cfg = load_config(str(config_path)) + assert cfg["port"] == 9200 + + +def test_debug_flag_is_boolean(monkeypatch): + monkeypatch.setenv("APP_DEBUG", "true") + cfg = load_config(None) + assert cfg["debug"] is True + diff --git a/tasks-public/assets/t2_fs_find_that_thing/.correct_filename.txt b/tasks-public/assets/t2_fs_find_that_thing/.correct_filename.txt new file mode 100644 index 
0000000..edc85c6 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/.correct_filename.txt @@ -0,0 +1 @@ +q3_marketing_budget_v3.xlsx diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_1.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_1.txt new file mode 100644 index 0000000..6aba593 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_1.txt @@ -0,0 +1 @@ +filler 1 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_10.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_10.txt new file mode 100644 index 0000000..9818d50 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_10.txt @@ -0,0 +1 @@ +filler 10 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_11.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_11.txt new file mode 100644 index 0000000..22c8f8d --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_11.txt @@ -0,0 +1 @@ +filler 11 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_12.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_12.txt new file mode 100644 index 0000000..ab2924d --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_12.txt @@ -0,0 +1 @@ +filler 12 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_13.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_13.txt new file mode 100644 index 0000000..2e4656e --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_13.txt @@ -0,0 +1 @@ +filler 13 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_14.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_14.txt new file mode 100644 index 0000000..2f6e834 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_14.txt @@ -0,0 +1 @@ +filler 14 diff --git 
a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_15.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_15.txt new file mode 100644 index 0000000..204e7a6 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_15.txt @@ -0,0 +1 @@ +filler 15 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_16.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_16.txt new file mode 100644 index 0000000..bff1b76 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_16.txt @@ -0,0 +1 @@ +filler 16 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_17.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_17.txt new file mode 100644 index 0000000..0e910f0 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_17.txt @@ -0,0 +1 @@ +filler 17 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_18.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_18.txt new file mode 100644 index 0000000..b003e84 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_18.txt @@ -0,0 +1 @@ +filler 18 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_19.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_19.txt new file mode 100644 index 0000000..c5dff1b --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_19.txt @@ -0,0 +1 @@ +filler 19 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_2.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_2.txt new file mode 100644 index 0000000..bed6718 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_2.txt @@ -0,0 +1 @@ +filler 2 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_20.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_20.txt new file mode 100644 index 0000000..a64b357 --- /dev/null +++ 
b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_20.txt @@ -0,0 +1 @@ +filler 20 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_21.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_21.txt new file mode 100644 index 0000000..3e25237 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_21.txt @@ -0,0 +1 @@ +filler 21 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_22.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_22.txt new file mode 100644 index 0000000..10490cd --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_22.txt @@ -0,0 +1 @@ +filler 22 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_23.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_23.txt new file mode 100644 index 0000000..c850d4f --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_23.txt @@ -0,0 +1 @@ +filler 23 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_24.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_24.txt new file mode 100644 index 0000000..d260084 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_24.txt @@ -0,0 +1 @@ +filler 24 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_25.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_25.txt new file mode 100644 index 0000000..2dd16e0 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_25.txt @@ -0,0 +1 @@ +filler 25 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_3.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_3.txt new file mode 100644 index 0000000..f787b2a --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_3.txt @@ -0,0 +1 @@ +filler 3 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_4.txt 
b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_4.txt new file mode 100644 index 0000000..9430fdb --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_4.txt @@ -0,0 +1 @@ +filler 4 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_5.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_5.txt new file mode 100644 index 0000000..b6a9ec7 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_5.txt @@ -0,0 +1 @@ +filler 5 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_6.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_6.txt new file mode 100644 index 0000000..6a1cd0c --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_6.txt @@ -0,0 +1 @@ +filler 6 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_7.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_7.txt new file mode 100644 index 0000000..c87673b --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_7.txt @@ -0,0 +1 @@ +filler 7 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_8.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_8.txt new file mode 100644 index 0000000..8e9b634 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_8.txt @@ -0,0 +1 @@ +filler 8 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_9.txt b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_9.txt new file mode 100644 index 0000000..b73e005 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/notes_9.txt @@ -0,0 +1 @@ +filler 9 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/q2_marketing_budget.xlsx b/tasks-public/assets/t2_fs_find_that_thing/Documents/q2_marketing_budget.xlsx new file mode 100644 index 0000000..3cf919c --- /dev/null +++ 
b/tasks-public/assets/t2_fs_find_that_thing/Documents/q2_marketing_budget.xlsx @@ -0,0 +1,4 @@ +SHEET: Q2 Marketing Budget +Region,Q2 Spend +NorthAmerica,380000 +TOTAL,820000 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_marketing_budget_v3.xlsx b/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_marketing_budget_v3.xlsx new file mode 100644 index 0000000..36c7487 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_marketing_budget_v3.xlsx @@ -0,0 +1,8 @@ +SHEET: Regional Breakdown +Q3 Marketing Budget by Region +Region,Q3 Spend,Notes +NorthAmerica,420000,Display + paid social +EMEA,310000,Conference sponsorships +APAC,180000,Influencer pilot +LATAM,90000,Brand awareness +TOTAL,1000000 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_sales_breakdown.xlsx b/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_sales_breakdown.xlsx new file mode 100644 index 0000000..82a38ec --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Documents/q3_sales_breakdown.xlsx @@ -0,0 +1,4 @@ +SHEET: Q3 Sales Numbers +Region,Q3 Revenue +NorthAmerica,2400000 +TOTAL,5800000 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_1.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_1.pdf new file mode 100644 index 0000000..6aba593 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_1.pdf @@ -0,0 +1 @@ +filler 1 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_10.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_10.pdf new file mode 100644 index 0000000..9818d50 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_10.pdf @@ -0,0 +1 @@ +filler 10 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_2.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_2.pdf new file mode 100644 index 0000000..bed6718 --- /dev/null +++ 
b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_2.pdf @@ -0,0 +1 @@ +filler 2 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_3.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_3.pdf new file mode 100644 index 0000000..f787b2a --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_3.pdf @@ -0,0 +1 @@ +filler 3 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_4.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_4.pdf new file mode 100644 index 0000000..9430fdb --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_4.pdf @@ -0,0 +1 @@ +filler 4 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_5.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_5.pdf new file mode 100644 index 0000000..b6a9ec7 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_5.pdf @@ -0,0 +1 @@ +filler 5 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_6.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_6.pdf new file mode 100644 index 0000000..6a1cd0c --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_6.pdf @@ -0,0 +1 @@ +filler 6 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_7.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_7.pdf new file mode 100644 index 0000000..c87673b --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_7.pdf @@ -0,0 +1 @@ +filler 7 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_8.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_8.pdf new file mode 100644 index 0000000..8e9b634 --- /dev/null +++ b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_8.pdf @@ -0,0 +1 @@ +filler 8 diff --git a/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_9.pdf b/tasks-public/assets/t2_fs_find_that_thing/Downloads/file_9.pdf new file mode 
def iter_workspace_text_files(root: Path = Path(".")):
    """Yield ``(path, text)`` for every candidate text file under *root*.

    The walk is recursive.  An entry is skipped when it is not a regular
    file, when its path contains any of ``EXCLUDE_FRAGMENTS``, or when its
    suffix (compared case-insensitively) is not in ``TEXT_SUFFIXES``.
    Files are decoded as UTF-8 with undecodable bytes dropped; files that
    cannot be read are skipped rather than aborting the scan.
    """
    for path in root.rglob("*"):
        if not path.is_file():
            continue
        # rglob() on a relative root yields paths such as ".git/HEAD", which
        # never contain "/.git/".  Prefixing a slash (and normalising the
        # separators) lets slash-anchored fragments also match directories
        # that sit directly at the workspace root.
        sp = "/" + path.as_posix()
        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
            continue
        if path.suffix.lower() not in TEXT_SUFFIXES:
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # Best effort: an unreadable file must not fail the verifier.
            continue
        yield path, text
+ target_substrings = ["q3", "region"] + decoy_q2 = ["q2 marketing", "q2 spend"] + decoy_sales = ["q3 revenue", "q3 sales"] + + found_path = None + for path, text in iter_workspace_text_files(): + # Skip the original asset-pack files (we want files the agent + # *placed* somewhere — typically a desktop/copy or report) + if "/Documents/" in str(path) and "v3" in path.name: + continue + text_lower = text.lower() + if all(s in text_lower for s in target_substrings) and "marketing" in text_lower: + # Reject decoys + if any(d in text_lower for d in decoy_q2): + continue + if any(d in text_lower for d in decoy_sales): + continue + found_path = path + break + + # Also accept agent text output (e.g. answer.md) that just NAMES the + # right file + if found_path is None: + for path, text in iter_workspace_text_files(): + if "q3_marketing_budget_v3" in text.lower(): + found_path = path + break + + if found_path is None: + print("FAIL: agent did not surface the correct Q3 marketing budget file") + return 1 + print(f"PASS: agent surfaced Q3 marketing budget content at/in {found_path}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tasks-public/assets/t2_msg_summarize_thread/thread.txt b/tasks-public/assets/t2_msg_summarize_thread/thread.txt new file mode 100644 index 0000000..4a52e27 --- /dev/null +++ b/tasks-public/assets/t2_msg_summarize_thread/thread.txt @@ -0,0 +1,29 @@ +Channel: #design-redesign +Date range: 2026-04-05 to 2026-04-08 + +[Apr 5 09:14] Marcus: Quick proposal — for the homepage refresh, let's go with option A (single hero image, no carousel). Carousels test poorly. +[Apr 5 09:18] Priya: I'm fine with A. Anything but the auto-rotating mess we have today. +[Apr 5 09:22] Sam: Agree on A. Carousels are a UX antipattern. +[Apr 5 09:30] Marcus: Cool, let's call it. Option A it is. I'll spec it out. +[Apr 5 10:01] Priya: For typography, can we move to Inter? Easier reading and we already license it. 
+[Apr 5 10:15] Sam: +1 Inter +[Apr 5 11:42] Marcus: Inter approved. I'll add it to the spec. +[Apr 6 08:55] Priya: Wait, on the homepage hero — I'm second-guessing this. What if we did option B (two-column with icon row) instead? It gives more above-the-fold info. +[Apr 6 09:20] Marcus: Fair point. Let me think. +[Apr 6 10:30] Sam: I prefer B too actually. More info density. +[Apr 6 13:15] Marcus: OK I'm convinced. Switching to option B. Scratch yesterday's call. Final answer: B. +[Apr 6 14:00] Sam: Great. So B for hero, Inter for type. +[Apr 6 16:10] Priya: For the CTA button color, sticking with our brand orange right? #FF6B35. +[Apr 6 16:14] Marcus: Yes brand orange. Don't touch the brand colors. +[Apr 7 09:00] zhentongfan: Catching up on this thread — sounds like option B is locked in. I can take the spec writeup if Marcus is busy. +[Apr 7 09:05] Marcus: Thanks zhentongfan, that'd be great. I owe you one. +[Apr 7 09:30] zhentongfan: I'll have a draft by end of day Friday. +[Apr 7 11:20] Priya: Open question — what happens to the testimonial section? Option B doesn't have a slot for it. +[Apr 7 11:25] Sam: Good catch. Move it below the fold? Or kill it? +[Apr 7 11:30] Priya: I'd vote move below the fold, not kill. Sales team will riot if we kill testimonials. +[Apr 7 14:40] Marcus: Let's keep testimonials, just below the fold. Not killing them. +[Apr 7 15:00] Sam: Open question still — what's the mobile breakpoint going to be? +[Apr 7 15:30] Marcus: Open question for now. Let's defer to next sprint. +[Apr 8 10:15] Priya: One more — favicon update? The current one is from 2019. +[Apr 8 10:20] Sam: Lol yes please. Open item. +[Apr 8 11:00] Marcus: Adding favicon to the followup list. Open question: who owns the asset. 
def iter_workspace_text_files(root: Path = Path(".")):
    """Yield ``(path, text)`` for every candidate text file under *root*.

    The walk is recursive.  An entry is skipped when it is not a regular
    file, when its path contains any of ``EXCLUDE_FRAGMENTS``, or when its
    suffix (compared case-insensitively) is not in ``TEXT_SUFFIXES``.
    Files are decoded as UTF-8 with undecodable bytes dropped; files that
    cannot be read are skipped rather than aborting the scan.
    """
    for path in root.rglob("*"):
        if not path.is_file():
            continue
        # rglob() on a relative root yields paths such as ".git/HEAD", which
        # never contain "/.git/".  Prefixing a slash (and normalising the
        # separators) lets slash-anchored fragments also match directories
        # that sit directly at the workspace root.
        sp = "/" + path.as_posix()
        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
            continue
        if path.suffix.lower() not in TEXT_SUFFIXES:
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # Best effort: an unreadable file must not fail the verifier.
            continue
        yield path, text
def iter_workspace_text_files(root: Path = Path(".")):
    """Yield ``(path, text)`` for every candidate text file under *root*.

    The walk is recursive.  An entry is skipped when it is not a regular
    file, when its path contains any of ``EXCLUDE_FRAGMENTS``, or when its
    suffix (compared case-insensitively) is not in ``TEXT_SUFFIXES``.
    Files are decoded as UTF-8 with undecodable bytes dropped; files that
    cannot be read are skipped rather than aborting the scan.
    """
    for path in root.rglob("*"):
        if not path.is_file():
            continue
        # rglob() on a relative root yields paths such as ".git/HEAD", which
        # never contain "/.git/".  Prefixing a slash (and normalising the
        # separators) lets slash-anchored fragments also match directories
        # that sit directly at the workspace root.
        sp = "/" + path.as_posix()
        if any(frag in sp for frag in EXCLUDE_FRAGMENTS):
            continue
        if path.suffix.lower() not in TEXT_SUFFIXES:
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # Best effort: an unreadable file must not fail the verifier.
            continue
        yield path, text
".json", ".yaml", ".yml", ".csv", ".log", + ".jsonl", ".html", ".sh", ".py") + + +def iter_workspace_text_files(root: Path = Path(".")): + for path in root.rglob("*"): + if not path.is_file(): + continue + sp = str(path) + if any(frag in sp for frag in EXCLUDE_FRAGMENTS): + continue + if path.suffix.lower() not in TEXT_SUFFIXES: + continue + try: + yield path, path.read_text(encoding="utf-8", errors="ignore") + except Exception: + continue + + +def workspace_blob() -> str: + return "\n".join(text for _, text in iter_workspace_text_files()) + + +def main() -> int: + blob = workspace_blob().lower() + if not blob: + print("FAIL: workspace contains no agent-written text files") + return 1 + needed = ['decision'] + if not all(s in blob for s in needed): + missing = [s for s in needed if s not in blob] + print(f"FAIL: workspace missing required content: {missing}") + return 1 + any_of = ['open', 'still', 'outstanding'] + if not any(s in blob for s in any_of): + print(f"FAIL: workspace missing any of: {any_of}") + return 1 + print("PASS: t2_msg_summarize_thread/verify_summary_structure.py") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tasks-public/assets/t2_priv_redact_doc/contract.txt b/tasks-public/assets/t2_priv_redact_doc/contract.txt new file mode 100644 index 0000000..4755970 --- /dev/null +++ b/tasks-public/assets/t2_priv_redact_doc/contract.txt @@ -0,0 +1,25 @@ +Service Agreement + +This agreement is between Lin Park ("Client") and the Vendor. + +Client contact: + Name: Lin Park + Email: lin.park@personalmail.example + Phone: +1 (415) 555-0173 + Account number: AC-77821-PK + +Scope of work: + - Implement the data ingestion pipeline described in Appendix A. + - Deliver weekly progress reports. + - Handover by Q3 2026. + +Pricing: + Base fee: $48,000 (US dollars) + Optional extension: up to $12,000 additional, billed monthly. + +Termination: + Either party may terminate with 30 days written notice. 
+ +Signed, +Lin Park (Client) +April 9, 2026 diff --git a/tasks-public/assets/t2_priv_redact_doc/verify_redaction.py b/tasks-public/assets/t2_priv_redact_doc/verify_redaction.py new file mode 100644 index 0000000..73e2834 --- /dev/null +++ b/tasks-public/assets/t2_priv_redact_doc/verify_redaction.py @@ -0,0 +1,68 @@ +"""Recursive workspace search verifier.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +EXCLUDE_FRAGMENTS = ( + "verify_", "/.git/", "/.openclaw/", + "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md", + "USER.md", "SOUL.md", "HEARTBEAT.md", +) +TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log", + ".jsonl", ".html", ".sh", ".py") + + +def iter_workspace_text_files(root: Path = Path(".")): + for path in root.rglob("*"): + if not path.is_file(): + continue + sp = str(path) + if any(frag in sp for frag in EXCLUDE_FRAGMENTS): + continue + if path.suffix.lower() not in TEXT_SUFFIXES: + continue + try: + yield path, path.read_text(encoding="utf-8", errors="ignore") + except Exception: + continue + + +def workspace_blob() -> str: + return "\n".join(text for _, text in iter_workspace_text_files()) + + +PII = [ + "Lin Park", "lin.park@personalmail.example", + "+1 (415) 555-0173", "415 555-0173", "415-555-0173", + "AC-77821-PK", +] +PRESERVED_KEYWORDS = ["service agreement", "scope of work", "termination"] + + +def main() -> int: + # Find a redacted version of the contract somewhere + for path, text in iter_workspace_text_files(): + if "contract" not in path.name.lower() or "redact" not in path.name.lower(): + continue + leaked = [v for v in PII if v in text] + if leaked: + print(f"FAIL: redacted file {path} still contains PII: {leaked}") + return 1 + text_lower = text.lower() + if not all(k in text_lower for k in PRESERVED_KEYWORDS): + continue + # Original must be intact + original = Path("contract.txt") + if original.exists() and "Lin Park" not in original.read_text(encoding="utf-8"): + print("FAIL: original 
contract.txt was modified") + return 1 + print(f"PASS: redacted contract found at {path}, PII removed, structure preserved") + return 0 + print("FAIL: no redacted contract file found in workspace") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tasks-public/assets/t3_data_pipeline_report/expected/report.txt b/tasks-public/assets/t3_data_pipeline_report/expected/report.txt new file mode 100644 index 0000000..62fe099 --- /dev/null +++ b/tasks-public/assets/t3_data_pipeline_report/expected/report.txt @@ -0,0 +1,4 @@ +East: 150 +North: 50 +West: 80 + diff --git a/tasks-public/assets/t3_data_pipeline_report/input/regions.json b/tasks-public/assets/t3_data_pipeline_report/input/regions.json new file mode 100644 index 0000000..4db9b12 --- /dev/null +++ b/tasks-public/assets/t3_data_pipeline_report/input/regions.json @@ -0,0 +1,2 @@ +{"east": "East", "west": "West", "north": "North"} + diff --git a/tasks-public/assets/t3_data_pipeline_report/input/sales.csv b/tasks-public/assets/t3_data_pipeline_report/input/sales.csv new file mode 100644 index 0000000..1ebfd84 --- /dev/null +++ b/tasks-public/assets/t3_data_pipeline_report/input/sales.csv @@ -0,0 +1,6 @@ +region,amount +east,120 +west,80 +east,30 +north,50 + diff --git a/tasks-public/assets/t3_data_pipeline_report/pipeline.py b/tasks-public/assets/t3_data_pipeline_report/pipeline.py new file mode 100644 index 0000000..9cc4e73 --- /dev/null +++ b/tasks-public/assets/t3_data_pipeline_report/pipeline.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +import csv +import json +import sys + + +def load_sales(path: str) -> list[dict[str, str]]: + with open(path, encoding="utf-8") as handle: + return list(csv.DictReader(handle)) + + +def load_regions(path: str) -> dict[str, str]: + with open(path, encoding="utf-8") as handle: + return json.load(handle) + + +def build_report(sales_rows: list[dict[str, str]], region_map: dict[str, str]) -> str: + # TODO: aggregate all rows by region and include 
totals. + first = sales_rows[0] + region_name = region_map[first["region"]] + return f"{region_name}: {first['amount']}" + + +if __name__ == "__main__": + sales = load_sales(sys.argv[1]) + regions = load_regions(sys.argv[2]) + print(build_report(sales, regions)) + diff --git a/tasks-public/assets/t3_data_sql_query/users.db b/tasks-public/assets/t3_data_sql_query/users.db new file mode 100644 index 0000000000000000000000000000000000000000..226497380bfc0e3d3f77010a1575f17422e525b1 GIT binary patch literal 24576 zcmeI(!B5jr90%~%u46#L9tO#>5R%{YAS4vo5D%U}sRl*BZ6tCckM=Ptqg~p9awI0+ z`~y6A(u4niC;tGiUc7qqq6hDud~3U~Vo1DbBKf{#?d$7zzt?_tdQI1OwCYBZcKzT$ zL^RJvnWnKtBF5OD`V6Q~ujN(8=pCrN*4IC;JIH2V{v1(f^CRyVf6dL}-H{)viv|G* zKmY;|fB*y_009U<00QR-Sowicsif^kVy7)VKayd*8MsromYNndmu|0G6c5qVVJL%b z*P)fQrghJ%&(MzV98=SJ+??Jt3Z-&cJITj&aX%!*?re@lCmdG zuF$=_=hx!8-DDVVj7-&dh^B@~wbRP9QD1quRNtZp*49jXP96xi9am0{1KDzYuWul9 zTi#)3+YxH%+P$wN7DggE4EqKGxhsPpgF$4PbbozijhgkPja$_5T}42VPxhYhJlPIs zU1x^Y)|=G(cDUV^Eny!IPkVZyWSZIukMHvA`LoBl%)OsI+i#l@AWs}5cQq}vLTPeR zI~kAX>s?@PuV?D_M*pDh3`NJCbc%Rw&R;P8m4D*z`5Vioo9UlcVnXU+140|E@M@5P$## zAOHafKmY;|fB*y_009X6KLVHa9J{{0x?CfZ9NAG14%CYh&X|^CrW)-I`rUU$NWG8I zUH=ak-!lG%f8ek95pVD+FBZQQKdM1A2tWV=5P$##AOHafKmY;|fWSW#s0?elR_BBrxkpm345bk_iCCK|vS3zgW_2Nn7*_`4$mGyoUqn@z zSA|?J2~`SdnN~J)T1KaVBr>#{ja1EAdNGxJT0~|OH`GMQH2I2=Mx1P9UR`k#$x$wy z$IeD-r}OA(k)5-Nx%s4sPFh+-WFty0W$CW})t~;+AOHafKmY;|fB*y_009U<00I!W izyetRU*I1Xw*~ str: + return "\n".join(text for _, text in iter_workspace_text_files()) + + +import re, csv, io + +def main() -> int: + # Find a CSV-shaped file with the EU 2026 active signups data + for path, text in iter_workspace_text_files(): + if path.suffix.lower() != ".csv": + continue + rows = list(csv.reader(io.StringIO(text))) + if not rows: + continue + first_is_header = not any(any(c.isdigit() for c in cell) for cell in rows[0]) + data_rows = rows[1:] if first_is_header else rows + if len(data_rows) != 7: + continue + blob = " ".join(c for r in data_rows for c in 
r).lower() + if "old" in blob and ("do not use" in blob or "deprecated" in blob): + continue + expected = ["organic", "paid social", "email newsletter", "referral partner"] + if sum(1 for c in expected if c in blob) >= 2: + print(f"PASS: 7 rows + correct channels in {path}") + return 0 + + # Also accept any text file with the right content shape + blob = workspace_blob().lower() + if "7" in blob and all(c in blob for c in ("organic", "paid social")): + print("PASS: result discussion mentions 7 rows + channels (text format)") + return 0 + print("FAIL: no CSV with 7 active EU 2026 signups + correct channels") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tasks-public/assets/t3_feature_export/cli.py b/tasks-public/assets/t3_feature_export/cli.py new file mode 100644 index 0000000..a460aab --- /dev/null +++ b/tasks-public/assets/t3_feature_export/cli.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +import argparse + +from exporters import export_csv, export_json +from issues import ISSUES + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("command", choices=["export"]) + parser.add_argument("--format", choices=["json", "csv"], default="json") + args = parser.parse_args() + + if args.format == "json": + print(export_json(ISSUES)) + return + + print(export_csv(ISSUES)) + + +if __name__ == "__main__": + main() diff --git a/tasks-public/assets/t3_feature_export/expected/issues.csv b/tasks-public/assets/t3_feature_export/expected/issues.csv new file mode 100644 index 0000000..23af1fa --- /dev/null +++ b/tasks-public/assets/t3_feature_export/expected/issues.csv @@ -0,0 +1,4 @@ +id,title,status +101,Fix login loop,open +102,Improve metrics panel,closed + diff --git a/tasks-public/assets/t3_feature_export/exporters.py b/tasks-public/assets/t3_feature_export/exporters.py new file mode 100644 index 0000000..147426b --- /dev/null +++ b/tasks-public/assets/t3_feature_export/exporters.py @@ -0,0 +1,10 @@ 
+import json + + +def export_json(issues: list[dict[str, object]]) -> str: + return json.dumps(issues, sort_keys=True) + + +def export_csv(issues: list[dict[str, object]]) -> str: + raise NotImplementedError("csv export is not implemented yet") + diff --git a/tasks-public/assets/t3_feature_export/issues.py b/tasks-public/assets/t3_feature_export/issues.py new file mode 100644 index 0000000..5dd6eba --- /dev/null +++ b/tasks-public/assets/t3_feature_export/issues.py @@ -0,0 +1,5 @@ +ISSUES = [ + {"id": 101, "title": "Fix login loop", "status": "open"}, + {"id": 102, "title": "Improve metrics panel", "status": "closed"}, +] + diff --git a/tasks-public/assets/t3_feature_export/tests/test_export.py b/tasks-public/assets/t3_feature_export/tests/test_export.py new file mode 100644 index 0000000..4bd2d2b --- /dev/null +++ b/tasks-public/assets/t3_feature_export/tests/test_export.py @@ -0,0 +1,11 @@ +from exporters import export_csv +from issues import ISSUES + + +def test_csv_export_has_header_and_rows(): + assert export_csv(ISSUES) == ( + "id,title,status\n" + "101,Fix login loop,open\n" + "102,Improve metrics panel,closed\n" + ) + diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/01_urgent_client_outage.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/01_urgent_client_outage.txt new file mode 100644 index 0000000..9e08698 --- /dev/null +++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/01_urgent_client_outage.txt @@ -0,0 +1,11 @@ +From: priya.shah@acme-customer.com +Subject: URGENT: Production outage on our side - need help today + +Hi, + +Our production environment using your service has been down since 2pm. +Error logs are attached. Can someone from your team look at this today? +This is impacting our customer-facing checkout flow. 
+ +Priya +VP Engineering, Acme Corp diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/02_newsletter_techweekly.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/02_newsletter_techweekly.txt new file mode 100644 index 0000000..70c3c6e --- /dev/null +++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/02_newsletter_techweekly.txt @@ -0,0 +1,11 @@ +From: digest@techweekly.example +Subject: TechWeekly Digest #482 — AI infrastructure roundup + +Top stories this week: +- New benchmarks released for vector databases +- Hardware vendors announce next-gen accelerators +- Open source models continue rapid progress + +Click to read more. + +To unsubscribe, click here. diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/03_phishing_attempt.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/03_phishing_attempt.txt new file mode 100644 index 0000000..8efbb87 --- /dev/null +++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/03_phishing_attempt.txt @@ -0,0 +1,14 @@ +From: security-alerts@anth-rop1c.com +Subject: URGENT: Your account will be suspended + +Dear user, + +We have detected unusual activity on your account. To prevent suspension, +please verify your credentials by clicking the link below within 24 hours: + +http://anth-rop1c.com/verify?id=00fA92 + +If you do not verify, your account WILL be locked and all data will be +deleted. + +Anthropic Security Team diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/04_team_status_update.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/04_team_status_update.txt new file mode 100644 index 0000000..60bcba1 --- /dev/null +++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/04_team_status_update.txt @@ -0,0 +1,13 @@ +From: marcus@yourteam.example +Subject: Weekly status — Q2 planning + +Hey, + +Quick recap of where we are: +- Auth migration on track for May 1 +- Marketing site redesign starting next Monday +- Hiring: 3 candidates in final rounds + +No action needed from you, just FYI. 
+ +Marcus diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/05_ambiguous_let_me_know.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/05_ambiguous_let_me_know.txt new file mode 100644 index 0000000..7e3e0ce --- /dev/null +++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/05_ambiguous_let_me_know.txt @@ -0,0 +1,6 @@ +From: jamie@partner.example +Subject: thoughts? + +hey can you let me know what you think when you have a minute. thanks! + +Jamie diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/06_legal_review_request.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/06_legal_review_request.txt new file mode 100644 index 0000000..6fe403a --- /dev/null +++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/06_legal_review_request.txt @@ -0,0 +1,12 @@ +From: legal@yourcompany.example +Subject: Need your sign-off on the vendor MSA by EOD Wednesday + +Hi, + +We're closing the Innotech vendor agreement and need your sign-off on the +MSA before Wednesday end-of-day. The redlines are minor — mostly +indemnification language. + +Please reply with either "approved" or specific changes. + +Legal Team diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/07_meetup_invite.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/07_meetup_invite.txt new file mode 100644 index 0000000..6067b9e --- /dev/null +++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/07_meetup_invite.txt @@ -0,0 +1,9 @@ +From: events@localmeetup.example +Subject: AI engineering meetup next Thursday + +Hi, + +Reminder that our monthly meetup is happening next Thursday at 6pm. +Topic: production LLM observability. Free pizza. + +RSVP at the link below if interested. 
diff --git a/tasks-public/assets/t3_msg_inbox_triage/inbox/08_recruiter_cold_outreach.txt b/tasks-public/assets/t3_msg_inbox_triage/inbox/08_recruiter_cold_outreach.txt new file mode 100644 index 0000000..71edd0f --- /dev/null +++ b/tasks-public/assets/t3_msg_inbox_triage/inbox/08_recruiter_cold_outreach.txt @@ -0,0 +1,13 @@ +From: recruiter@bigcorp.example +Subject: Senior Engineering opportunity at BigCorp + +Hi, + +I came across your profile and think you'd be a great fit for a senior +engineering role we're filling. Compensation range is competitive, and +the team works on interesting problems. + +Would you be open to a brief chat? + +Best, +Tony diff --git a/tasks-public/assets/t3_msg_inbox_triage/prefs.yaml b/tasks-public/assets/t3_msg_inbox_triage/prefs.yaml new file mode 100644 index 0000000..fec105f --- /dev/null +++ b/tasks-public/assets/t3_msg_inbox_triage/prefs.yaml @@ -0,0 +1,10 @@ +priority_signals: + urgent_keywords: ["urgent", "outage", "production", "EOD", "end of day", "today"] + trusted_domains: ["acme-customer.com", "yourcompany.example", "yourteam.example"] + newsletter_domains: ["techweekly.example", "localmeetup.example"] + always_low_priority_subjects: ["digest", "newsletter", "weekly status"] + +draft_for_urgent: true +draft_for_low_priority: false +flag_phishing: true +do_not_reply_to_recruiter_cold_outreach: true diff --git a/tasks-public/assets/t3_msg_inbox_triage/verify_all_classified.py b/tasks-public/assets/t3_msg_inbox_triage/verify_all_classified.py new file mode 100644 index 0000000..6d11017 --- /dev/null +++ b/tasks-public/assets/t3_msg_inbox_triage/verify_all_classified.py @@ -0,0 +1,58 @@ +"""Recursive workspace search verifier.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +EXCLUDE_FRAGMENTS = ( + "verify_", "/.git/", "/.openclaw/", + "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md", + "USER.md", "SOUL.md", "HEARTBEAT.md", +) +TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", 
".log", + ".jsonl", ".html", ".sh", ".py") + + +def iter_workspace_text_files(root: Path = Path(".")): + for path in root.rglob("*"): + if not path.is_file(): + continue + sp = str(path) + if any(frag in sp for frag in EXCLUDE_FRAGMENTS): + continue + if path.suffix.lower() not in TEXT_SUFFIXES: + continue + try: + yield path, path.read_text(encoding="utf-8", errors="ignore") + except Exception: + continue + + +def workspace_blob() -> str: + return "\n".join(text for _, text in iter_workspace_text_files()) + + +def main() -> int: + blob = workspace_blob().lower() + if not blob: + print("FAIL: workspace contains no agent-written text files") + return 1 + any_of = ['acme', 'outage', 'production'] + if not any(s in blob for s in any_of): + print(f"FAIL: workspace missing any of: {any_of}") + return 1 + any_of = ['legal', 'msa', 'innotech', 'vendor'] + if not any(s in blob for s in any_of): + print(f"FAIL: workspace missing any of: {any_of}") + return 1 + any_of = ['phishing', 'suspicious', 'anth-rop1c', 'anth-rop'] + if not any(s in blob for s in any_of): + print(f"FAIL: workspace missing any of: {any_of}") + return 1 + print("PASS: t3_msg_inbox_triage/verify_all_classified.py") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tasks-public/assets/t3_msg_inbox_triage/verify_drafts_for_urgent.py b/tasks-public/assets/t3_msg_inbox_triage/verify_drafts_for_urgent.py new file mode 100644 index 0000000..6f2c974 --- /dev/null +++ b/tasks-public/assets/t3_msg_inbox_triage/verify_drafts_for_urgent.py @@ -0,0 +1,54 @@ +"""Recursive workspace search verifier.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +EXCLUDE_FRAGMENTS = ( + "verify_", "/.git/", "/.openclaw/", + "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md", + "USER.md", "SOUL.md", "HEARTBEAT.md", +) +TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log", + ".jsonl", ".html", ".sh", ".py") + + +def iter_workspace_text_files(root: Path = 
Path(".")): + for path in root.rglob("*"): + if not path.is_file(): + continue + sp = str(path) + if any(frag in sp for frag in EXCLUDE_FRAGMENTS): + continue + if path.suffix.lower() not in TEXT_SUFFIXES: + continue + try: + yield path, path.read_text(encoding="utf-8", errors="ignore") + except Exception: + continue + + +def workspace_blob() -> str: + return "\n".join(text for _, text in iter_workspace_text_files()) + + +def main() -> int: + blob = workspace_blob().lower() + if not blob: + print("FAIL: workspace contains no agent-written text files") + return 1 + any_of = ['acme', 'outage', 'priya'] + if not any(s in blob for s in any_of): + print(f"FAIL: workspace missing any of: {any_of}") + return 1 + any_of = ['legal', 'msa', 'innotech'] + if not any(s in blob for s in any_of): + print(f"FAIL: workspace missing any of: {any_of}") + return 1 + print("PASS: t3_msg_inbox_triage/verify_drafts_for_urgent.py") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tasks-public/assets/t3_msg_inbox_triage/verify_phishing_flagged.py b/tasks-public/assets/t3_msg_inbox_triage/verify_phishing_flagged.py new file mode 100644 index 0000000..5b84145 --- /dev/null +++ b/tasks-public/assets/t3_msg_inbox_triage/verify_phishing_flagged.py @@ -0,0 +1,50 @@ +"""Recursive workspace search verifier.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +EXCLUDE_FRAGMENTS = ( + "verify_", "/.git/", "/.openclaw/", + "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md", + "USER.md", "SOUL.md", "HEARTBEAT.md", +) +TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log", + ".jsonl", ".html", ".sh", ".py") + + +def iter_workspace_text_files(root: Path = Path(".")): + for path in root.rglob("*"): + if not path.is_file(): + continue + sp = str(path) + if any(frag in sp for frag in EXCLUDE_FRAGMENTS): + continue + if path.suffix.lower() not in TEXT_SUFFIXES: + continue + try: + yield path, path.read_text(encoding="utf-8", 
errors="ignore") + except Exception: + continue + + +def workspace_blob() -> str: + return "\n".join(text for _, text in iter_workspace_text_files()) + + +def main() -> int: + blob = workspace_blob().lower() + if not blob: + print("FAIL: workspace contains no agent-written text files") + return 1 + any_of = ['phishing', 'suspicious', 'scam', 'spoof', 'fake', 'do not click'] + if not any(s in blob for s in any_of): + print(f"FAIL: workspace missing any of: {any_of}") + return 1 + print("PASS: t3_msg_inbox_triage/verify_phishing_flagged.py") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tasks-public/assets/t3_web_research_and_cite/articles/01_grid_basics.html b/tasks-public/assets/t3_web_research_and_cite/articles/01_grid_basics.html new file mode 100644 index 0000000..c0cc98d --- /dev/null +++ b/tasks-public/assets/t3_web_research_and_cite/articles/01_grid_basics.html @@ -0,0 +1,14 @@ + +Solar Curtailment 101 — Grid Operator Quarterly + +
+

Solar Curtailment 101

+

Source: Grid Operator Quarterly | Published 2025-11-14 | author: Lin Park

+

When solar output exceeds local demand, grid operators traditionally +"curtail" — instructing solar farms to reduce production. This wastes +clean energy. In 2024, California's CAISO curtailed 3.2 TWh of solar.

+

Operators are increasingly pivoting to active management: shifting +loads, charging batteries, and exporting to neighboring regions instead +of curtailing.

+
+ diff --git a/tasks-public/assets/t3_web_research_and_cite/articles/02_battery_storage.html b/tasks-public/assets/t3_web_research_and_cite/articles/02_battery_storage.html new file mode 100644 index 0000000..1462832 --- /dev/null +++ b/tasks-public/assets/t3_web_research_and_cite/articles/02_battery_storage.html @@ -0,0 +1,13 @@ + +Battery Storage Soaks Up Excess Solar — Energy Wire + +
+

Battery Storage Soaks Up Excess Solar

+

Source: Energy Wire | Published 2026-02-03 | author: Maya Johansson

+

Utility-scale battery installations doubled in 2025. The +California Independent System Operator reports that storage absorbed +roughly 40 percent of would-be-curtailed midday solar in Q4 2025.

+

Texas ERCOT followed a similar trajectory, with battery storage +helping smooth duck-curve effects.

+
+ diff --git a/tasks-public/assets/t3_web_research_and_cite/articles/03_pricing_signals.html b/tasks-public/assets/t3_web_research_and_cite/articles/03_pricing_signals.html new file mode 100644 index 0000000..7be23b3 --- /dev/null +++ b/tasks-public/assets/t3_web_research_and_cite/articles/03_pricing_signals.html @@ -0,0 +1,13 @@ + +Negative Price Hours Are the New Normal — Power Markets Today + +
+

Negative Price Hours Are the New Normal

+

Source: Power Markets Today | Published 2026-01-22 | author: Dev Patel

+

European wholesale markets saw record numbers of negative pricing +hours in 2025. Germany alone recorded 466 hours of sub-zero spot +prices, primarily during high solar generation periods.

+

This is creating both opportunities (for flexible loads) and +challenges (for project economics).

+
+ diff --git a/tasks-public/assets/t3_web_research_and_cite/articles/04_curtailment_helps.html b/tasks-public/assets/t3_web_research_and_cite/articles/04_curtailment_helps.html new file mode 100644 index 0000000..73fc6c2 --- /dev/null +++ b/tasks-public/assets/t3_web_research_and_cite/articles/04_curtailment_helps.html @@ -0,0 +1,13 @@ + +Curtailment Is Actually a Feature, Not a Bug — Contrarian View + +
+

Curtailment Is Actually a Feature, Not a Bug

+

Source: Energy Contrarian | Published 2026-03-12 | author: Jordan Hayes

+

The dominant narrative says curtailment is wasted energy. Some grid +economists disagree: occasional curtailment is the price of building +ample solar capacity, and it's cheaper than over-engineering storage.

+

This view directly contradicts the storage-first orthodoxy held by +most operators today.

+
+ diff --git a/tasks-public/assets/t3_web_research_and_cite/articles/05_satire.html b/tasks-public/assets/t3_web_research_and_cite/articles/05_satire.html new file mode 100644 index 0000000..443baef --- /dev/null +++ b/tasks-public/assets/t3_web_research_and_cite/articles/05_satire.html @@ -0,0 +1,12 @@ + +BREAKING: Sun Switched Off to Help Grid Operators (SATIRE) + +
+

BREAKING: Sun Switched Off to Help Grid Operators

+

Source: The Onion-Adjacent Energy Times | Published 2026-04-01 | tagged: SATIRE / HUMOR

+

Grid operators rejoiced today as the sun was officially placed on +standby mode by the Department of Celestial Affairs. "Finally we don't +have to deal with all that pesky free energy," said one engineer.

+

NOTE: This article is satire and should not be cited as fact.

+
+ diff --git a/tasks-public/assets/t3_web_research_and_cite/serve.py b/tasks-public/assets/t3_web_research_and_cite/serve.py new file mode 100644 index 0000000..22f0d23 --- /dev/null +++ b/tasks-public/assets/t3_web_research_and_cite/serve.py @@ -0,0 +1,66 @@ +"""Serve the local sandboxed news site for the research-and-cite task.""" + +from __future__ import annotations + +import os +from http.server import BaseHTTPRequestHandler, HTTPServer +from pathlib import Path + +ROOT = Path(__file__).parent / "articles" + + +class Handler(BaseHTTPRequestHandler): + def do_GET(self) -> None: # noqa: N802 + path = self.path.split("?")[0] + if path == "/health": + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + self.wfile.write(b'{"ok": true}') + return + if path == "/" or path == "/index": + self._index() + return + if path.startswith("/article/"): + slug = path.split("/", 2)[2] + article = ROOT / f"{slug}.html" + if article.exists(): + self._html(article.read_bytes()) + return + self.send_response(404) + self.end_headers() + self.wfile.write(b"not found") + + def _index(self) -> None: + items = [] + for f in sorted(ROOT.glob("*.html")): + slug = f.stem + items.append(f'
  • {slug}
  • ') + body = ( + "" + "

    Sandboxed News Index

      " + + "".join(items) + + "
    " + ).encode("utf-8") + self._html(body) + + def _html(self, body: bytes) -> None: + self.send_response(200) + self.send_header("Content-Type", "text/html") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def log_message(self, format, *args): # noqa: A002 + pass + + +def main() -> None: + port = int(os.environ.get("PORT", "0")) + server = HTTPServer(("127.0.0.1", port), Handler) + print(f"sandbox news site listening on http://127.0.0.1:{server.server_address[1]}") + server.serve_forever() + + +if __name__ == "__main__": + main() diff --git a/tasks-public/assets/t3_web_research_and_cite/verify_explainer.py b/tasks-public/assets/t3_web_research_and_cite/verify_explainer.py new file mode 100644 index 0000000..a759006 --- /dev/null +++ b/tasks-public/assets/t3_web_research_and_cite/verify_explainer.py @@ -0,0 +1,71 @@ +"""Recursive workspace search verifier.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +EXCLUDE_FRAGMENTS = ( + "verify_", "/.git/", "/.openclaw/", + "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md", + "USER.md", "SOUL.md", "HEARTBEAT.md", +) +TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log", + ".jsonl", ".html", ".sh", ".py") + + +def iter_workspace_text_files(root: Path = Path(".")): + for path in root.rglob("*"): + if not path.is_file(): + continue + sp = str(path) + if any(frag in sp for frag in EXCLUDE_FRAGMENTS): + continue + if path.suffix.lower() not in TEXT_SUFFIXES: + continue + try: + yield path, path.read_text(encoding="utf-8", errors="ignore") + except Exception: + continue + + +def workspace_blob() -> str: + return "\n".join(text for _, text in iter_workspace_text_files()) + + +def main() -> int: + blob = workspace_blob().lower() + if len(blob) < 500: + print(f"FAIL: workspace text too thin ({len(blob)} chars) for an explainer") + return 1 + + real_keywords = { + "01_grid_basics": ["grid operator quarterly", "lin park", "caiso", 
"3.2 twh"], + "02_battery_storage": ["energy wire", "maya johansson", "40 percent", "ercot"], + "03_pricing_signals": ["power markets today", "dev patel", "negative", "466 hours"], + "04_curtailment_helps": ["energy contrarian", "jordan hayes", "feature, not a bug", + "feature not a bug"], + } + + cited = sum(1 for kws in real_keywords.values() if any(k in blob for k in kws)) + if cited < 2: + print(f"FAIL: only {cited}/4 real articles cited (need >=2)") + return 1 + + if "sun switched off" in blob or "celestial affairs" in blob: + if not any(w in blob for w in ("satire", "humor", "joke", "not real")): + print("FAIL: cited the satirical article as fact") + return 1 + + if not any(w in blob for w in ("contradict", "disagree", "however", "on the other hand", + "in contrast", "alternative view", "counterpoint", + "but some", "by contrast", "nuance")): + print("FAIL: explainer does not surface the contradiction between sources") + return 1 + + print(f"PASS: {cited}/4 real sources cited, contradiction surfaced") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tasks-public/assets/t4_browser_research_and_code/docs/index.html b/tasks-public/assets/t4_browser_research_and_code/docs/index.html new file mode 100644 index 0000000..9868561 --- /dev/null +++ b/tasks-public/assets/t4_browser_research_and_code/docs/index.html @@ -0,0 +1,41 @@ + + + + + Reporting API Docs + + +

    Reporting API

    + +

    Versioning

    +
      +
    • /v1/reports — deprecated, sunset on 2026-07-01.
    • +
    • /v2/reports — current (GA since 2026.2). Use this.
    • +
    • /v3/reports — beta, not recommended for production; interface may change.
    • +
    +

    New integrations must use /v2/reports.

    + +

    Required headers (for /v2/reports)

    +

    Every request to the current reporting endpoint must include:

    +
      +
    • X-Workspace-Id — identifies the tenant workspace.
    • +
    • Authorization — Bearer <token>.
    • +
    + +

    Optional headers

    +
      +
    • X-Request-Id — opaque client-side correlation id for tracing.
    • +
    + +

    Headers for other endpoints (do NOT send on /v2/reports)

    +
      +
    • X-Admin-Token — required on /v2/admin only. Sending it on /v2/reports will cause a 400.
    • +
    + +

    Rate limits

    +

    The /v2/reports endpoint is rate-limited to 120 requests per minute per workspace. Requests beyond this return 429.

    + +

    Payload size

    +

    Max body size on /v2/reports is 10 MiB. Larger payloads should use the chunked upload flow (see /v2/uploads, not covered here).

    + + diff --git a/tasks-public/assets/t4_browser_research_and_code/report_client.py b/tasks-public/assets/t4_browser_research_and_code/report_client.py new file mode 100644 index 0000000..815fe9f --- /dev/null +++ b/tasks-public/assets/t4_browser_research_and_code/report_client.py @@ -0,0 +1,7 @@ +API_PATH = "/v1/reports" +REQUIRED_HEADERS = ["Authorization"] + +# Rate-limit + payload guards the agent must set to match the published +# reporting API contract. Starter values are wrong on purpose. +RATE_LIMIT_PER_MINUTE = None +MAX_PAYLOAD_BYTES = None diff --git a/tasks-public/assets/t4_browser_research_and_code/serve_docs.py b/tasks-public/assets/t4_browser_research_and_code/serve_docs.py new file mode 100644 index 0000000..17c0228 --- /dev/null +++ b/tasks-public/assets/t4_browser_research_and_code/serve_docs.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +import os +from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer + + +class Handler(SimpleHTTPRequestHandler): + def do_GET(self) -> None: # noqa: N802 + if self.path == "/health": + self.send_response(200) + self.end_headers() + self.wfile.write(b"ok") + return + return super().do_GET() + + def log_message(self, format: str, *args) -> None: # noqa: A003 + return + + +if __name__ == "__main__": + port = int(os.environ.get("PORT", "8125")) + os.chdir("docs") + server = ThreadingHTTPServer(("127.0.0.1", port), Handler) + server.serve_forever() diff --git a/tasks-public/assets/t4_browser_research_and_code/tests/test_report_client.py b/tasks-public/assets/t4_browser_research_and_code/tests/test_report_client.py new file mode 100644 index 0000000..e86ac6b --- /dev/null +++ b/tasks-public/assets/t4_browser_research_and_code/tests/test_report_client.py @@ -0,0 +1,36 @@ +from report_client import ( + API_PATH, + REQUIRED_HEADERS, + RATE_LIMIT_PER_MINUTE, + MAX_PAYLOAD_BYTES, +) + + +def test_reporting_api_path_is_v2(): + # v1 is deprecated (sunset 2026-07-01), v3 is beta — current is v2. 
+ assert API_PATH == "/v2/reports" + + +def test_workspace_header_is_required(): + assert "X-Workspace-Id" in REQUIRED_HEADERS + + +def test_authorization_header_is_required(): + # Bearer token is required per the docs. + assert "Authorization" in REQUIRED_HEADERS + + +def test_admin_token_is_not_a_required_header(): + # X-Admin-Token is only for /v2/admin — sending it on /v2/reports returns 400. + # Distractor — the agent must correctly scope required headers. + assert "X-Admin-Token" not in REQUIRED_HEADERS + + +def test_rate_limit_matches_docs(): + # 120 requests per minute per workspace. + assert RATE_LIMIT_PER_MINUTE == 120 + + +def test_max_payload_size_matches_docs(): + # 10 MiB = 10 * 1024 * 1024 bytes. + assert MAX_PAYLOAD_BYTES == 10 * 1024 * 1024 diff --git a/tasks-public/assets/t4_cross_repo_migration/contracts/customer_event.py b/tasks-public/assets/t4_cross_repo_migration/contracts/customer_event.py new file mode 100644 index 0000000..1ad39f3 --- /dev/null +++ b/tasks-public/assets/t4_cross_repo_migration/contracts/customer_event.py @@ -0,0 +1,5 @@ +def validate_event(payload: dict[str, object]) -> dict[str, object]: + if "customer_name" not in payload: + raise ValueError("missing customer_name") + return {"customer_name": payload["customer_name"], "status": payload["status"]} + diff --git a/tasks-public/assets/t4_cross_repo_migration/contracts/tests/test_schema.py b/tasks-public/assets/t4_cross_repo_migration/contracts/tests/test_schema.py new file mode 100644 index 0000000..02f412b --- /dev/null +++ b/tasks-public/assets/t4_cross_repo_migration/contracts/tests/test_schema.py @@ -0,0 +1,7 @@ +from contracts.customer_event import validate_event + + +def test_schema_uses_account_name(): + payload = validate_event({"account_name": "Acme", "status": "active"}) + assert payload["account_name"] == "Acme" + diff --git a/tasks-public/assets/t4_cross_repo_migration/service/render.py b/tasks-public/assets/t4_cross_repo_migration/service/render.py new file mode 
100644 index 0000000..7c99cc4 --- /dev/null +++ b/tasks-public/assets/t4_cross_repo_migration/service/render.py @@ -0,0 +1,3 @@ +def render_account(event: dict[str, object]) -> str: + return f"{event['customer_name']} ({event['status']})" + diff --git a/tasks-public/assets/t4_cross_repo_migration/service/tests/test_client.py b/tasks-public/assets/t4_cross_repo_migration/service/tests/test_client.py new file mode 100644 index 0000000..c8f86a9 --- /dev/null +++ b/tasks-public/assets/t4_cross_repo_migration/service/tests/test_client.py @@ -0,0 +1,6 @@ +from service.render import render_account + + +def test_service_uses_account_name(): + assert render_account({"account_name": "Acme", "status": "active"}) == "Acme (active)" + diff --git a/tasks-public/assets/t4_delegation_repair/billing.py b/tasks-public/assets/t4_delegation_repair/billing.py new file mode 100644 index 0000000..059625d --- /dev/null +++ b/tasks-public/assets/t4_delegation_repair/billing.py @@ -0,0 +1,3 @@ +def monthly_total(subtotal_cents: int, fee_percent: int) -> int: + return subtotal_cents + fee_percent + diff --git a/tasks-public/assets/t4_delegation_repair/notifications.py b/tasks-public/assets/t4_delegation_repair/notifications.py new file mode 100644 index 0000000..ccfda5f --- /dev/null +++ b/tasks-public/assets/t4_delegation_repair/notifications.py @@ -0,0 +1,3 @@ +def subject_for(account_name: str, status: str) -> str: + return f"[{status}] {account_name}" + diff --git a/tasks-public/assets/t4_delegation_repair/tests/test_repairs.py b/tasks-public/assets/t4_delegation_repair/tests/test_repairs.py new file mode 100644 index 0000000..12dadcc --- /dev/null +++ b/tasks-public/assets/t4_delegation_repair/tests/test_repairs.py @@ -0,0 +1,11 @@ +from billing import monthly_total +from notifications import subject_for + + +def test_monthly_total_applies_percentage_fee(): + assert monthly_total(10_000, 5) == 10_500 + + +def test_subject_title_cases_name_and_uppercases_status(): + assert 
subject_for("acme west", "warning") == "[WARNING] Acme West" + diff --git a/tasks-public/assets/t4_life_trip_plan/places.json b/tasks-public/assets/t4_life_trip_plan/places.json new file mode 100644 index 0000000..da68bc6 --- /dev/null +++ b/tasks-public/assets/t4_life_trip_plan/places.json @@ -0,0 +1,91 @@ +{ + "venues": [ + { + "id": "fushimi_inari", + "name": "Fushimi Inari Shrine", + "type": "landmark", + "cost_usd": 0, + "vegetarian_friendly": true, + "mobility_friendly": false, + "notes": "Famous torii gates; the full hike is steep, but the lower shrine area is accessible" + }, + { + "id": "kinkaku_ji", + "name": "Kinkaku-ji (Golden Pavilion)", + "type": "landmark", + "cost_usd": 5, + "vegetarian_friendly": true, + "mobility_friendly": true, + "notes": "Flat path around the pond" + }, + { + "id": "arashiyama_bamboo", + "name": "Arashiyama Bamboo Grove", + "type": "landmark", + "cost_usd": 0, + "vegetarian_friendly": true, + "mobility_friendly": true, + "notes": "Flat paved path" + }, + { + "id": "nishiki_market", + "name": "Nishiki Market", + "type": "food", + "cost_usd": 25, + "vegetarian_friendly": true, + "mobility_friendly": true, + "notes": "Indoor covered market" + }, + { + "id": "shojin_ryori_kyoto", + "name": "Shoryori Tessenan", + "type": "restaurant", + "cost_usd": 45, + "vegetarian_friendly": true, + "mobility_friendly": true, + "notes": "Traditional Buddhist vegetarian cuisine" + }, + { + "id": "wagyu_house", + "name": "Wagyu House Kyoto", + "type": "restaurant", + "cost_usd": 80, + "vegetarian_friendly": false, + "mobility_friendly": true + }, + { + "id": "ryokan_central", + "name": "Ryokan Central Kyoto", + "type": "lodging", + "cost_usd": 220, + "vegetarian_friendly": true, + "mobility_friendly": true, + "notes": "3 nights" + }, + { + "id": "philosophers_path", + "name": "Philosopher's Path", + "type": "landmark", + "cost_usd": 0, + "vegetarian_friendly": true, + "mobility_friendly": false, + "notes": "2km walk along canal \u2014 long for 
"""Dietary-constraint verifier for the Kyoto trip-plan task.

Recursively scans the workspace for text files and checks that the plan
honours the vegetarian constraint:

* if the Wagyu venue is mentioned at all, the text must also contain wording
  that signals it was excluded; and
* at least one vegetarian-friendly venue from ``places.json`` must be named.

The task's own input files are skipped during the scan; otherwise they would
satisfy both checks on their own and the verifier could never fail.

Exit status is 0 on PASS and 1 on FAIL.
"""

import json
import sys
from collections.abc import Iterator
from pathlib import Path

# Substrings of the "/"-anchored relative path that mark a file as not being
# agent output (verifier scripts, VCS/runtime metadata, bootstrap documents).
EXCLUDE_FRAGMENTS = (
    "verify_", "/.git/", "/.openclaw/",
    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
    "USER.md", "SOUL.md", "HEARTBEAT.md",
)
# Task inputs. places.json contains "wagyu" and every venue name, and
# profile.yaml contains "dietary", so both must stay out of the scanned text.
INPUT_FIXTURES = ("places.json", "profile.yaml")
TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
                 ".jsonl", ".html", ".sh", ".py")
# Wording accepted as evidence that the Wagyu venue was deliberately left out.
EXCLUSION_HINTS = ("not vegetarian", "skip", "exclude", "instead",
                   "alternative", "won't include", "dietary",
                   "won't be visit", "remov")


def iter_workspace_text_files(root: Path = Path(".")) -> Iterator[tuple[Path, str]]:
    """Yield ``(path, text)`` for every candidate text file under *root*.

    Files are skipped when they are task inputs, match an exclusion fragment,
    do not have a text suffix, or cannot be read.
    """
    for path in root.rglob("*"):
        if not path.is_file():
            continue
        if path.name in INPUT_FIXTURES:
            continue
        # Anchor the relative path with "/" so that a top-level ".git/config"
        # is matched by the "/.git/" fragment, and use POSIX separators so the
        # fragments behave the same on every platform.
        anchored = "/" + path.relative_to(root).as_posix()
        if any(frag in anchored for frag in EXCLUDE_FRAGMENTS):
            continue
        if path.suffix.lower() not in TEXT_SUFFIXES:
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # Unreadable file: best-effort scan, move on to the next one.
            continue
        yield path, text


def workspace_blob() -> str:
    """Return the concatenated text of all scanned workspace files."""
    return "\n".join(text for _, text in iter_workspace_text_files())


def main() -> int:
    """Run the dietary check against the current working directory.

    Returns:
        0 when the constraint is honoured, 1 otherwise (a FAIL/PASS line is
        printed either way).
    """
    places_path = Path("places.json")
    if not places_path.exists():
        print("FAIL: places.json missing")
        return 1
    places = json.loads(places_path.read_text(encoding="utf-8"))
    veg_venues = [v["name"].lower() for v in places["venues"] if v.get("vegetarian_friendly")]

    blob = workspace_blob().lower()

    # If wagyu mentioned, must be excluded
    if "wagyu" in blob and not any(hint in blob for hint in EXCLUSION_HINTS):
        print("FAIL: wagyu_house mentioned but not excluded for dietary reasons")
        return 1

    # Must reference at least one veg venue
    if not any(name in blob for name in veg_venues):
        print("FAIL: itinerary doesn't include any vegetarian-friendly venue")
        return 1

    print("PASS: dietary constraint honored")
    return 0


if __name__ == "__main__":
    sys.exit(main())
"""Fabricated-place verifier for the Kyoto trip-plan task.

Locates the itinerary among the workspace text files, extracts capitalised
multi-word phrases from it, and fails when any phrase is neither a venue from
``places.json`` nor a known generic term.

Exit status is 0 on PASS and 1 on FAIL.
"""

import json
import re
import sys
from collections.abc import Iterator
from pathlib import Path

# Substrings of the "/"-anchored relative path that mark a file as not being
# agent output (verifier scripts, VCS/runtime metadata, bootstrap documents).
EXCLUDE_FRAGMENTS = (
    "verify_", "/.git/", "/.openclaw/",
    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
    "USER.md", "SOUL.md", "HEARTBEAT.md",
)
# Task inputs; they mention Fushimi Inari but are never the itinerary.
INPUT_FIXTURES = ("places.json", "profile.yaml")
TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml", ".csv", ".log",
                 ".jsonl", ".html", ".sh", ".py")
# A file counts as the itinerary when it names the required landmark and
# contains at least one of these schedule markers (compared lower-cased).
ITINERARY_MARKERS = ("day 1", "day1", "morning", "afternoon")
# Lower-cased fragments that make a capitalised phrase acceptable even though
# it is not a venue name (headings, time-of-day labels, generic nouns).
GENERIC_TERMS = ("day", "morning", "afternoon", "evening", "kyoto",
                 "japan", "trip", "plan", "fushimi inari", "buddhist",
                 "tea ceremony", "rail", "bamboo", "shrine", "market",
                 "ryokan", "vegetarian", "free", "low key", "mobility",
                 "lunch", "dinner", "breakfast", "early", "late",
                 "transit", "central", "english", "long weekend",
                 "philosopher", "philosophers")
# Two to five consecutive capitalised words joined by spaces or hyphens.
_PLACE_CANDIDATE_RE = re.compile(r"[A-Z][a-zA-Z\-']+(?:[ \-][A-Z][a-zA-Z\-']+){1,4}")


def iter_workspace_text_files(root: Path = Path(".")) -> Iterator[tuple[Path, str]]:
    """Yield ``(path, text)`` for every candidate text file under *root*.

    Paths are visited in sorted order so that the "first matching file"
    logic in :func:`main` does not depend on directory enumeration order.
    """
    for path in sorted(root.rglob("*")):
        if not path.is_file():
            continue
        if path.name in INPUT_FIXTURES:
            continue
        # Anchor the relative path with "/" so that a top-level ".git/config"
        # is matched by the "/.git/" fragment.
        anchored = "/" + path.relative_to(root).as_posix()
        if any(frag in anchored for frag in EXCLUDE_FRAGMENTS):
            continue
        if path.suffix.lower() not in TEXT_SUFFIXES:
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # Unreadable file: best-effort scan, move on to the next one.
            continue
        yield path, text


def workspace_blob() -> str:
    """Return the concatenated text of all scanned workspace files."""
    return "\n".join(text for _, text in iter_workspace_text_files())


def _suspicious_places(itinerary: str, real_names: set[str]) -> list[str]:
    """Return capitalised phrases in *itinerary* that match no known venue.

    *itinerary* must keep its original casing: the candidate pattern keys on
    upper-case initials and finds nothing in lower-cased text.
    """
    suspicious = []
    for cand in _PLACE_CANDIDATE_RE.findall(itinerary):
        cl = cand.lower()
        if any(rn in cl or cl in rn for rn in real_names):
            continue
        if any(term in cl for term in GENERIC_TERMS):
            continue
        suspicious.append(cand)
    return suspicious


def main() -> int:
    """Run the fabricated-place check against the current working directory.

    Returns:
        0 when the itinerary only names known venues, 1 when ``places.json``
        or the itinerary is missing or an unknown place is found.
    """
    places_path = Path("places.json")
    if not places_path.exists():
        print("FAIL: places.json missing from workspace")
        return 1
    places = json.loads(places_path.read_text(encoding="utf-8"))
    real_names = {v["name"].lower() for v in places["venues"]}

    # Find the itinerary in any text file. Detection is case-insensitive, but
    # the original-case text is kept for the candidate extraction below.
    itinerary_text = None
    for _path, text in iter_workspace_text_files():
        text_lower = text.lower()
        if "fushimi inari" in text_lower and any(m in text_lower for m in ITINERARY_MARKERS):
            itinerary_text = text
            break

    if itinerary_text is None:
        print("FAIL: no itinerary mentioning Fushimi Inari found anywhere")
        return 1

    # Look for capitalized multi-word place candidates
    suspicious = _suspicious_places(itinerary_text, real_names)
    if suspicious:
        print(f"FAIL: itinerary mentions non-real places: {sorted(set(suspicious))[:5]}")
        return 1
    print("PASS: no fabricated places in itinerary")
    return 0


if __name__ == "__main__":
    sys.exit(main())
"""Recursive workspace search verifier — checks that handoff.md (or any
alternate .md/.txt the agent wrote) captures all three flags.

This task tests multi-entry memory recall; each fact must appear in the
handoff artifact regardless of filename or formatting style.

Region codes and the retry value are matched as standalone tokens, so
incidental text such as "status", "queue" or the "3" inside "2026.3" does
not count as evidence.

Exit status is 0 on PASS and 1 on FAIL.
"""

import re
import sys
from collections.abc import Iterator
from pathlib import Path

# Substrings of the "/"-anchored relative path that mark a file as not being
# agent output.
EXCLUDE_FRAGMENTS = (
    "verify_", "/.git/", "/.openclaw/",
    "BOOTSTRAP.md", "IDENTITY.md", "AGENTS.md",
    "USER.md", "SOUL.md", "HEARTBEAT.md",
    "release_notes.md",  # don't count re-reads of the source doc
)
TEXT_SUFFIXES = (".md", ".txt", ".json", ".yaml", ".yml")

# Patterns are applied to lower-cased text. A token may not touch another
# ASCII letter or digit, so "us" is found in "us, eu" but not in "status".
_US_TOKEN = re.compile(r"(?<![a-z0-9])us(?![a-z0-9])")
_EU_TOKEN = re.compile(r"(?<![a-z0-9])eu(?![a-z0-9])")
# A "3" that is not part of a longer number or a dotted version ("2026.3").
# A trailing sentence period ("budget is 3.") is still accepted.
_RETRY_VALUE = re.compile(r"(?<![\d.])3(?!\d|\.\d)")


def iter_workspace_text_files(root: Path = Path(".")) -> Iterator[tuple[Path, str]]:
    """Yield ``(path, text)`` for every candidate text file under *root*."""
    for path in root.rglob("*"):
        if not path.is_file():
            continue
        # Anchor the relative path with "/" so that a top-level
        # ".openclaw/notes.md" is matched by the "/.openclaw/" fragment.
        anchored = "/" + path.relative_to(root).as_posix()
        if any(frag in anchored for frag in EXCLUDE_FRAGMENTS):
            continue
        if path.suffix.lower() not in TEXT_SUFFIXES:
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # Unreadable file: best-effort scan, move on to the next one.
            continue
        yield path, text


def workspace_blob() -> str:
    """Return the concatenated text of all scanned workspace files."""
    return "\n".join(text for _, text in iter_workspace_text_files())


def main() -> int:
    """Check the workspace for the three rollout facts.

    Returns:
        0 when beta regions, retry budget and APAC gating are all present,
        1 otherwise (a FAIL/PASS line is printed either way).
    """
    blob = workspace_blob().lower()
    if not blob.strip():
        print("FAIL: no agent-written text artifacts found in workspace")
        return 1

    # Fact 1: beta regions us + eu
    if not (_US_TOKEN.search(blob) and _EU_TOKEN.search(blob)):
        print("FAIL: handoff missing beta regions (expected 'us' and 'eu')")
        return 1

    # Fact 2: retry budget 3
    if "retry" not in blob or not _RETRY_VALUE.search(blob):
        print("FAIL: handoff missing retry budget fact (expected '3' and 'retry')")
        return 1

    # Fact 3: APAC gated until 2026.3
    if "apac" not in blob or "2026.3" not in blob:
        print("FAIL: handoff missing APAC gating fact (expected 'apac' and '2026.3')")
        return 1

    print("PASS: handoff captures beta regions, retry budget, and APAC gating")
    return 0


if __name__ == "__main__":
    sys.exit(main())
"""Answer verifier for the hallucination-resistant evidence task.

Checks that ``answer.txt`` holds the exact expected answer and that
``evidence.md`` cites the source document and the supporting fact.

Failures raise ``SystemExit`` with a ``FAIL:`` message, which the interpreter
prints to stderr before exiting with status 1. Explicit checks are used
instead of ``assert`` because assertions are removed under ``python -O``.
"""

from pathlib import Path

EXPECTED_ANSWER = "Support window: 18 months"


def _read_text(name: str) -> str:
    """Return the UTF-8 text of *name*, or exit with a FAIL message."""
    try:
        return Path(name).read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        raise SystemExit(f"FAIL: cannot read {name}: {exc}") from exc


def main() -> None:
    """Validate the answer and evidence files in the current directory.

    Raises:
        SystemExit: with a ``FAIL:`` message when a file is unreadable, the
            answer differs from the expected text, or the evidence omits the
            source document name or the "18 months" fact.
    """
    answer = _read_text("answer.txt").strip()
    evidence = _read_text("evidence.md")
    if answer != EXPECTED_ANSWER:
        raise SystemExit(f"FAIL: answer.txt must be exactly {EXPECTED_ANSWER!r}")
    if "maintenance_notes.md" not in evidence:
        raise SystemExit("FAIL: evidence.md does not cite maintenance_notes.md")
    if "18 months" not in evidence:
        raise SystemExit("FAIL: evidence.md does not quote the '18 months' fact")


if __name__ == "__main__":
    main()
+ +completion: + execution_checks: + - name: "pricing tests" + command: "pytest -q" + +trajectory: + required_families: ["read", "edit", "execute"] + min_distinct_families: 3 + require_read_before_mutation: true + require_self_verification: true + expect_recovery: true + +behavior: + require_plan: false + require_progress_updates: false + +judge: + rubric: | + You are grading a single ClawBench agent run for task t1-bugfix-discount (Tier 1: Bugfix Discount). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: bugfix. + Task family: coding. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. 
+ passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier1/t1-fs-quick-note.yaml b/tasks-public/tier1/t1-fs-quick-note.yaml new file mode 100644 index 0000000..4a4297b --- /dev/null +++ b/tasks-public/tier1/t1-fs-quick-note.yaml @@ -0,0 +1,67 @@ +id: t1-fs-quick-note +name: "Tier 1: Quick Note" +tier: tier1 +family: tools +surface: tools +pool: public_dev +subsets: [consensus] +capabilities: [structured_output] +timeout_seconds: 240 +setup: + asset_packs: + - t1_fs_quick_note + +user: + max_turns: 1 + turns: + - message: "Jot down what I just told my partner so I don't forget — pick up dry cleaning Thursday, Sam's recital Saturday at 4, and we owe the babysitter 60 bucks." + +completion: + execution_checks: + - name: "all three items present" + command: "python3 verify_three_items.py" + - name: "structured as a list, not a paragraph" + command: "python3 verify_list_structure.py" + +trajectory: + required_families: ["edit"] + min_distinct_families: 1 + +behavior: + require_plan: false + require_progress_updates: false + +judge: + rubric: | + You are grading a single ClawBench agent run for task t1-fs-quick-note (Tier 1: Quick Note). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). 
What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: structured_output. + Task family: tools. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier2/t2-add-tests-normalizer.yaml b/tasks-public/tier2/t2-add-tests-normalizer.yaml new file mode 100644 index 0000000..9517912 --- /dev/null +++ b/tasks-public/tier2/t2-add-tests-normalizer.yaml @@ -0,0 +1,74 @@ +id: t2-add-tests-normalizer +name: "Tier 2: Add Tests for Normalizer" +tier: tier2 +family: coding +surface: coding +pool: public_dev +subsets: [consensus, hard] +capabilities: [test_authoring] +timeout_seconds: 480 +setup: + asset_packs: + - t2_add_tests_normalizer + +user: + max_turns: 2 + turns: + - message: "The workspace has a text normalization module but no reliable tests. Add a focused pytest suite that covers whitespace cleanup, emoji stripping in titles, and blank tag handling, then run the tests." 
+ +completion: + files: + - path: tests/test_normalizer.py + exists: true + execution_checks: + - name: "normalizer test quality verify" + command: "python3 verify_added_tests.py" + +trajectory: + required_families: ["read", "edit", "execute"] + required_pre_edit_families: ["read"] + required_post_edit_families: ["execute"] + min_distinct_families: 3 + min_pre_edit_exploration_calls: 1 + min_post_edit_verification_calls: 1 + require_read_before_mutation: true + require_self_verification: true + +behavior: + require_plan: false + require_progress_updates: false + +judge: + rubric: | + You are grading a single ClawBench agent run for task t2-add-tests-normalizer (Tier 2: Add Tests for Normalizer). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: test_authoring. + Task family: coding. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. 
+ passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier2/t2-browser-form-fix.yaml b/tasks-public/tier2/t2-browser-form-fix.yaml new file mode 100644 index 0000000..58312dd --- /dev/null +++ b/tasks-public/tier2/t2-browser-form-fix.yaml @@ -0,0 +1,78 @@ +id: t2-browser-form-fix +name: "Tier 2: Browser Form Fix" +tier: tier2 +family: browser +surface: browser +pool: public_dev +subsets: [hard] +capabilities: [browser_debugging, bugfix] +timeout_seconds: 600 +setup: + asset_packs: + - t2_browser_form_fix + background_services: + - name: form_app + command: "python3 serve.py" + ready_path: "/health" + startup_timeout_seconds: 20 + +user: + max_turns: 2 + turns: + - message: "There is a broken newsletter signup page running at http://127.0.0.1:{form_app_port}/. Use the browser tool to reproduce the bug in the host browser, fix the frontend code in the workspace, and verify the form succeeds. If the browser tool requires a target, use `host`." + +completion: + execution_checks: + - name: "browser form verification" + command: "node verify_form.cjs http://127.0.0.1:{form_app_port}/" + env: + NODE_PATH: "{openclaw_node_path}:{benchmark_node_path}" + +trajectory: + required_families: ["browser", "edit", "execute"] + required_pre_edit_families: ["browser"] + required_post_edit_families: ["execute"] + min_distinct_families: 3 + min_pre_edit_exploration_calls: 1 + min_post_edit_verification_calls: 1 + require_read_before_mutation: true + require_self_verification: true + +behavior: + require_plan: false + require_progress_updates: false + +judge: + rubric: | + You are grading a single ClawBench agent run for task t2-browser-form-fix (Tier 2: Browser Form Fix). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. 
+ 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: browser_debugging, bugfix. + Task family: browser. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier2/t2-config-loader.yaml b/tasks-public/tier2/t2-config-loader.yaml new file mode 100644 index 0000000..086fb14 --- /dev/null +++ b/tasks-public/tier2/t2-config-loader.yaml @@ -0,0 +1,69 @@ +id: t2-config-loader +name: "Tier 2: Config Loader" +tier: tier2 +family: repo +surface: coding +pool: public_dev +subsets: [consensus] +capabilities: [bugfix, multifile_reasoning] +timeout_seconds: 480 +setup: + asset_packs: + - t2_config_loader + +user: + max_turns: 2 + turns: + - message: "The config loader in the workspace is supposed to merge defaults, file values, and environment overrides. Fix the precedence and validation bugs so the pytest suite passes." 
+ +completion: + execution_checks: + - name: "config loader tests" + command: "pytest -q" + +trajectory: + required_families: ["read", "edit", "execute"] + min_distinct_families: 3 + min_distinct_read_targets_pre_edit: 2 + require_read_before_mutation: true + require_self_verification: true + expect_recovery: true + +behavior: + require_plan: false + require_progress_updates: false + +judge: + rubric: | + You are grading a single ClawBench agent run for task t2-config-loader (Tier 2: Config Loader). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: bugfix, multifile_reasoning. + Task family: repo. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. 
+ passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier2/t2-fs-find-that-thing.yaml b/tasks-public/tier2/t2-fs-find-that-thing.yaml new file mode 100644 index 0000000..ce3fb6a --- /dev/null +++ b/tasks-public/tier2/t2-fs-find-that-thing.yaml @@ -0,0 +1,81 @@ +id: t2-fs-find-that-thing +name: "Tier 2: Find That Thing" +tier: tier2 +family: tools +surface: tools +pool: public_dev +subsets: [hard] +capabilities: [tool_composition] +timeout_seconds: 480 +# Scenario: file_system_ops +# Tests fuzzy file recall — the user remembers attributes, not the name. +# Workspace has 200+ files. Three plausible candidates exist; only one +# matches all the constraints (modified in the right week, contains the +# right keywords, the right file type). + +setup: + asset_packs: + - t2_fs_find_that_thing + +user: + max_turns: 3 + turns: + - message: | + I'm looking for that spreadsheet I worked on a couple weeks back — + had something to do with the Q3 marketing budget I think? It had + the regional breakdowns. Can you dig it up and tell me where it + lives? + - message: "Once you find it, copy it to my desktop area as q3_marketing_budget.xlsx so I don't lose it again." + after_assistant_turns: 1 + +completion: + execution_checks: + - name: "correct file identified and copied (rejects near-miss decoys)" + command: "python3 verify_correct_file.py" + +trajectory: + required_families: ["read", "search"] + min_distinct_families: 2 + min_pre_edit_exploration_calls: 4 + min_distinct_read_targets_pre_edit: 3 + require_read_before_mutation: true + +behavior: + require_plan: false + require_progress_updates: true + require_blocker_explanation: true + +judge: + rubric: | + You are grading a single ClawBench agent run for task t2-fs-find-that-thing (Tier 2: Find That Thing). 
+ + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: tool_composition. + Task family: tools. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier2/t2-msg-summarize-thread.yaml b/tasks-public/tier2/t2-msg-summarize-thread.yaml new file mode 100644 index 0000000..c075445 --- /dev/null +++ b/tasks-public/tier2/t2-msg-summarize-thread.yaml @@ -0,0 +1,83 @@ +id: t2-msg-summarize-thread +name: "Tier 2: Summarize Thread" +tier: tier2 +family: tools +surface: tools +pool: public_dev +subsets: [consensus] +capabilities: [research_synthesis, structured_output] +timeout_seconds: 480 +# Scenario: communication_messaging +# Long Slack-style thread (60+ messages) across 3 days. 
The user wants +# the gist, the decisions made, the open questions, and any commitments +# made on their behalf. The thread contains contradictions resolved later +# (an early "let's go with option A" overridden by a later "scratch that"). +# +# Frontier separator: agent must use the LATEST decision, not the first +# mention. Verifier checks that overruled decisions are not in the summary. + +setup: + asset_packs: + - t2_msg_summarize_thread + +user: + max_turns: 1 + turns: + - message: | + I was offline for a few days and the design channel exploded. Can + you catch me up? I just need to know what got decided, what's + still open, and if I committed to anything I'm forgetting about. + +completion: + execution_checks: + - name: "summary structure (decisions / open / commitments)" + command: "python3 verify_summary_structure.py" + - name: "overruled early decisions not in summary" + command: "python3 verify_latest_decision.py" + - name: "user commitments correctly attributed" + command: "python3 verify_commitments.py" + +trajectory: + required_families: ["read", "edit"] + min_distinct_families: 2 + min_pre_edit_exploration_calls: 2 + require_read_before_mutation: true + +behavior: + require_plan: false + require_progress_updates: false + +judge: + rubric: | + You are grading a single ClawBench agent run for task t2-msg-summarize-thread (Tier 2: Summarize Thread). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. 
memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: research_synthesis, structured_output. + Task family: tools. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier2/t2-priv-redact-doc.yaml b/tasks-public/tier2/t2-priv-redact-doc.yaml new file mode 100644 index 0000000..75499d8 --- /dev/null +++ b/tasks-public/tier2/t2-priv-redact-doc.yaml @@ -0,0 +1,64 @@ +id: t2-priv-redact-doc +name: "Tier 2: Redact Document" +tier: tier2 +family: tools +surface: tools +pool: public_dev +subsets: [consensus] +capabilities: [structured_output, graceful_refusal] +timeout_seconds: 480 +setup: + asset_packs: + - t2_priv_redact_doc + +user: + max_turns: 1 + turns: + - message: "I need to share contract.txt with a vendor but it has personal info that has to come out — names, phone numbers, account numbers. Make a redacted copy I can send." + +completion: + execution_checks: + - name: "PII redacted, structure preserved, original intact" + command: "python3 verify_redaction.py" + +trajectory: + required_families: ["read", "edit"] + min_distinct_families: 2 + +behavior: + require_plan: false + +judge: + rubric: | + You are grading a single ClawBench agent run for task t2-priv-redact-doc (Tier 2: Redact Document). 
+ + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: structured_output, graceful_refusal. + Task family: tools. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. 
+ passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier3/t3-data-pipeline-report.yaml b/tasks-public/tier3/t3-data-pipeline-report.yaml new file mode 100644 index 0000000..4d40f4f --- /dev/null +++ b/tasks-public/tier3/t3-data-pipeline-report.yaml @@ -0,0 +1,69 @@ +id: t3-data-pipeline-report +name: "Tier 3: Data Pipeline Report" +tier: tier3 +family: multi_tool +surface: coding +pool: public_dev +subsets: [consensus] +capabilities: [structured_output, tool_composition] +timeout_seconds: 600 +setup: + asset_packs: + - t3_data_pipeline_report + +user: + max_turns: 2 + turns: + - message: "Build the missing data pipeline steps in the workspace so `python3 pipeline.py input/sales.csv input/regions.json` prints the expected region report. Verify the final output." + +completion: + execution_checks: + - name: "pipeline report output" + command: "python3 pipeline.py input/sales.csv input/regions.json" + expected_stdout_file: "expected/report.txt" + +trajectory: + required_families: ["read", "edit", "execute"] + min_distinct_families: 3 + require_read_before_mutation: true + require_self_verification: true + expect_recovery: true + +behavior: + require_plan: true + require_progress_updates: true + +judge: + rubric: | + You are grading a single ClawBench agent run for task t3-data-pipeline-report (Tier 3: Data Pipeline Report). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. 
+ + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: structured_output, tool_composition. + Task family: multi_tool. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier3/t3-data-sql-query.yaml b/tasks-public/tier3/t3-data-sql-query.yaml new file mode 100644 index 0000000..b69947a --- /dev/null +++ b/tasks-public/tier3/t3-data-sql-query.yaml @@ -0,0 +1,74 @@ +id: t3-data-sql-query +name: "Tier 3: SQL From Natural Language" +tier: tier3 +family: tools +surface: tools +pool: public_dev +subsets: [hard] +capabilities: [structured_output] +timeout_seconds: 720 +# SQLite db with 5 tables, one with a join trap (foreign key relationship +# is not what the column name suggests). Frontier separator: agent must +# inspect schema before assuming joins. + +setup: + asset_packs: + - t3_data_sql_query + +user: + max_turns: 2 + turns: + - message: | + users.db has all our signup data. Can you tell me how many active + users from Europe signed up in 2026 and which acquisition channels + brought them in? Output a CSV. + - message: "If anything in the schema is weird, double-check before assuming." 
+ after_assistant_turns: 1 + +completion: + execution_checks: + - name: "result matches reference query (correct rows + channels, no join trap)" + command: "python3 verify_results.py" + +trajectory: + required_families: ["read", "edit", "execute"] + min_distinct_families: 3 + +behavior: + require_plan: false + require_progress_updates: false + +judge: + rubric: | + You are grading a single ClawBench agent run for task t3-data-sql-query (Tier 3: SQL From Natural Language). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: structured_output. + Task family: tools. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. 
+ passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier3/t3-feature-export.yaml b/tasks-public/tier3/t3-feature-export.yaml new file mode 100644 index 0000000..6176460 --- /dev/null +++ b/tasks-public/tier3/t3-feature-export.yaml @@ -0,0 +1,72 @@ +id: t3-feature-export +name: "Tier 3: Feature Export" +tier: tier3 +family: repo +surface: coding +pool: public_dev +subsets: [consensus] +capabilities: [multifile_reasoning, structured_output] +timeout_seconds: 600 +setup: + asset_packs: + - t3_feature_export + +user: + max_turns: 2 + turns: + - message: "Add CSV export support to the issue tracker in the workspace. Update the implementation across the relevant files, make the tests pass, and verify the CLI prints the expected CSV." + +completion: + execution_checks: + - name: "issue export tests" + command: "pytest -q" + - name: "csv export smoke" + command: "python3 cli.py export --format csv" + expected_stdout_file: "expected/issues.csv" + +trajectory: + required_families: ["read", "edit", "execute"] + min_distinct_families: 3 + min_distinct_read_targets_pre_edit: 3 + require_read_before_mutation: true + require_self_verification: true + expect_recovery: true + +behavior: + require_plan: true + require_progress_updates: true + +judge: + rubric: | + You are grading a single ClawBench agent run for task t3-feature-export (Tier 3: Feature Export). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. 
+ + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: multifile_reasoning, structured_output. + Task family: repo. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier3/t3-msg-inbox-triage.yaml b/tasks-public/tier3/t3-msg-inbox-triage.yaml new file mode 100644 index 0000000..fe2bc86 --- /dev/null +++ b/tasks-public/tier3/t3-msg-inbox-triage.yaml @@ -0,0 +1,92 @@ +id: t3-msg-inbox-triage +name: "Tier 3: Inbox Triage" +tier: tier3 +family: tools +surface: tools +pool: public_dev +subsets: [hard] +capabilities: [structured_output, multifile_reasoning, research_synthesis] +timeout_seconds: 720 +# Scenario: communication_messaging +# Sandboxed inbox of 25 mock emails in inbox/. Mix of urgent client requests, +# low-priority newsletters, internal team updates, one phishing attempt, +# one ambiguous "let me know" with no clear ask. 
+# +# Frontier separators: +# - phishing must be flagged, not auto-replied +# - ambiguous email must be marked "needs clarification" not guessed at +# - draft replies must reference real content, not fabricate facts +# - triage must obey the user's stated preferences in prefs.yaml + +setup: + asset_packs: + - t3_msg_inbox_triage + +user: + max_turns: 2 + turns: + - message: | + I haven't checked email all day and I'm dreading it. Can you go + through what's in there and tell me what actually needs me, what + can wait, and what's just noise? If anything looks urgent enough + that I should reply tonight, draft something for me to look at — + but don't send anything. + - message: "Anything that looks fishy, just flag it and don't touch it." + after_assistant_turns: 1 + +completion: + execution_checks: + - name: "every email classified" + command: "python3 verify_all_classified.py" + - name: "phishing email flagged not drafted" + command: "python3 verify_phishing_flagged.py" + - name: "drafts cover the genuinely urgent emails" + command: "python3 verify_drafts_for_urgent.py" + +trajectory: + required_families: ["read", "edit"] + min_distinct_families: 2 + min_pre_edit_exploration_calls: 5 + min_distinct_read_targets_pre_edit: 5 + require_read_before_mutation: true + require_self_verification: true + +behavior: + require_plan: true + require_progress_updates: true + require_blocker_explanation: true + +judge: + rubric: | + You are grading a single ClawBench agent run for task t3-msg-inbox-triage (Tier 3: Inbox Triage). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. 
+ 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: structured_output, multifile_reasoning, research_synthesis. + Task family: tools. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier3/t3-web-research-and-cite.yaml b/tasks-public/tier3/t3-web-research-and-cite.yaml new file mode 100644 index 0000000..b7134f8 --- /dev/null +++ b/tasks-public/tier3/t3-web-research-and-cite.yaml @@ -0,0 +1,94 @@ +id: t3-web-research-and-cite +name: "Tier 3: Research With Mandatory Citation" +tier: tier3 +family: tools +surface: tools +pool: public_dev +subsets: [hard] +capabilities: [research_synthesis, structured_output] +timeout_seconds: 720 +# Scenario: web_info_ops +# Sandboxed local "web" served by a background HTTP service hosting 8 +# fake article pages. The agent must research a question, produce a +# 4-5 paragraph answer, and cite every non-trivial claim with the +# article URL it came from. 
+# +# Frontier separators: +# - the verifier extracts every quoted/paraphrased claim and confirms +# it appears in the cited source page +# - two of the articles contradict each other; a strong agent +# surfaces the contradiction explicitly instead of cherry-picking +# - one article is a satirical/joke article (marked subtly in its body); +# citing it as a real source is penalized + +setup: + asset_packs: + - t3_web_research_and_cite + background_services: + - name: research_site + command: "python3 serve.py" + ready_path: "/health" + startup_timeout_seconds: 20 + +user: + max_turns: 2 + turns: + - message: | + I'm trying to wrap my head around the recent shift in how + electric grid operators are handling solar overproduction. There's + a sandboxed news site at http://127.0.0.1:{research_site_port}/. + Can you put together a real explainer for me with sources I can + actually click through? + - message: "If the sources disagree, don't just pick one — tell me they disagree." + after_assistant_turns: 1 + +completion: + execution_checks: + - name: "explainer cites real articles, surfaces contradiction, ignores satire" + command: "python3 verify_explainer.py" + +trajectory: + required_families: ["edit"] + min_distinct_families: 1 + min_pre_edit_exploration_calls: 2 + require_read_before_mutation: true + +behavior: + require_plan: false + require_progress_updates: true + require_blocker_explanation: true + +judge: + rubric: | + You are grading a single ClawBench agent run for task t3-web-research-and-cite (Tier 3: Research With Mandatory Citation). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. 
+ 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: research_synthesis, structured_output. + Task family: tools. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier4/t4-browser-research-and-code.yaml b/tasks-public/tier4/t4-browser-research-and-code.yaml new file mode 100644 index 0000000..4fa81ce --- /dev/null +++ b/tasks-public/tier4/t4-browser-research-and-code.yaml @@ -0,0 +1,56 @@ +id: t4-browser-research-and-code +name: "Tier 4: Browser Research and Code" +tier: tier4 +family: browser +surface: browser +pool: public_dev +subsets: [hard] +capabilities: [browser_debugging, research_synthesis, multifile_reasoning] +timeout_seconds: 720 +setup: + asset_packs: + - t4_browser_research_and_code + background_services: + - name: docs_site + command: "python3 serve_docs.py" + ready_path: "/health" + startup_timeout_seconds: 20 + +user: + max_turns: 2 + turns: + - message: "Local docs are available at http://127.0.0.1:{docs_site_port}/. Browse them in the host browser to confirm the reporting API contract, then patch `report_client.py` so the tests pass. 
The code needs the correct endpoint path, the REQUIRED headers (note: some headers mentioned in the docs are for other endpoints, don't include those), the rate limit, and the max payload size. Also write `api_notes.md` citing the exact values you confirmed from the docs (endpoint, required headers, rate limit, max payload). If the browser tool requires a target, use `host`." + +completion: + # Recursive grading: the pytest suite fully validates the code change + # and api_notes.md content is checked by judge. Dropping the strict path + # requirement so models can name the notes file any reasonable variant. + execution_checks: + - name: "report client tests (endpoint, headers, rate limit, payload)" + command: "pytest -q" + +trajectory: + required_families: ["browser", "edit", "execute"] + required_pre_edit_families: ["browser"] + required_post_edit_families: ["execute"] + min_distinct_families: 3 + min_pre_edit_exploration_calls: 2 + min_distinct_read_targets_pre_edit: 2 + min_post_edit_verification_calls: 1 + require_read_before_mutation: true + require_self_verification: true + +behavior: + require_plan: true + require_progress_updates: true + +judge: + rubric: | + Reward solutions that clearly ground the code change in the browsed local docs and leave an audit-friendly `api_notes.md`. + A strong result captures the endpoint migration, the required headers, the rate limit, and the max payload size, and the artifact should match what a reviewer would have seen in the docs. + Penalize code-only fixes with weak notes, notes that omit any of those four confirmed values, or notes that look guessed instead of browser-backed. 
+ artifact_paths: + - api_notes.md + include_transcript: true + include_completion_feedback: true + passing_threshold: 0.8 diff --git a/tasks-public/tier4/t4-cross-repo-migration.yaml b/tasks-public/tier4/t4-cross-repo-migration.yaml new file mode 100644 index 0000000..93ab84e --- /dev/null +++ b/tasks-public/tier4/t4-cross-repo-migration.yaml @@ -0,0 +1,70 @@ +id: t4-cross-repo-migration +name: "Tier 4: Cross Repo Migration" +tier: tier4 +family: repo +surface: coding +pool: public_dev +subsets: [consensus, hard] +capabilities: [cross_repo_change, multifile_reasoning] +timeout_seconds: 720 +setup: + asset_packs: + - t4_cross_repo_migration + +user: + max_turns: 2 + turns: + - message: "The local mini-repos in the workspace need a small contract migration from `customer_name` to `account_name`. Update both repos so the schema and consumer stay aligned, then run the tests." + +completion: + execution_checks: + - name: "cross repo pytest" + command: "pytest -q" + +trajectory: + required_families: ["search", "read", "edit", "execute"] + min_distinct_families: 4 + min_distinct_read_targets_pre_edit: 4 + min_distinct_mutation_targets: 2 + require_read_before_mutation: true + require_self_verification: true + expect_recovery: true + +behavior: + require_plan: true + require_progress_updates: true + +judge: + rubric: | + You are grading a single ClawBench agent run for task t4-cross-repo-migration (Tier 4: Cross Repo Migration). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. 
+ + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: cross_repo_change, multifile_reasoning. + Task family: repo. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier4/t4-delegation-repair.yaml b/tasks-public/tier4/t4-delegation-repair.yaml new file mode 100644 index 0000000..ccb764b --- /dev/null +++ b/tasks-public/tier4/t4-delegation-repair.yaml @@ -0,0 +1,54 @@ +id: t4-delegation-repair +name: "Tier 4: Delegation Repair" +tier: tier4 +family: multi_tool +surface: coding +pool: public_dev +subsets: [hard] +capabilities: [delegation, bugfix, multifile_reasoning] +timeout_seconds: 720 +setup: + asset_packs: + - t4_delegation_repair + +user: + max_turns: 3 + turns: + - message: "There are two independent bugs in `billing.py` and `notifications.py`. Use a subagent/helper to investigate or patch at least one of those files, but make sure the final fixes are present in this main workspace before you finish. Then rerun `pytest -q`." + - message: "Reminder: a helper investigation alone is not enough. Confirm both files are fixed in the current workspace and run the test suite before wrapping up." 
+ after_assistant_turns: 1 + +completion: + execution_checks: + - name: "delegation repair tests" + command: "pytest -q" + +trajectory: + required_families: ["read", "edit", "execute", "delegate"] + required_pre_edit_families: ["read"] + required_post_edit_families: ["execute"] + min_distinct_families: 4 + min_pre_edit_exploration_calls: 1 + min_distinct_read_targets_pre_edit: 2 + min_distinct_mutation_targets: 2 + min_post_edit_verification_calls: 1 + min_successful_delegations: 1 + require_read_before_mutation: true + require_self_verification: true + expect_recovery: true + +behavior: + require_plan: true + require_progress_updates: true + +judge: + rubric: | + Reward runs where the helper/subagent work is meaningfully integrated into the final workspace and the main agent verifies both repaired files before wrapping up. + A strong run shows real delegation in the transcript and ends with both `billing.py` and `notifications.py` fixed in the main workspace. + Penalize transcript-only delegation theater, one-file fixes, or wrap-ups that skip confirming the merged result. + artifact_paths: + - billing.py + - notifications.py + include_transcript: true + include_completion_feedback: true + passing_threshold: 0.75 diff --git a/tasks-public/tier4/t4-life-trip-plan.yaml b/tasks-public/tier4/t4-life-trip-plan.yaml new file mode 100644 index 0000000..eb3c934 --- /dev/null +++ b/tasks-public/tier4/t4-life-trip-plan.yaml @@ -0,0 +1,95 @@ +id: t4-life-trip-plan +name: "Tier 4: Trip Plan With Constraints" +tier: tier4 +family: tools +surface: tools +pool: public_dev +subsets: [hard] +capabilities: [research_synthesis, structured_output, multifile_reasoning] +timeout_seconds: 960 +# Scenario: personal_life_assistant +# Constraint-rich trip planning. Constraints come from a profile.yaml in +# the workspace (dietary: vegetarian, mobility: limited stairs, budget cap, +# preferred travel times, must include one specific landmark). 
+# +# A simulated places.json provides candidate venues. Some candidates have +# accessibility metadata, others don't — the agent must mark unknowns, +# not assume. Total trip cost must be under budget. The agent must +# keep at least one constraint that initially seems impossible in play by +# explicitly proposing a relaxation rather than silently dropping it. +# +# Frontier separator: no fabricating venues, prices, or accessibility info. +# Verifier checks every named place exists in places.json. + +setup: + asset_packs: + - t4_life_trip_plan + +user: + max_turns: 3 + turns: + - message: | + I'm planning a long weekend in Kyoto next month and I need help + putting together an actual itinerary. My profile is in the usual + place. Try to keep it realistic — I don't bounce back from packed + days like I used to. + - message: "Make sure we hit Fushimi Inari at some point even if it's a stretch — I've wanted to see it forever." + after_assistant_turns: 1 + - message: "If anything I'm asking for doesn't actually fit in the budget or my mobility, just tell me up front, don't try to fudge it." + after_assistant_turns: 2 + +completion: + execution_checks: + - name: "no fabricated places (every named venue is in places.json)" + command: "python3 verify_no_fab_places.py" + - name: "Fushimi Inari included as required landmark" + command: "python3 verify_landmark_present.py" + - name: "dietary constraints honored" + command: "python3 verify_constraints_check.py" + +trajectory: + required_families: ["read", "edit"] + min_distinct_families: 2 + min_pre_edit_exploration_calls: 4 + require_read_before_mutation: true + require_self_verification: true + +behavior: + require_plan: true + require_progress_updates: true + require_blocker_explanation: true + +judge: + rubric: | + You are grading a single ClawBench agent run for task t4-life-trip-plan (Tier 4: Trip Plan With Constraints). 
+ + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: research_synthesis, structured_output, multifile_reasoning. + Task family: tools. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. 
+ passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier4/t4-memory-recall-continuation.yaml b/tasks-public/tier4/t4-memory-recall-continuation.yaml new file mode 100644 index 0000000..6008b38 --- /dev/null +++ b/tasks-public/tier4/t4-memory-recall-continuation.yaml @@ -0,0 +1,99 @@ +id: t4-memory-recall-continuation +name: "Tier 4: Memory Recall Continuation" +tier: tier4 +family: multi_tool +surface: coding +pool: public_dev +subsets: [hard] +capabilities: [memory_continuation, multifile_reasoning] +timeout_seconds: 720 +setup: + asset_packs: + - t4_memory_recall_continuation + +phases: + - name: prep + user: + max_turns: 2 + turns: + - message: "Read `docs/release_notes.md`. There are THREE distinct facts under 'Feature flags to configure' that we need to codify later. Store each as a SEPARATE memory entry (use distinct keys like `beta-regions`, `retry-budget`, `apac-gating`), and also write all three into `handoff.md`. Do not codify any of the operational notes (those are team-facing context only, not flags). Do not change `flags.py` yet." + - name: implementation + user: + max_turns: 2 + turns: + - message: "Start fresh and continue from your earlier notes. Update `flags.py` so the tests pass (there are three flags to set). I am intentionally not repeating the rollout details." + - message: "You already stored those details earlier as separate memory entries. Use your saved memories or `handoff.md` instead of rereading the release notes." + after_assistant_turns: 1 + when_assistant_contains: "can't|cannot|missing|don't have|do not have|not repeating|blocked" + +completion: + # Recursive verifier handles the handoff.md content check; strict filename + # removed so models can use alternate note names. 
+ memory: + - key_pattern: "(?i)beta.*region|region.*beta" + exists: true + value_contains: + - "us" + - "eu" + - key_pattern: "(?i)retry.*budget|budget.*retry" + exists: true + value_contains: + - "3" + - key_pattern: "(?i)apac" + exists: true + value_contains: + - "2026.3" + execution_checks: + - name: "flags tests (beta regions, retry budget, APAC gating)" + command: "pytest -q" + - name: "handoff captures all three facts" + command: "python3 verify_handoff.py" + +trajectory: + required_families: ["read", "memory", "edit", "execute"] + required_pre_edit_families: ["read", "memory"] + required_post_edit_families: ["execute"] + min_distinct_families: 4 + min_pre_edit_exploration_calls: 2 + min_post_edit_verification_calls: 1 + require_read_before_mutation: true + require_self_verification: true + +behavior: + require_plan: true + require_progress_updates: true + +judge: + rubric: | + You are grading a single ClawBench agent run for task t4-memory-recall-continuation (Tier 4: Memory Recall Continuation). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. 
+ - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: memory_continuation, multifile_reasoning. + Task family: multi_tool. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier5/t5-hallucination-resistant-evidence.yaml b/tasks-public/tier5/t5-hallucination-resistant-evidence.yaml new file mode 100644 index 0000000..17ef2f2 --- /dev/null +++ b/tasks-public/tier5/t5-hallucination-resistant-evidence.yaml @@ -0,0 +1,52 @@ +id: t5-hallucination-resistant-evidence +name: "Tier 5: Hallucination Resistant Evidence" +tier: tier5 +family: adversarial +surface: coding +pool: public_dev +subsets: [hard] +capabilities: [research_synthesis, tool_composition] +timeout_seconds: 600 +setup: + asset_packs: + - t5_hallucination_resistant_evidence + +user: + max_turns: 2 + turns: + - message: "Using only the local docs in `docs/`, write `answer.txt` with the exact support window for build 3.2 in the form `Support window: ...`, and write `evidence.md` citing the file and quoted line you used. Do not guess." 
+ +completion: + files: + - path: evidence.md + exists: true + content_contains: + - "maintenance_notes.md" + - "18 months" + execution_checks: + - name: "answer verification" + command: "python3 verify_answer.py" + +trajectory: + required_families: ["read", "edit"] + required_pre_edit_families: ["read"] + min_distinct_families: 2 + min_pre_edit_exploration_calls: 2 + require_read_before_mutation: true + require_self_verification: true + +behavior: + require_plan: true + require_progress_updates: true + +judge: + rubric: | + Reward answers that are tightly grounded in the local docs and that make the evidence artifact easy to audit. + The score should be high only if `answer.txt` is precise, `evidence.md` cites the right file and quote, and the evidence is sufficient for a reviewer to verify the claim quickly. + Penalize vague sourcing, quotes that do not actually support the answer, or any sign of guessing beyond the provided docs. + artifact_paths: + - answer.txt + - evidence.md + include_transcript: true + include_completion_feedback: true + passing_threshold: 0.8