diff --git a/Dockerfile b/Dockerfile index 97c9b82..462d89a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,7 @@ RUN apt-get update && \ RUN ln -s /app /openclaw ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright -RUN npx -y playwright@1.59.1 install --with-deps chromium && \ +RUN cd /tmp && npx -y playwright@1.59.1 install --with-deps chromium && \ CHROME_PATH="$(find /ms-playwright -path '*/chrome' -type f | sort | head -n 1)" && \ test -x "$CHROME_PATH" && \ ln -sf "$CHROME_PATH" /usr/bin/chromium @@ -28,6 +28,7 @@ COPY --chown=node:node tasks-public/ tasks-public/ COPY --chown=node:node tasks-domain/ tasks-domain/ COPY --chown=node:node profiles/ profiles/ COPY --chown=node:node baselines/ baselines/ +COPY --chown=node:node scripts/ scripts/ COPY --chown=node:node app.py . RUN python3 -m pip install --break-system-packages --no-cache-dir . diff --git a/clawbench/queue.py b/clawbench/queue.py index da718a5..32d84af 100644 --- a/clawbench/queue.py +++ b/clawbench/queue.py @@ -50,6 +50,7 @@ class SubmissionRequest(BaseModel): runs_per_task: int = Field(default=3, ge=1, le=10) max_parallel_lanes: int = Field(default=1, ge=1, le=8) tier: str | None = None # Filter to a specific tier + task_ids: list[str] = Field(default_factory=list) scenario: str | None = None prompt_variant: str = "clear" submitter: str = "" # HF username @@ -65,6 +66,7 @@ class SubmissionRequest(BaseModel): "runs_per_task": self.runs_per_task, "max_parallel_lanes": self.max_parallel_lanes, "tier": self.tier or "", + "task_ids": [task_id.strip() for task_id in self.task_ids if task_id.strip()], "scenario": self.scenario or "", "prompt_variant": self.prompt_variant, } diff --git a/clawbench/worker.py b/clawbench/worker.py index 35fa861..d9396fd 100644 --- a/clawbench/worker.py +++ b/clawbench/worker.py @@ -35,6 +35,12 @@ STALE_EVALUATION_SECONDS = max( int(os.environ.get("CLAWBENCH_STALE_EVALUATION_SECONDS", "1800")), ) OPENCLAW_EVAL_EXEC_HOSTS = {"auto", "gateway", "sandbox", "node"} +OPENCLAW_EVAL_SYSTEM_PROMPT = ( + "You are running an OpenClaw benchmark task. Complete the user's request in the current " + "workspace using the available tools when needed. For file, code, browser, shell, or memory " + "tasks, make the requested changes directly and verify them when practical. Do not ask " + "follow-up questions during the benchmark. Keep any final reply brief." +) @dataclass @@ -676,6 +682,7 @@ class EvalWorker: if self._active_model: _set_nested(data, "agents.defaults.model.primary", self._active_model) _set_nested(data, "agents.defaults.subagents.model.primary", self._active_model) + self._apply_eval_model_defaults(data, self._active_model) tmp_path = cfg_path.with_suffix(".json.tmp") tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8") @@ -1128,8 +1135,7 @@ class EvalWorker: tmp_path.write_text(json.dumps(approvals, indent=2), encoding="utf-8") tmp_path.replace(approvals_path) - @staticmethod - def _patch_openclaw_config(pairs: list[tuple[str, object]]) -> None: + def _patch_openclaw_config(self, pairs: list[tuple[str, object]]) -> None: state_dir = Path(os.environ.get("OPENCLAW_STATE_DIR") or os.path.expanduser("~/.openclaw")) config_path = state_dir / "openclaw.json" if not config_path.exists(): @@ -1147,12 +1153,50 @@ class EvalWorker: if cursor.get(parts[-1]) != value: cursor[parts[-1]] = value changed = True + if self._active_model: + changed = self._apply_eval_model_defaults(data, self._active_model) or changed if not changed: return tmp_path = config_path.with_suffix(".json.tmp") tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8") tmp_path.replace(config_path) + @staticmethod + def _apply_eval_model_defaults(data: dict, model: str) -> bool: + """Force eval model parameters that keep benchmark turns low-latency.""" + agents = data.setdefault("agents", {}) + if not isinstance(agents, dict): + data["agents"] = agents = {} + defaults = agents.setdefault("defaults", {}) + if not isinstance(defaults, dict): + agents["defaults"] = defaults = {} + models = defaults.setdefault("models", {}) + if not isinstance(models, dict): + defaults["models"] = models = {} + entry = models.setdefault(model, {}) + if not isinstance(entry, dict): + entry = {} + models[model] = entry + params = entry.setdefault("params", {}) + if not isinstance(params, dict): + params = {} + entry["params"] = params + changed = False + if defaults.get("systemPromptOverride") != OPENCLAW_EVAL_SYSTEM_PROMPT: + defaults["systemPromptOverride"] = OPENCLAW_EVAL_SYSTEM_PROMPT + changed = True + if params.get("fastMode") is not True: + params["fastMode"] = True + changed = True + if model.startswith("openai/"): + if params.get("transport") != "sse": + params["transport"] = "sse" + changed = True + if params.get("openaiWsWarmup") is not False: + params["openaiWsWarmup"] = False + changed = True + return changed + def _find_gateway_cmd(self) -> list[str] | None: import shutil diff --git a/scripts/container_lane_eval.sh b/scripts/container_lane_eval.sh index a47ca06..6aa9a27 100755 --- a/scripts/container_lane_eval.sh +++ b/scripts/container_lane_eval.sh @@ -131,11 +131,27 @@ set_nested(data, "agents.defaults.skipBootstrap", True) set_nested(data, "agents.defaults.sandbox.mode", "off") set_nested(data, "agents.defaults.model.primary", os.environ["SWEEP_MODEL"]) set_nested(data, "agents.defaults.subagents.model.primary", os.environ["SWEEP_MODEL"]) +set_nested( + data, + "agents.defaults.systemPromptOverride", + "You are running an OpenClaw benchmark task. Complete the user's request in the current " + "workspace using the available tools when needed. For file, code, browser, shell, or memory " + "tasks, make the requested changes directly and verify them when practical. Do not ask " + "follow-up questions during the benchmark. Keep any final reply brief.", +) set_nested(data, "tools.exec.host", os.environ.get("OPENCLAW_EXEC_HOST", "gateway")) set_nested(data, "tools.exec.security", "full") set_nested(data, "tools.exec.ask", "off") set_nested(data, "approvals.exec.enabled", False) +models = data.setdefault("agents", {}).setdefault("defaults", {}).setdefault("models", {}) +model_entry = models.setdefault(os.environ["SWEEP_MODEL"], {}) +params = model_entry.setdefault("params", {}) +params["fastMode"] = True +if os.environ["SWEEP_MODEL"].startswith("openai/"): + params["transport"] = "sse" + params["openaiWsWarmup"] = False + cfg_path.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8") approvals_path = cfg_path.with_name("exec-approvals.json") diff --git a/tests/test_queue.py b/tests/test_queue.py index 92137d6..2db4fe5 100644 --- a/tests/test_queue.py +++ b/tests/test_queue.py @@ -20,6 +20,7 @@ def test_submission_request_defaults_to_single_parallel_lane(): assert request.max_parallel_lanes == 1 assert request.runs_per_task == 3 assert request.judge_affects_score is False + assert request.task_ids == [] def test_submission_request_fingerprint_includes_judge_score_gate(): @@ -33,6 +34,16 @@ def test_submission_request_fingerprint_includes_judge_score_gate(): assert advisory.active_fingerprint() != weighted.active_fingerprint() +def test_submission_request_fingerprint_includes_task_ids(): + all_tasks = SubmissionRequest(model="anthropic/claude-sonnet-4-6") + subset = SubmissionRequest( + model="anthropic/claude-sonnet-4-6", + task_ids=["t1-fs-quick-note"], + ) + + assert all_tasks.active_fingerprint() != subset.active_fingerprint() + + def test_save_local_replaces_queue_file_atomically(tmp_path, monkeypatch): monkeypatch.setattr(queue_module, "LOCAL_QUEUE_DIR", tmp_path) monkeypatch.setattr(queue_module, "HF_TOKEN", "") diff --git a/tests/test_worker.py b/tests/test_worker.py index ae6c08d..7ff27e1 100644 --- a/tests/test_worker.py +++ b/tests/test_worker.py @@ -6,7 +6,14 @@ from types import SimpleNamespace import pytest from clawbench.queue import Job, JobQueue, JobStatus, SubmissionRequest -from clawbench.worker import GATEWAY_PORT, GATEWAY_PORT_SPACING, EvalWorker, JobProgressTracker, ParallelLane +from clawbench.worker import ( + GATEWAY_PORT, + GATEWAY_PORT_SPACING, + OPENCLAW_EVAL_SYSTEM_PROMPT, + EvalWorker, + JobProgressTracker, + ParallelLane, +) class DummyTask: @@ -119,6 +126,8 @@ def test_configure_browser_runtime_pins_subagents_to_active_model(monkeypatch): "defaults": { "skipBootstrap": True, "model": {"primary": "openai-codex/gpt-5.4"}, + "models": {"openai-codex/gpt-5.4": {"params": {"fastMode": True}}}, + "systemPromptOverride": OPENCLAW_EVAL_SYSTEM_PROMPT, "subagents": {"model": {"primary": "openai-codex/gpt-5.4"}}, } }, @@ -128,6 +137,19 @@ def test_configure_browser_runtime_pins_subagents_to_active_model(monkeypatch): } +def test_eval_model_defaults_pin_openai_to_sse_transport() -> None: + data: dict[str, object] = {} + + changed = EvalWorker._apply_eval_model_defaults(data, "openai/gpt-5.5") + + assert changed is True + assert data["agents"]["defaults"]["models"]["openai/gpt-5.5"]["params"] == { + "fastMode": True, + "transport": "sse", + "openaiWsWarmup": False, + } + + @pytest.mark.asyncio async def test_prepare_benchmark_run_restarts_gateway_on_task_boundary(monkeypatch): worker = EvalWorker(JobQueue())