fix(eval): stabilize OpenClaw container sweeps
Some checks failed
CI / Python ${{ matrix.python-version }} test suite (3.11) (push) Has been cancelled
CI / Python ${{ matrix.python-version }} test suite (3.12) (push) Has been cancelled

This commit is contained in:
scoootscooob 2026-05-02 02:50:57 -07:00
parent f09a9f4bf7
commit 5dfa4c9280
6 changed files with 100 additions and 4 deletions

View File

@ -14,7 +14,7 @@ RUN apt-get update && \
RUN ln -s /app /openclaw
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
RUN npx -y playwright@1.59.1 install --with-deps chromium && \
RUN cd /tmp && npx -y playwright@1.59.1 install --with-deps chromium && \
CHROME_PATH="$(find /ms-playwright -path '*/chrome' -type f | sort | head -n 1)" && \
test -x "$CHROME_PATH" && \
ln -sf "$CHROME_PATH" /usr/bin/chromium
@ -28,6 +28,7 @@ COPY --chown=node:node tasks-public/ tasks-public/
COPY --chown=node:node tasks-domain/ tasks-domain/
COPY --chown=node:node profiles/ profiles/
COPY --chown=node:node baselines/ baselines/
COPY --chown=node:node scripts/ scripts/
COPY --chown=node:node app.py .
RUN python3 -m pip install --break-system-packages --no-cache-dir .

View File

@ -50,6 +50,7 @@ class SubmissionRequest(BaseModel):
runs_per_task: int = Field(default=3, ge=1, le=10)
max_parallel_lanes: int = Field(default=1, ge=1, le=8)
tier: str | None = None # Filter to a specific tier
task_ids: list[str] = Field(default_factory=list)
scenario: str | None = None
prompt_variant: str = "clear"
submitter: str = "" # HF username
@ -65,6 +66,7 @@ class SubmissionRequest(BaseModel):
"runs_per_task": self.runs_per_task,
"max_parallel_lanes": self.max_parallel_lanes,
"tier": self.tier or "",
"task_ids": [task_id.strip() for task_id in self.task_ids if task_id.strip()],
"scenario": self.scenario or "",
"prompt_variant": self.prompt_variant,
}

View File

@ -35,6 +35,12 @@ STALE_EVALUATION_SECONDS = max(
int(os.environ.get("CLAWBENCH_STALE_EVALUATION_SECONDS", "1800")),
)
OPENCLAW_EVAL_EXEC_HOSTS = {"auto", "gateway", "sandbox", "node"}
# System prompt installed as `agents.defaults.systemPromptOverride` by
# `EvalWorker._apply_eval_model_defaults`. It tells the agent to act directly in
# the workspace, not to ask follow-up questions, and to keep the final reply brief.
OPENCLAW_EVAL_SYSTEM_PROMPT = (
"You are running an OpenClaw benchmark task. Complete the user's request in the current "
"workspace using the available tools when needed. For file, code, browser, shell, or memory "
"tasks, make the requested changes directly and verify them when practical. Do not ask "
"follow-up questions during the benchmark. Keep any final reply brief."
)
@dataclass
@ -676,6 +682,7 @@ class EvalWorker:
if self._active_model:
_set_nested(data, "agents.defaults.model.primary", self._active_model)
_set_nested(data, "agents.defaults.subagents.model.primary", self._active_model)
self._apply_eval_model_defaults(data, self._active_model)
tmp_path = cfg_path.with_suffix(".json.tmp")
tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
@ -1128,8 +1135,7 @@ class EvalWorker:
tmp_path.write_text(json.dumps(approvals, indent=2), encoding="utf-8")
tmp_path.replace(approvals_path)
@staticmethod
def _patch_openclaw_config(pairs: list[tuple[str, object]]) -> None:
def _patch_openclaw_config(self, pairs: list[tuple[str, object]]) -> None:
state_dir = Path(os.environ.get("OPENCLAW_STATE_DIR") or os.path.expanduser("~/.openclaw"))
config_path = state_dir / "openclaw.json"
if not config_path.exists():
@ -1147,12 +1153,50 @@ class EvalWorker:
if cursor.get(parts[-1]) != value:
cursor[parts[-1]] = value
changed = True
if self._active_model:
changed = self._apply_eval_model_defaults(data, self._active_model) or changed
if not changed:
return
tmp_path = config_path.with_suffix(".json.tmp")
tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
tmp_path.replace(config_path)
@staticmethod
def _apply_eval_model_defaults(data: dict, model: str) -> bool:
    """Pin the benchmark system prompt and per-model params inside ``data``.

    The config mapping is edited in place. Missing (or non-dict) containers
    along ``agents.defaults.models[model].params`` are created as empty dicts
    first, then these values are enforced:

    * ``agents.defaults.systemPromptOverride`` -> ``OPENCLAW_EVAL_SYSTEM_PROMPT``
    * ``params["fastMode"]`` -> ``True``
    * for models whose id starts with ``"openai/"`` only:
      ``params["transport"]`` -> ``"sse"`` and ``params["openaiWsWarmup"]`` -> ``False``

    Args:
        data: Parsed OpenClaw configuration; mutated in place.
        model: Model identifier used as the key under ``agents.defaults.models``.

    Returns:
        ``True`` if at least one of the enforced values had to be written,
        ``False`` if every value was already as required.
    """

    def _ensure_dict(parent: dict, key: str) -> dict:
        # Reuse the existing mapping at ``key``; otherwise install a fresh one
        # (this also replaces a value that is present but not a dict).
        child = parent.setdefault(key, {})
        if isinstance(child, dict):
            return child
        fresh: dict = {}
        parent[key] = fresh
        return fresh

    defaults = _ensure_dict(_ensure_dict(data, "agents"), "defaults")
    model_entry = _ensure_dict(_ensure_dict(defaults, "models"), model)
    params = _ensure_dict(model_entry, "params")

    dirty = False

    if defaults.get("systemPromptOverride") != OPENCLAW_EVAL_SYSTEM_PROMPT:
        defaults["systemPromptOverride"] = OPENCLAW_EVAL_SYSTEM_PROMPT
        dirty = True

    # Identity comparison on purpose: anything other than the literal ``True``
    # (including truthy stand-ins such as 1) is overwritten.
    if params.get("fastMode") is not True:
        params["fastMode"] = True
        dirty = True

    # The transport pins below apply to ``openai/`` models only.
    if not model.startswith("openai/"):
        return dirty

    if params.get("transport") != "sse":
        params["transport"] = "sse"
        dirty = True

    # Same identity rule as ``fastMode``: only the literal ``False`` is accepted.
    if params.get("openaiWsWarmup") is not False:
        params["openaiWsWarmup"] = False
        dirty = True

    return dirty
def _find_gateway_cmd(self) -> list[str] | None:
import shutil

View File

@ -131,11 +131,27 @@ set_nested(data, "agents.defaults.skipBootstrap", True)
set_nested(data, "agents.defaults.sandbox.mode", "off")
set_nested(data, "agents.defaults.model.primary", os.environ["SWEEP_MODEL"])
set_nested(data, "agents.defaults.subagents.model.primary", os.environ["SWEEP_MODEL"])
set_nested(
data,
"agents.defaults.systemPromptOverride",
"You are running an OpenClaw benchmark task. Complete the user's request in the current "
"workspace using the available tools when needed. For file, code, browser, shell, or memory "
"tasks, make the requested changes directly and verify them when practical. Do not ask "
"follow-up questions during the benchmark. Keep any final reply brief.",
)
set_nested(data, "tools.exec.host", os.environ.get("OPENCLAW_EXEC_HOST", "gateway"))
set_nested(data, "tools.exec.security", "full")
set_nested(data, "tools.exec.ask", "off")
set_nested(data, "approvals.exec.enabled", False)
models = data.setdefault("agents", {}).setdefault("defaults", {}).setdefault("models", {})
model_entry = models.setdefault(os.environ["SWEEP_MODEL"], {})
params = model_entry.setdefault("params", {})
params["fastMode"] = True
if os.environ["SWEEP_MODEL"].startswith("openai/"):
params["transport"] = "sse"
params["openaiWsWarmup"] = False
cfg_path.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")
approvals_path = cfg_path.with_name("exec-approvals.json")

View File

@ -20,6 +20,7 @@ def test_submission_request_defaults_to_single_parallel_lane():
assert request.max_parallel_lanes == 1
assert request.runs_per_task == 3
assert request.judge_affects_score is False
assert request.task_ids == []
def test_submission_request_fingerprint_includes_judge_score_gate():
@ -33,6 +34,16 @@ def test_submission_request_fingerprint_includes_judge_score_gate():
assert advisory.active_fingerprint() != weighted.active_fingerprint()
def test_submission_request_fingerprint_includes_task_ids():
    """A request narrowed to explicit task IDs must fingerprint differently from an unfiltered one."""
    model = "anthropic/claude-sonnet-4-6"
    unfiltered = SubmissionRequest(model=model)
    narrowed = SubmissionRequest(model=model, task_ids=["t1-fs-quick-note"])

    unfiltered_fp = unfiltered.active_fingerprint()
    narrowed_fp = narrowed.active_fingerprint()

    assert unfiltered_fp != narrowed_fp
def test_save_local_replaces_queue_file_atomically(tmp_path, monkeypatch):
monkeypatch.setattr(queue_module, "LOCAL_QUEUE_DIR", tmp_path)
monkeypatch.setattr(queue_module, "HF_TOKEN", "")

View File

@ -6,7 +6,14 @@ from types import SimpleNamespace
import pytest
from clawbench.queue import Job, JobQueue, JobStatus, SubmissionRequest
from clawbench.worker import GATEWAY_PORT, GATEWAY_PORT_SPACING, EvalWorker, JobProgressTracker, ParallelLane
from clawbench.worker import (
GATEWAY_PORT,
GATEWAY_PORT_SPACING,
OPENCLAW_EVAL_SYSTEM_PROMPT,
EvalWorker,
JobProgressTracker,
ParallelLane,
)
class DummyTask:
@ -119,6 +126,8 @@ def test_configure_browser_runtime_pins_subagents_to_active_model(monkeypatch):
"defaults": {
"skipBootstrap": True,
"model": {"primary": "openai-codex/gpt-5.4"},
"models": {"openai-codex/gpt-5.4": {"params": {"fastMode": True}}},
"systemPromptOverride": OPENCLAW_EVAL_SYSTEM_PROMPT,
"subagents": {"model": {"primary": "openai-codex/gpt-5.4"}},
}
},
@ -128,6 +137,19 @@ def test_configure_browser_runtime_pins_subagents_to_active_model(monkeypatch):
}
def test_eval_model_defaults_pin_openai_to_sse_transport() -> None:
    """Starting from an empty config, an ``openai/`` model gets fastMode plus the SSE transport pins."""
    model_id = "openai/gpt-5.5"
    config: dict[str, object] = {}
    expected_params = {
        "fastMode": True,
        "transport": "sse",
        "openaiWsWarmup": False,
    }

    was_modified = EvalWorker._apply_eval_model_defaults(config, model_id)

    assert was_modified is True
    assert config["agents"]["defaults"]["models"][model_id]["params"] == expected_params
@pytest.mark.asyncio
async def test_prepare_benchmark_run_restarts_gateway_on_task_boundary(monkeypatch):
worker = EvalWorker(JobQueue())