fix(eval): stabilize OpenClaw container sweeps
This commit is contained in:
parent
f09a9f4bf7
commit
5dfa4c9280
@ -14,7 +14,7 @@ RUN apt-get update && \
|
||||
RUN ln -s /app /openclaw
|
||||
|
||||
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
|
||||
RUN npx -y playwright@1.59.1 install --with-deps chromium && \
|
||||
RUN cd /tmp && npx -y playwright@1.59.1 install --with-deps chromium && \
|
||||
CHROME_PATH="$(find /ms-playwright -path '*/chrome' -type f | sort | head -n 1)" && \
|
||||
test -x "$CHROME_PATH" && \
|
||||
ln -sf "$CHROME_PATH" /usr/bin/chromium
|
||||
@ -28,6 +28,7 @@ COPY --chown=node:node tasks-public/ tasks-public/
|
||||
COPY --chown=node:node tasks-domain/ tasks-domain/
|
||||
COPY --chown=node:node profiles/ profiles/
|
||||
COPY --chown=node:node baselines/ baselines/
|
||||
COPY --chown=node:node scripts/ scripts/
|
||||
COPY --chown=node:node app.py .
|
||||
|
||||
RUN python3 -m pip install --break-system-packages --no-cache-dir .
|
||||
|
||||
@ -50,6 +50,7 @@ class SubmissionRequest(BaseModel):
|
||||
runs_per_task: int = Field(default=3, ge=1, le=10)
|
||||
max_parallel_lanes: int = Field(default=1, ge=1, le=8)
|
||||
tier: str | None = None # Filter to a specific tier
|
||||
task_ids: list[str] = Field(default_factory=list)
|
||||
scenario: str | None = None
|
||||
prompt_variant: str = "clear"
|
||||
submitter: str = "" # HF username
|
||||
@ -65,6 +66,7 @@ class SubmissionRequest(BaseModel):
|
||||
"runs_per_task": self.runs_per_task,
|
||||
"max_parallel_lanes": self.max_parallel_lanes,
|
||||
"tier": self.tier or "",
|
||||
"task_ids": [task_id.strip() for task_id in self.task_ids if task_id.strip()],
|
||||
"scenario": self.scenario or "",
|
||||
"prompt_variant": self.prompt_variant,
|
||||
}
|
||||
|
||||
@ -35,6 +35,12 @@ STALE_EVALUATION_SECONDS = max(
|
||||
int(os.environ.get("CLAWBENCH_STALE_EVALUATION_SECONDS", "1800")),
|
||||
)
|
||||
# Recognised exec-host names.
# NOTE(review): presumably the accepted values for the `tools.exec.host`
# setting -- confirm against the code that consumes this set.
OPENCLAW_EVAL_EXEC_HOSTS = {
    "auto",
    "gateway",
    "sandbox",
    "node",
}

# System prompt written to `agents.defaults.systemPromptOverride` by
# `EvalWorker._apply_eval_model_defaults`. The literal is split one sentence
# per fragment; adjacent fragments are concatenated at compile time, so every
# fragment except the last must keep its trailing space.
OPENCLAW_EVAL_SYSTEM_PROMPT = (
    "You are running an OpenClaw benchmark task. "
    "Complete the user's request in the current workspace using the available tools when needed. "
    "For file, code, browser, shell, or memory tasks, make the requested changes directly and "
    "verify them when practical. "
    "Do not ask follow-up questions during the benchmark. "
    "Keep any final reply brief."
)
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -676,6 +682,7 @@ class EvalWorker:
|
||||
if self._active_model:
|
||||
_set_nested(data, "agents.defaults.model.primary", self._active_model)
|
||||
_set_nested(data, "agents.defaults.subagents.model.primary", self._active_model)
|
||||
self._apply_eval_model_defaults(data, self._active_model)
|
||||
|
||||
tmp_path = cfg_path.with_suffix(".json.tmp")
|
||||
tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
|
||||
@ -1128,8 +1135,7 @@ class EvalWorker:
|
||||
tmp_path.write_text(json.dumps(approvals, indent=2), encoding="utf-8")
|
||||
tmp_path.replace(approvals_path)
|
||||
|
||||
@staticmethod
|
||||
def _patch_openclaw_config(pairs: list[tuple[str, object]]) -> None:
|
||||
def _patch_openclaw_config(self, pairs: list[tuple[str, object]]) -> None:
|
||||
state_dir = Path(os.environ.get("OPENCLAW_STATE_DIR") or os.path.expanduser("~/.openclaw"))
|
||||
config_path = state_dir / "openclaw.json"
|
||||
if not config_path.exists():
|
||||
@ -1147,12 +1153,50 @@ class EvalWorker:
|
||||
if cursor.get(parts[-1]) != value:
|
||||
cursor[parts[-1]] = value
|
||||
changed = True
|
||||
if self._active_model:
|
||||
changed = self._apply_eval_model_defaults(data, self._active_model) or changed
|
||||
if not changed:
|
||||
return
|
||||
tmp_path = config_path.with_suffix(".json.tmp")
|
||||
tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
|
||||
tmp_path.replace(config_path)
|
||||
|
||||
@staticmethod
def _apply_eval_model_defaults(data: dict, model: str) -> bool:
    """Force eval model parameters that keep benchmark turns low-latency.

    Mutates ``data`` in place so that:

    * ``agents.defaults.systemPromptOverride`` equals
      ``OPENCLAW_EVAL_SYSTEM_PROMPT``;
    * ``agents.defaults.models[model].params.fastMode`` is ``True``;
    * for models whose id starts with ``"openai/"``, ``params.transport`` is
      ``"sse"`` and ``params.openaiWsWarmup`` is ``False``.

    Any node on the ``agents -> defaults -> models -> <model> -> params``
    path that is missing, or present but not a dict, is replaced with an
    empty dict.

    Args:
        data: Parsed OpenClaw config mapping; modified in place.
        model: Model identifier used as the key under ``models``.

    Returns:
        ``True`` if at least one of the values listed above was written,
        ``False`` if they already had the desired values.
    """

    def child_dict(parent: dict, key: str) -> dict:
        # Fetch parent[key], creating it when absent and replacing it when
        # it holds something other than a dict.
        node = parent.setdefault(key, {})
        if not isinstance(node, dict):
            node = parent[key] = {}
        return node

    # Walk (and repair) the path top-down, in the same order as the keys nest.
    defaults = child_dict(child_dict(data, "agents"), "defaults")
    model_entry = child_dict(child_dict(defaults, "models"), model)
    params = child_dict(model_entry, "params")

    dirty = False

    if defaults.get("systemPromptOverride") != OPENCLAW_EVAL_SYSTEM_PROMPT:
        defaults["systemPromptOverride"] = OPENCLAW_EVAL_SYSTEM_PROMPT
        dirty = True

    # Identity checks on the booleans: a truthy non-bool such as 1 is still
    # rewritten to a real True/False.
    if params.get("fastMode") is not True:
        params["fastMode"] = True
        dirty = True

    if not model.startswith("openai/"):
        return dirty

    if params.get("transport") != "sse":
        params["transport"] = "sse"
        dirty = True
    if params.get("openaiWsWarmup") is not False:
        params["openaiWsWarmup"] = False
        dirty = True

    return dirty
|
||||
|
||||
def _find_gateway_cmd(self) -> list[str] | None:
|
||||
import shutil
|
||||
|
||||
|
||||
@ -131,11 +131,27 @@ set_nested(data, "agents.defaults.skipBootstrap", True)
|
||||
set_nested(data, "agents.defaults.sandbox.mode", "off")
|
||||
set_nested(data, "agents.defaults.model.primary", os.environ["SWEEP_MODEL"])
|
||||
set_nested(data, "agents.defaults.subagents.model.primary", os.environ["SWEEP_MODEL"])
|
||||
set_nested(
|
||||
data,
|
||||
"agents.defaults.systemPromptOverride",
|
||||
"You are running an OpenClaw benchmark task. Complete the user's request in the current "
|
||||
"workspace using the available tools when needed. For file, code, browser, shell, or memory "
|
||||
"tasks, make the requested changes directly and verify them when practical. Do not ask "
|
||||
"follow-up questions during the benchmark. Keep any final reply brief.",
|
||||
)
|
||||
set_nested(data, "tools.exec.host", os.environ.get("OPENCLAW_EXEC_HOST", "gateway"))
|
||||
set_nested(data, "tools.exec.security", "full")
|
||||
set_nested(data, "tools.exec.ask", "off")
|
||||
set_nested(data, "approvals.exec.enabled", False)
|
||||
|
||||
models = data.setdefault("agents", {}).setdefault("defaults", {}).setdefault("models", {})
|
||||
model_entry = models.setdefault(os.environ["SWEEP_MODEL"], {})
|
||||
params = model_entry.setdefault("params", {})
|
||||
params["fastMode"] = True
|
||||
if os.environ["SWEEP_MODEL"].startswith("openai/"):
|
||||
params["transport"] = "sse"
|
||||
params["openaiWsWarmup"] = False
|
||||
|
||||
cfg_path.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")
|
||||
|
||||
approvals_path = cfg_path.with_name("exec-approvals.json")
|
||||
|
||||
@ -20,6 +20,7 @@ def test_submission_request_defaults_to_single_parallel_lane():
|
||||
assert request.max_parallel_lanes == 1
|
||||
assert request.runs_per_task == 3
|
||||
assert request.judge_affects_score is False
|
||||
assert request.task_ids == []
|
||||
|
||||
|
||||
def test_submission_request_fingerprint_includes_judge_score_gate():
|
||||
@ -33,6 +34,16 @@ def test_submission_request_fingerprint_includes_judge_score_gate():
|
||||
assert advisory.active_fingerprint() != weighted.active_fingerprint()
|
||||
|
||||
|
||||
def test_submission_request_fingerprint_includes_task_ids():
    """A request limited to specific task ids must not share a fingerprint with the unfiltered request."""
    model_name = "anthropic/claude-sonnet-4-6"

    unfiltered = SubmissionRequest(model=model_name)
    filtered = SubmissionRequest(model=model_name, task_ids=["t1-fs-quick-note"])

    unfiltered_fp = unfiltered.active_fingerprint()
    filtered_fp = filtered.active_fingerprint()
    assert unfiltered_fp != filtered_fp
|
||||
|
||||
|
||||
def test_save_local_replaces_queue_file_atomically(tmp_path, monkeypatch):
|
||||
monkeypatch.setattr(queue_module, "LOCAL_QUEUE_DIR", tmp_path)
|
||||
monkeypatch.setattr(queue_module, "HF_TOKEN", "")
|
||||
|
||||
@ -6,7 +6,14 @@ from types import SimpleNamespace
|
||||
import pytest
|
||||
|
||||
from clawbench.queue import Job, JobQueue, JobStatus, SubmissionRequest
|
||||
from clawbench.worker import GATEWAY_PORT, GATEWAY_PORT_SPACING, EvalWorker, JobProgressTracker, ParallelLane
|
||||
from clawbench.worker import (
|
||||
GATEWAY_PORT,
|
||||
GATEWAY_PORT_SPACING,
|
||||
OPENCLAW_EVAL_SYSTEM_PROMPT,
|
||||
EvalWorker,
|
||||
JobProgressTracker,
|
||||
ParallelLane,
|
||||
)
|
||||
|
||||
|
||||
class DummyTask:
|
||||
@ -119,6 +126,8 @@ def test_configure_browser_runtime_pins_subagents_to_active_model(monkeypatch):
|
||||
"defaults": {
|
||||
"skipBootstrap": True,
|
||||
"model": {"primary": "openai-codex/gpt-5.4"},
|
||||
"models": {"openai-codex/gpt-5.4": {"params": {"fastMode": True}}},
|
||||
"systemPromptOverride": OPENCLAW_EVAL_SYSTEM_PROMPT,
|
||||
"subagents": {"model": {"primary": "openai-codex/gpt-5.4"}},
|
||||
}
|
||||
},
|
||||
@ -128,6 +137,19 @@ def test_configure_browser_runtime_pins_subagents_to_active_model(monkeypatch):
|
||||
}
|
||||
|
||||
|
||||
def test_eval_model_defaults_pin_openai_to_sse_transport() -> None:
    """Starting from an empty config, an ``openai/`` model gets fastMode, SSE transport and no WS warmup."""
    config: dict[str, object] = {}
    model_id = "openai/gpt-5.5"
    expected_params = {
        "fastMode": True,
        "transport": "sse",
        "openaiWsWarmup": False,
    }

    was_modified = EvalWorker._apply_eval_model_defaults(config, model_id)

    assert was_modified is True
    assert config["agents"]["defaults"]["models"][model_id]["params"] == expected_params
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_prepare_benchmark_run_restarts_gateway_on_task_boundary(monkeypatch):
|
||||
worker = EvalWorker(JobQueue())
|
||||
|
||||
Loading…
Reference in New Issue
Block a user