fix(scoring): include judge gate in run cache key
Some checks failed
CI / Python ${{ matrix.python-version }} test suite (3.11) (push) Has been cancelled
CI / Python ${{ matrix.python-version }} test suite (3.12) (push) Has been cancelled

This commit is contained in:
Vincent Koc 2026-04-28 22:50:06 -07:00
parent fbb13ac4d9
commit 2670dcadf0
No known key found for this signature in database
2 changed files with 11 additions and 0 deletions

View File

@@ -547,6 +547,7 @@ class BenchmarkHarness:
"adapter": self.adapter,
"prompt_variant": self.prompt_variant,
"judge_model": self.judge_model,
"judge_affects_score": self.judge_affects_score,
"benchmark_version": __version__,
"task_fingerprint": _task_definition_fingerprint(task),
}

View File

@@ -191,6 +191,15 @@ def test_run_cache_path_includes_scoring_inputs(tmp_path: Path):
judge_model="judge-b",
randomize_order=False,
)
different_judge_gate = BenchmarkHarness(
gateway_config=GatewayConfig(),
model="test/model",
task_ids=[task.id],
prompt_variant="clear",
judge_model="judge-a",
judge_affects_score=True,
randomize_order=False,
)
different_prompt = BenchmarkHarness(
gateway_config=GatewayConfig(),
model="test/model",
@@ -205,6 +214,7 @@ def test_run_cache_path_includes_scoring_inputs(tmp_path: Path):
assert "v2-" in str(base_path)
assert base_path == same._run_cache_path(tmp_path, task, 0)
assert base_path != different_judge._run_cache_path(tmp_path, task, 0)
assert base_path != different_judge_gate._run_cache_path(tmp_path, task, 0)
assert base_path != different_prompt._run_cache_path(tmp_path, task, 0)