feat: custom judge criteria and multi-dimensional scoring
Add configurable judge rubrics and multi-dimensional scoring with
weighted aggregation. New config fields: judge_criteria (free text)
and judge_dimensions (a list of {name, weight, description}). The CLI
--judge-criteria flag provides quick overrides. The judge adapter
computes weighted aggregate scores and enriches feedback with
per-dimension breakdowns.
Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -124,12 +124,11 @@ class TestCircuitBreaker:
|
||||
circuit_breaker_threshold=3,
|
||||
error_strategy="skip",
|
||||
)
|
||||
with patch.object(loop, "_log"):
|
||||
state = await loop.run(
|
||||
Prompt("test"),
|
||||
[SyntheticExample("in", id=0), SyntheticExample("in2", id=1)],
|
||||
"task",
|
||||
)
|
||||
state = await loop.run(
|
||||
Prompt("test"),
|
||||
[SyntheticExample("in", id=0), SyntheticExample("in2", id=1)],
|
||||
"task",
|
||||
)
|
||||
|
||||
error_events = [h for h in state.history if h.get("event") == "error"]
|
||||
cb_events = [h for h in state.history if h.get("event") == "circuit_breaker"]
|
||||
@@ -165,13 +164,12 @@ class TestCircuitBreaker:
|
||||
circuit_breaker_threshold=3,
|
||||
error_strategy="abort",
|
||||
)
|
||||
with patch.object(loop, "_log"):
|
||||
with pytest.raises(RuntimeError, match="LLM down"):
|
||||
await loop.run(
|
||||
Prompt("test"),
|
||||
[SyntheticExample("in", id=0), SyntheticExample("in2", id=1)],
|
||||
"task",
|
||||
)
|
||||
with pytest.raises(RuntimeError, match="LLM down"):
|
||||
await loop.run(
|
||||
Prompt("test"),
|
||||
[SyntheticExample("in", id=0), SyntheticExample("in2", id=1)],
|
||||
"task",
|
||||
)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_resets_on_success(self):
|
||||
@@ -216,12 +214,11 @@ class TestCircuitBreaker:
|
||||
circuit_breaker_threshold=3,
|
||||
error_strategy="skip",
|
||||
)
|
||||
with patch.object(loop, "_log"):
|
||||
state = await loop.run(
|
||||
Prompt("test"),
|
||||
[SyntheticExample("in", id=0), SyntheticExample("in2", id=1)],
|
||||
"task",
|
||||
)
|
||||
state = await loop.run(
|
||||
Prompt("test"),
|
||||
[SyntheticExample("in", id=0), SyntheticExample("in2", id=1)],
|
||||
"task",
|
||||
)
|
||||
|
||||
# Should NOT have tripped — 2 fails, then success reset the counter
|
||||
cb_events = [h for h in state.history if h.get("event") == "circuit_breaker"]
|
||||
@@ -277,6 +274,10 @@ class TestPerCallIsolation:
|
||||
adapter._max_retries = 1
|
||||
adapter._retry_delay_base = 0
|
||||
adapter._semaphore = __import__("asyncio").Semaphore(5)
|
||||
adapter._judge_criteria = ""
|
||||
adapter._judge_dimensions = []
|
||||
adapter._dimension_names = ""
|
||||
adapter._weights = {}
|
||||
|
||||
# Mock _judge to fail on first call, succeed on second
|
||||
call_count = 0
|
||||
|
||||
Reference in New Issue
Block a user