feat: custom judge criteria and multi-dimensional scoring

Add configurable judge rubrics and multi-dimensional scoring with weighted aggregation. New config fields: judge_criteria (free text) and judge_dimensions (list of {name, weight, description}). The CLI --judge-criteria flag provides quick overrides. The judge adapter computes weighted aggregate scores and enriches feedback with per-dimension breakdowns.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-29 15:40:21 +00:00
parent 336774a164
commit b9745566c8
8 changed files with 754 additions and 27 deletions
--- a/tests/unit/test_adapter_config.py
+++ b/tests/unit/test_adapter_config.py
@@ -29,7 +29,7 @@ def judge_lm() -> dspy.LM:
    """Dummy LM for judging (ChainOfThought requires reasoning field)."""
    return dspy.utils.DummyLM(
        [
-            {"reasoning": "Evaluating output.", "score": "0.8", "feedback": "Good response."},
+            {"reasoning": "Evaluating output.", "score": "0.8", "feedback": "Good response.", "dimension_scores": "{}"},
        ]
    )

@@ -99,9 +99,9 @@ class TestDSPyJudgeAdapterOwnLM:
    @pytest.mark.asyncio
    async def test_does_not_use_global_lm(self) -> None:
        judge_lm = dspy.utils.DummyLM(
-            [{"reasoning": "ok", "score": "0.9", "feedback": "Judge-specific response"}]
+            [{"reasoning": "ok", "score": "0.9", "feedback": "Judge-specific response", "dimension_scores": "{}"}]
        )
-        global_lm = dspy.utils.DummyLM([{"reasoning": "no", "score": "0.1", "feedback": "Wrong LM!"}])
+        global_lm = dspy.utils.DummyLM([{"reasoning": "no", "score": "0.1", "feedback": "Wrong LM!", "dimension_scores": "{}"}])
        dspy.configure(lm=global_lm)

        adapter = DSPyJudgeAdapter(lm=judge_lm)
@@ -176,7 +176,7 @@ class TestDSPySyntheticAdapterOwnLM:
 class TestPerModelOverrides:
    """Verify that per-model api_base/api_key_env are passed through to dspy.LM."""

-    @patch("prometheus.cli.app.dspy.LM")
+    @patch("prometheus.cli.commands.optimize.dspy.LM")
    def test_per_model_api_base_override(self, mock_lm_cls: MagicMock) -> None:
        """Per-model api_base should be used instead of global."""
        mock_lm_cls.return_value = MagicMock()
--- a/tests/unit/test_error_handling.py
+++ b/tests/unit/test_error_handling.py
@@ -124,12 +124,11 @@ class TestCircuitBreaker:
            circuit_breaker_threshold=3,
            error_strategy="skip",
        )
-        with patch.object(loop, "_log"):
-            state = await loop.run(
-                Prompt("test"),
-                [SyntheticExample("in", id=0), SyntheticExample("in2", id=1)],
-                "task",
-            )
+        state = await loop.run(
+            Prompt("test"),
+            [SyntheticExample("in", id=0), SyntheticExample("in2", id=1)],
+            "task",
+        )

        error_events = [h for h in state.history if h.get("event") == "error"]
        cb_events = [h for h in state.history if h.get("event") == "circuit_breaker"]
@@ -165,13 +164,12 @@ class TestCircuitBreaker:
            circuit_breaker_threshold=3,
            error_strategy="abort",
        )
-        with patch.object(loop, "_log"):
-            with pytest.raises(RuntimeError, match="LLM down"):
-                await loop.run(
-                    Prompt("test"),
-                    [SyntheticExample("in", id=0), SyntheticExample("in2", id=1)],
-                    "task",
-                )
+        with pytest.raises(RuntimeError, match="LLM down"):
+            await loop.run(
+                Prompt("test"),
+                [SyntheticExample("in", id=0), SyntheticExample("in2", id=1)],
+                "task",
+            )

    @pytest.mark.asyncio
    async def test_resets_on_success(self):
@@ -216,12 +214,11 @@ class TestCircuitBreaker:
            circuit_breaker_threshold=3,
            error_strategy="skip",
        )
-        with patch.object(loop, "_log"):
-            state = await loop.run(
-                Prompt("test"),
-                [SyntheticExample("in", id=0), SyntheticExample("in2", id=1)],
-                "task",
-            )
+        state = await loop.run(
+            Prompt("test"),
+            [SyntheticExample("in", id=0), SyntheticExample("in2", id=1)],
+            "task",
+        )

        # Should NOT have tripped — 2 fails, then success reset the counter
        cb_events = [h for h in state.history if h.get("event") == "circuit_breaker"]
@@ -277,6 +274,10 @@ class TestPerCallIsolation:
        adapter._max_retries = 1
        adapter._retry_delay_base = 0
        adapter._semaphore = __import__("asyncio").Semaphore(5)
+        adapter._judge_criteria = ""
+        adapter._judge_dimensions = []
+        adapter._dimension_names = ""
+        adapter._weights = {}

        # Mock _judge to fail on first call, succeed on second
        call_count = 0