feat: custom judge criteria and multi-dimensional scoring
Add configurable judge rubrics and multi-dimensional scoring with
weighted aggregation. New config fields: judge_criteria (free text)
and judge_dimensions (a list of {name, weight, description}). The CLI
--judge-criteria flag provides a quick override. The judge adapter
computes weighted aggregate scores and enriches its feedback with
per-dimension breakdowns.
Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -29,7 +29,7 @@ def judge_lm() -> dspy.LM:
|
||||
"""Dummy LM for judging (ChainOfThought requires reasoning field)."""
|
||||
return dspy.utils.DummyLM(
|
||||
[
|
||||
{"reasoning": "Evaluating output.", "score": "0.8", "feedback": "Good response."},
|
||||
{"reasoning": "Evaluating output.", "score": "0.8", "feedback": "Good response.", "dimension_scores": "{}"},
|
||||
]
|
||||
)
|
||||
|
||||
@@ -99,9 +99,9 @@ class TestDSPyJudgeAdapterOwnLM:
|
||||
@pytest.mark.asyncio
|
||||
async def test_does_not_use_global_lm(self) -> None:
|
||||
judge_lm = dspy.utils.DummyLM(
|
||||
[{"reasoning": "ok", "score": "0.9", "feedback": "Judge-specific response"}]
|
||||
[{"reasoning": "ok", "score": "0.9", "feedback": "Judge-specific response", "dimension_scores": "{}"}]
|
||||
)
|
||||
global_lm = dspy.utils.DummyLM([{"reasoning": "no", "score": "0.1", "feedback": "Wrong LM!"}])
|
||||
global_lm = dspy.utils.DummyLM([{"reasoning": "no", "score": "0.1", "feedback": "Wrong LM!", "dimension_scores": "{}"}])
|
||||
dspy.configure(lm=global_lm)
|
||||
|
||||
adapter = DSPyJudgeAdapter(lm=judge_lm)
|
||||
@@ -176,7 +176,7 @@ class TestDSPySyntheticAdapterOwnLM:
|
||||
class TestPerModelOverrides:
|
||||
"""Verify that per-model api_base/api_key_env are passed through to dspy.LM."""
|
||||
|
||||
@patch("prometheus.cli.app.dspy.LM")
|
||||
@patch("prometheus.cli.commands.optimize.dspy.LM")
|
||||
def test_per_model_api_base_override(self, mock_lm_cls: MagicMock) -> None:
|
||||
"""Per-model api_base should be used instead of global."""
|
||||
mock_lm_cls.return_value = MagicMock()
|
||||
|
||||
Reference in New Issue
Block a user