feat: custom judge criteria and multi-dimensional scoring

Add configurable judge rubrics and multi-dimensional scoring with weighted aggregation. New config fields: judge_criteria (free text) and judge_dimensions (list of {name, weight, description}). The CLI --judge-criteria flag provides quick overrides. The judge adapter computes weighted aggregate scores and enriches feedback with per-dimension breakdowns.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-29 15:40:21 +00:00
parent 336774a164
commit b9745566c8
8 changed files with 754 additions and 27 deletions
--- a/src/prometheus/infrastructure/judge_adapter.py
+++ b/src/prometheus/infrastructure/judge_adapter.py
@@ -6,12 +6,15 @@ Implements the JudgePort via the DSPy OutputJudge module.
 from __future__ import annotations

 import asyncio
+import json
 import logging
-from typing import Self
+from typing import Any

 import dspy

+from prometheus.application.dto import JudgeDimension
 from prometheus.domain.ports import JudgePort
+from prometheus.domain.scoring import weighted_aggregate
 from prometheus.infrastructure.dspy_modules import OutputJudge
 from prometheus.infrastructure.retry import async_retry_with_backoff

@@ -25,6 +28,9 @@ class DSPyJudgeAdapter(JudgePort):
    instead of crashing the whole batch.

    Judge calls run in parallel (bounded by *max_concurrency*).
+
+    When *judge_criteria* or *judge_dimensions* are provided, the judge applies
+    custom rubrics and/or multi-dimensional scoring with weighted aggregation.
    """

    def __init__(
@@ -33,12 +39,26 @@ class DSPyJudgeAdapter(JudgePort):
        max_retries: int = 3,
        retry_delay_base: float = 1.0,
        max_concurrency: int = 5,
+        judge_criteria: str | None = None,
+        judge_dimensions: list[JudgeDimension] | None = None,
    ) -> None:
        self._lm = lm
        self._judge = OutputJudge()
        self._max_retries = max_retries
        self._retry_delay_base = retry_delay_base
        self._semaphore = asyncio.Semaphore(max_concurrency)
+        self._judge_criteria = judge_criteria or ""
+        self._judge_dimensions = judge_dimensions or []
+        self._dimension_names = (
+            ",".join(d.name for d in self._judge_dimensions)
+            if self._judge_dimensions
+            else ""
+        )
+        self._weights: dict[str, float] = (
+            {d.name: d.weight for d in self._judge_dimensions}
+            if self._judge_dimensions
+            else {}
+        )

    async def judge_batch(
        self,
@@ -74,7 +94,7 @@ class DSPyJudgeAdapter(JudgePort):
            pred = await asyncio.to_thread(
                self._sync_judge, task_description, input_text, output_text,
            )
-            return (pred.score, pred.feedback)
+            return self._aggregate_result(pred)

        return await async_retry_with_backoff(
            _call,
@@ -88,4 +108,35 @@ class DSPyJudgeAdapter(JudgePort):
                task_description=task_description,
                input_text=input_text,
                output_text=output_text,
+                judge_criteria=self._judge_criteria,
+                dimension_names=self._dimension_names,
            )
+
+    def _aggregate_result(self, pred: Any) -> tuple[float, str]:
+        """Return ``(score, feedback)`` for a single judge prediction.
+
+        With no dimensions configured, ``pred.score`` and ``pred.feedback``
+        are passed through unchanged. Otherwise ``pred.dimension_scores`` is
+        read as a JSON object (or an already-decoded dict) keyed by dimension
+        name; each usable value for a configured dimension is clamped to
+        ``[0.0, 1.0]`` and the collected scores are combined by
+        ``weighted_aggregate`` with the configured weights. When no
+        configured dimension yields a usable value, the overall
+        ``pred.score`` is returned instead.
+
+        Args:
+            pred: Judge prediction exposing ``score`` and ``feedback`` and,
+                when dimensions are configured, ``dimension_scores``.
+
+        Returns:
+            The aggregate (or overall) score and the feedback text. When an
+            aggregate is computed, a ``[name=0.00, ...]`` breakdown is
+            appended to the feedback.
+        """
+        if not self._judge_dimensions:
+            return (pred.score, pred.feedback)
+
+        # A prediction lacking the field is handled like an unparsable one:
+        # json.loads(None) raises TypeError, which triggers the fallback.
+        payload = getattr(pred, "dimension_scores", None)
+        try:
+            raw = payload if isinstance(payload, dict) else json.loads(payload)
+        except (json.JSONDecodeError, ValueError, TypeError):
+            logger.debug("Failed to parse dimension_scores, falling back to overall score")
+            raw = None
+
+        # Collect per-dimension scores; only configured dimensions are read.
+        dim_scores: dict[str, float] = {}
+        if isinstance(raw, dict):
+            for name in self._weights:
+                val = raw.get(name)
+                if val is None:
+                    continue
+                try:
+                    score = float(val)
+                except (ValueError, TypeError, OverflowError):
+                    # One malformed entry must not discard the dimensions
+                    # that come after it, nor leave a half-filled result
+                    # behind a "falling back" log message.
+                    logger.debug("Ignoring non-numeric score for dimension %r", name)
+                    continue
+                if score != score:
+                    # NaN is the only float unequal to itself. json.loads
+                    # accepts it, and min()/max() clamping would silently
+                    # turn it into a perfect 1.0.
+                    logger.debug("Ignoring NaN score for dimension %r", name)
+                    continue
+                dim_scores[name] = max(0.0, min(1.0, score))
+
+        if not dim_scores:
+            return (pred.score, pred.feedback)
+
+        aggregate = weighted_aggregate(dim_scores, self._weights)
+        # Enrich feedback with per-dimension breakdown. Dimensions the judge
+        # did not score are printed as 0.00.
+        dim_breakdown = ", ".join(
+            f"{name}={dim_scores.get(name, 0.0):.2f}"
+            for name in self._weights
+        )
+        feedback = f"{pred.feedback} [{dim_breakdown}]"
+        return (aggregate, feedback)