feat: custom judge criteria and multi-dimensional scoring

Add configurable judge rubrics and multi-dimensional scoring with
weighted aggregation. New config fields: judge_criteria (free text)
and judge_dimensions (list of {name, weight, description}). CLI
--judge-criteria flag provides quick overrides. The judge adapter
computes weighted aggregate scores and enriches feedback with
per-dimension breakdowns.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
FullStackDev
2026-03-29 15:40:21 +00:00
parent 336774a164
commit b9745566c8
8 changed files with 754 additions and 27 deletions

View File

@@ -6,12 +6,15 @@ Implements the JudgePort via the DSPy OutputJudge module.
from __future__ import annotations
import asyncio
import json
import logging
from typing import Self
from typing import Any
import dspy
from prometheus.application.dto import JudgeDimension
from prometheus.domain.ports import JudgePort
from prometheus.domain.scoring import weighted_aggregate
from prometheus.infrastructure.dspy_modules import OutputJudge
from prometheus.infrastructure.retry import async_retry_with_backoff
@@ -25,6 +28,9 @@ class DSPyJudgeAdapter(JudgePort):
instead of crashing the whole batch.
Judge calls run in parallel (bounded by *max_concurrency*).
When *judge_criteria* or *judge_dimensions* are provided, the judge applies
custom rubrics and/or multi-dimensional scoring with weighted aggregation.
"""
def __init__(
@@ -33,12 +39,26 @@ class DSPyJudgeAdapter(JudgePort):
max_retries: int = 3,
retry_delay_base: float = 1.0,
max_concurrency: int = 5,
judge_criteria: str | None = None,
judge_dimensions: list[JudgeDimension] | None = None,
) -> None:
self._lm = lm
self._judge = OutputJudge()
self._max_retries = max_retries
self._retry_delay_base = retry_delay_base
self._semaphore = asyncio.Semaphore(max_concurrency)
self._judge_criteria = judge_criteria or ""
self._judge_dimensions = judge_dimensions or []
self._dimension_names = (
",".join(d.name for d in self._judge_dimensions)
if self._judge_dimensions
else ""
)
self._weights: dict[str, float] = (
{d.name: d.weight for d in self._judge_dimensions}
if self._judge_dimensions
else {}
)
async def judge_batch(
self,
@@ -74,7 +94,7 @@ class DSPyJudgeAdapter(JudgePort):
pred = await asyncio.to_thread(
self._sync_judge, task_description, input_text, output_text,
)
return (pred.score, pred.feedback)
return self._aggregate_result(pred)
return await async_retry_with_backoff(
_call,
@@ -88,4 +108,35 @@ class DSPyJudgeAdapter(JudgePort):
task_description=task_description,
input_text=input_text,
output_text=output_text,
judge_criteria=self._judge_criteria,
dimension_names=self._dimension_names,
)
def _aggregate_result(self, pred: Any) -> tuple[float, str]:
    """Turn a judge prediction into a ``(score, feedback)`` pair.

    When no judge dimensions are configured, or when no usable
    per-dimension score can be recovered from ``pred.dimension_scores``,
    the judge's own ``pred.score`` and ``pred.feedback`` are returned
    unchanged.  Otherwise the score is ``weighted_aggregate`` applied to
    the recovered scores and the configured weights, and the feedback is
    suffixed with a ``[name=0.00, ...]`` breakdown.

    Args:
        pred: Judge prediction exposing ``score``, ``feedback`` and, when
            dimensions are configured, ``dimension_scores`` (a JSON object
            string or an already-decoded dict keyed by dimension name).

    Returns:
        ``(score, feedback)``.
    """
    import math  # local import: only needed for the NaN guard below

    if not self._judge_dimensions:
        return (pred.score, pred.feedback)

    # Accept an already-decoded mapping as well as a JSON string.
    raw = pred.dimension_scores
    if not isinstance(raw, dict):
        try:
            raw = json.loads(raw)
        except (json.JSONDecodeError, ValueError, TypeError):
            logger.debug("Failed to parse dimension_scores, falling back to overall score")
            return (pred.score, pred.feedback)
    if not isinstance(raw, dict):
        # Valid JSON, but not an object keyed by dimension name.
        return (pred.score, pred.feedback)

    # Validate each dimension on its own so that one malformed value does
    # not discard (or, depending on key order, partially keep) the others.
    dim_scores: dict[str, float] = {}
    for name in self._weights:
        val = raw.get(name)
        if val is None:
            continue
        try:
            score = float(val)
        except (ValueError, TypeError, OverflowError):
            logger.debug("Ignoring non-numeric score for dimension %r", name)
            continue
        if math.isnan(score):
            # json.loads accepts NaN, and max(0.0, min(1.0, nan)) evaluates
            # to 1.0 -- without this guard NaN would count as a perfect score.
            logger.debug("Ignoring NaN score for dimension %r", name)
            continue
        dim_scores[name] = max(0.0, min(1.0, score))

    if not dim_scores:
        return (pred.score, pred.feedback)

    aggregate = weighted_aggregate(dim_scores, self._weights)

    # Enrich feedback with per-dimension breakdown.
    # NOTE(review): dimensions without a usable score are printed as 0.00;
    # confirm this matches how weighted_aggregate treats missing entries.
    dim_breakdown = ", ".join(
        f"{name}={dim_scores.get(name, 0.0):.2f}"
        for name in self._weights
    )
    feedback = f"{pred.feedback} [{dim_breakdown}]"
    return (aggregate, feedback)