feat: custom judge criteria and multi-dimensional scoring
Add configurable judge rubrics and multi-dimensional scoring with
weighted aggregation. New config fields: judge_criteria (free text)
and judge_dimensions (list of {name, weight, description}). CLI
--judge-criteria flag provides quick overrides. The judge adapter
computes weighted aggregate scores and enriches feedback with
per-dimension breakdowns.
Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -6,12 +6,15 @@ Implements the JudgePort via the DSPy OutputJudge module.
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
from typing import Self
|
||||
from typing import Any
|
||||
|
||||
import dspy
|
||||
|
||||
from prometheus.application.dto import JudgeDimension
|
||||
from prometheus.domain.ports import JudgePort
|
||||
from prometheus.domain.scoring import weighted_aggregate
|
||||
from prometheus.infrastructure.dspy_modules import OutputJudge
|
||||
from prometheus.infrastructure.retry import async_retry_with_backoff
|
||||
|
||||
@@ -25,6 +28,9 @@ class DSPyJudgeAdapter(JudgePort):
|
||||
instead of crashing the whole batch.
|
||||
|
||||
Judge calls run in parallel (bounded by *max_concurrency*).
|
||||
|
||||
When *judge_criteria* or *judge_dimensions* are provided, the judge applies
|
||||
custom rubrics and/or multi-dimensional scoring with weighted aggregation.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -33,12 +39,26 @@ class DSPyJudgeAdapter(JudgePort):
|
||||
max_retries: int = 3,
|
||||
retry_delay_base: float = 1.0,
|
||||
max_concurrency: int = 5,
|
||||
judge_criteria: str | None = None,
|
||||
judge_dimensions: list[JudgeDimension] | None = None,
|
||||
) -> None:
|
||||
self._lm = lm
|
||||
self._judge = OutputJudge()
|
||||
self._max_retries = max_retries
|
||||
self._retry_delay_base = retry_delay_base
|
||||
self._semaphore = asyncio.Semaphore(max_concurrency)
|
||||
self._judge_criteria = judge_criteria or ""
|
||||
self._judge_dimensions = judge_dimensions or []
|
||||
self._dimension_names = (
|
||||
",".join(d.name for d in self._judge_dimensions)
|
||||
if self._judge_dimensions
|
||||
else ""
|
||||
)
|
||||
self._weights: dict[str, float] = (
|
||||
{d.name: d.weight for d in self._judge_dimensions}
|
||||
if self._judge_dimensions
|
||||
else {}
|
||||
)
|
||||
|
||||
async def judge_batch(
|
||||
self,
|
||||
@@ -74,7 +94,7 @@ class DSPyJudgeAdapter(JudgePort):
|
||||
pred = await asyncio.to_thread(
|
||||
self._sync_judge, task_description, input_text, output_text,
|
||||
)
|
||||
return (pred.score, pred.feedback)
|
||||
return self._aggregate_result(pred)
|
||||
|
||||
return await async_retry_with_backoff(
|
||||
_call,
|
||||
@@ -88,4 +108,35 @@ class DSPyJudgeAdapter(JudgePort):
|
||||
task_description=task_description,
|
||||
input_text=input_text,
|
||||
output_text=output_text,
|
||||
judge_criteria=self._judge_criteria,
|
||||
dimension_names=self._dimension_names,
|
||||
)
|
||||
|
||||
def _aggregate_result(self, pred: Any) -> tuple[float, str]:
|
||||
"""Compute weighted aggregate score from dimension scores if available."""
|
||||
if not self._judge_dimensions:
|
||||
return (pred.score, pred.feedback)
|
||||
|
||||
# Parse per-dimension scores from LLM output
|
||||
dim_scores: dict[str, float] = {}
|
||||
try:
|
||||
raw = json.loads(pred.dimension_scores)
|
||||
if isinstance(raw, dict):
|
||||
for name in self._weights:
|
||||
val = raw.get(name)
|
||||
if val is not None:
|
||||
dim_scores[name] = max(0.0, min(1.0, float(val)))
|
||||
except (json.JSONDecodeError, ValueError, TypeError):
|
||||
logger.debug("Failed to parse dimension_scores, falling back to overall score")
|
||||
|
||||
if not dim_scores:
|
||||
return (pred.score, pred.feedback)
|
||||
|
||||
aggregate = weighted_aggregate(dim_scores, self._weights)
|
||||
# Enrich feedback with per-dimension breakdown
|
||||
dim_breakdown = ", ".join(
|
||||
f"{name}={dim_scores.get(name, 0.0):.2f}"
|
||||
for name in self._weights
|
||||
)
|
||||
feedback = f"{pred.feedback} [{dim_breakdown}]"
|
||||
return (aggregate, feedback)
|
||||
|
||||
Reference in New Issue
Block a user