Files
Prompt-optimizer/src/prometheus/infrastructure/judge_adapter.py
FullStackDev a5bf2ad59c feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage
Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes
2 integration tests that broke when the codebase went async (DSPyLLMAdapter
and full pipeline tests now properly await coroutines).

277 tests pass (260 unit + 17 integration).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-29 19:13:50 +00:00

146 lines
4.9 KiB
Python

"""
Adapter: LLM-as-Judge.
Implements the JudgePort via the DSPy OutputJudge module.
"""
from __future__ import annotations

import asyncio
import json
import logging
import math
import threading
from typing import Any

import dspy

from prometheus.application.dto import JudgeDimension
from prometheus.domain.ports import JudgePort
from prometheus.domain.scoring import weighted_aggregate
from prometheus.infrastructure.dspy_modules import OutputJudge
from prometheus.infrastructure.retry import async_retry_with_backoff
logger = logging.getLogger(__name__)
class DSPyJudgeAdapter(JudgePort):
    """Evaluates a batch of (input, output) pairs by calling the Judge for each.

    Per-call isolation: a failure on one item returns a zero-score sentinel
    instead of crashing the whole batch.

    Judge calls run in parallel (bounded by *max_concurrency*); each blocking
    judge call is executed in a worker thread via ``asyncio.to_thread``.

    When *judge_criteria* or *judge_dimensions* are provided, the judge applies
    custom rubrics and/or multi-dimensional scoring with weighted aggregation.

    Attributes:
        call_count: Number of judge invocations that returned without raising.
    """

    def __init__(
        self,
        lm: dspy.LM,
        max_retries: int = 3,
        retry_delay_base: float = 1.0,
        max_concurrency: int = 5,
        judge_criteria: str | None = None,
        judge_dimensions: list[JudgeDimension] | None = None,
    ) -> None:
        """Configure the adapter.

        Args:
            lm: Language model activated (via ``dspy.context``) for every
                judge call.
            max_retries: Forwarded to ``async_retry_with_backoff``.
            retry_delay_base: Forwarded to ``async_retry_with_backoff``.
            max_concurrency: Maximum number of judge calls in flight at once.
            judge_criteria: Optional custom rubric; ``None`` becomes ``""``.
            judge_dimensions: Optional scoring dimensions; their names are
                sent to the judge and their weights drive the aggregation.

        Raises:
            ValueError: If *max_concurrency* is smaller than 1.
        """
        # A semaphore initialised with 0 would never admit a judge call, so
        # every batch would hang forever; reject that up front.
        if max_concurrency < 1:
            raise ValueError(
                f"max_concurrency must be >= 1, got {max_concurrency}"
            )

        self._lm = lm
        self._judge = OutputJudge()
        self._max_retries = max_retries
        self._retry_delay_base = retry_delay_base
        # Caps how many judge calls run concurrently.
        self._semaphore = asyncio.Semaphore(max_concurrency)
        self._judge_criteria = judge_criteria or ""
        self._judge_dimensions = judge_dimensions or []
        # Both expressions naturally collapse to "" / {} when no dimensions
        # are configured, so no explicit empty-case branch is needed.
        self._dimension_names = ",".join(d.name for d in self._judge_dimensions)
        self._weights: dict[str, float] = {
            d.name: d.weight for d in self._judge_dimensions
        }
        self.call_count: int = 0
        # ``_sync_judge`` runs on worker threads, so the counter update must
        # be serialised to avoid lost increments.
        self._call_count_lock = threading.Lock()

    async def judge_batch(
        self,
        task_description: str,
        pairs: list[tuple[str, str]],
    ) -> list[tuple[float, str]]:
        """Judge every ``(input_text, output_text)`` pair concurrently.

        Returns:
            One ``(score, feedback)`` tuple per pair, in the same order as
            *pairs*. Failed items are ``(0.0, "[judge error: ...]")``.
        """
        tasks = [
            self._judge_single_safe(task_description, input_text, output_text)
            for input_text, output_text in pairs
        ]
        return list(await asyncio.gather(*tasks))

    async def _judge_single_safe(
        self,
        task_description: str,
        input_text: str,
        output_text: str,
    ) -> tuple[float, str]:
        """Judge one pair, converting any failure into a zero-score sentinel."""
        async with self._semaphore:
            try:
                return await self._judge_single(
                    task_description, input_text, output_text
                )
            except Exception as exc:
                # Batch boundary: one bad item must not abort its siblings.
                logger.warning(
                    "Judge call failed for input '%s': %s", input_text[:40], exc
                )
                return (0.0, f"[judge error: {exc}]")

    async def _judge_single(
        self,
        task_description: str,
        input_text: str,
        output_text: str,
    ) -> tuple[float, str]:
        """Judge one pair, delegating retries to ``async_retry_with_backoff``."""

        async def _call() -> tuple[float, str]:
            # The judge call is blocking, so keep it off the event loop.
            pred = await asyncio.to_thread(
                self._sync_judge, task_description, input_text, output_text,
            )
            return self._aggregate_result(pred)

        return await async_retry_with_backoff(
            _call,
            max_retries=self._max_retries,
            retry_delay_base=self._retry_delay_base,
        )

    def _sync_judge(
        self, task_description: str, input_text: str, output_text: str
    ) -> Any:
        """Run the judge module synchronously under this adapter's LM.

        Called from worker threads; increments ``call_count`` only when the
        judge returns without raising.
        """
        with dspy.context(lm=self._lm):
            result = self._judge(
                task_description=task_description,
                input_text=input_text,
                output_text=output_text,
                judge_criteria=self._judge_criteria,
                dimension_names=self._dimension_names,
            )
        # ``+=`` on an attribute is a read-modify-write; guard it because
        # several worker threads may finish at the same time.
        with self._call_count_lock:
            self.call_count += 1
        return result

    def _aggregate_result(self, pred: Any) -> tuple[float, str]:
        """Compute weighted aggregate score from dimension scores if available.

        Without configured dimensions, or when no dimension score can be
        recovered from the prediction, the judge's overall ``score`` and
        ``feedback`` are returned unchanged. Otherwise the recovered scores
        are combined by ``weighted_aggregate`` and the feedback is suffixed
        with a per-dimension breakdown.
        """
        if not self._judge_dimensions:
            return (pred.score, pred.feedback)

        # A prediction without the attribute is treated like unparseable
        # output rather than as a failed judge call.
        dim_scores = self._parse_dimension_scores(
            getattr(pred, "dimension_scores", None)
        )
        if not dim_scores:
            logger.debug("Failed to parse dimension_scores, falling back to overall score")
            return (pred.score, pred.feedback)

        aggregate = weighted_aggregate(dim_scores, self._weights)
        # Enrich feedback with per-dimension breakdown
        dim_breakdown = ", ".join(
            f"{name}={dim_scores.get(name, 0.0):.2f}"
            for name in self._weights
        )
        feedback = f"{pred.feedback} [{dim_breakdown}]"
        return (aggregate, feedback)

    def _parse_dimension_scores(self, payload: Any) -> dict[str, float]:
        """Extract clamped ``[0.0, 1.0]`` scores for the configured dimensions.

        *payload* may be a JSON object string or an already-decoded ``dict``.
        Each value is parsed independently, so one malformed entry cannot
        discard the valid ones. Unknown names, ``None``, non-numeric and NaN
        values are skipped. Returns ``{}`` when nothing usable is found.
        """
        try:
            raw = payload if isinstance(payload, dict) else json.loads(payload)
        except (json.JSONDecodeError, ValueError, TypeError):
            return {}
        if not isinstance(raw, dict):
            return {}

        parsed: dict[str, float] = {}
        for name in self._weights:
            val = raw.get(name)
            if val is None:
                continue
            try:
                score = float(val)
            except (ValueError, TypeError, OverflowError):
                logger.debug(
                    "Ignoring non-numeric score for dimension '%s': %r", name, val
                )
                continue
            # NaN compares false against everything, so min/max clamping
            # would silently turn it into 1.0; drop it instead.
            if math.isnan(score):
                logger.debug("Ignoring NaN score for dimension '%s'", name)
                continue
            parsed[name] = max(0.0, min(1.0, score))
        return parsed