""" Adapter: LLM-as-Judge. Implements the JudgePort via the DSPy OutputJudge module. """ from __future__ import annotations import asyncio import json import logging from typing import Any import dspy from prometheus.application.dto import JudgeDimension from prometheus.domain.ports import JudgePort from prometheus.domain.scoring import weighted_aggregate from prometheus.infrastructure.dspy_modules import OutputJudge from prometheus.infrastructure.retry import async_retry_with_backoff logger = logging.getLogger(__name__) class DSPyJudgeAdapter(JudgePort): """Evaluates a batch of (input, output) pairs by calling the Judge for each. Per-call isolation: a failure on one item returns a zero-score sentinel instead of crashing the whole batch. Judge calls run in parallel (bounded by *max_concurrency*). When *judge_criteria* or *judge_dimensions* are provided, the judge applies custom rubrics and/or multi-dimensional scoring with weighted aggregation. """ def __init__( self, lm: dspy.LM, max_retries: int = 3, retry_delay_base: float = 1.0, max_concurrency: int = 5, judge_criteria: str | None = None, judge_dimensions: list[JudgeDimension] | None = None, ) -> None: self._lm = lm self._judge = OutputJudge() self._max_retries = max_retries self._retry_delay_base = retry_delay_base self._semaphore = asyncio.Semaphore(max_concurrency) self._judge_criteria = judge_criteria or "" self._judge_dimensions = judge_dimensions or [] self._dimension_names = ( ",".join(d.name for d in self._judge_dimensions) if self._judge_dimensions else "" ) self._weights: dict[str, float] = ( {d.name: d.weight for d in self._judge_dimensions} if self._judge_dimensions else {} ) self.call_count: int = 0 async def judge_batch( self, task_description: str, pairs: list[tuple[str, str]], ) -> list[tuple[float, str]]: tasks = [ self._judge_single_safe(task_description, input_text, output_text) for input_text, output_text in pairs ] return list(await asyncio.gather(*tasks)) async def _judge_single_safe( self, task_description: str, input_text: str, output_text: str, ) -> tuple[float, str]: async with self._semaphore: try: return await self._judge_single(task_description, input_text, output_text) except Exception as exc: logger.warning("Judge call failed for input '%s…': %s", input_text[:40], exc) return (0.0, f"[judge error: {exc}]") async def _judge_single( self, task_description: str, input_text: str, output_text: str, ) -> tuple[float, str]: async def _call() -> tuple[float, str]: pred = await asyncio.to_thread( self._sync_judge, task_description, input_text, output_text, ) return self._aggregate_result(pred) return await async_retry_with_backoff( _call, max_retries=self._max_retries, retry_delay_base=self._retry_delay_base, ) def _sync_judge(self, task_description: str, input_text: str, output_text: str): with dspy.context(lm=self._lm): result = self._judge( task_description=task_description, input_text=input_text, output_text=output_text, judge_criteria=self._judge_criteria, dimension_names=self._dimension_names, ) self.call_count += 1 return result def _aggregate_result(self, pred: Any) -> tuple[float, str]: """Compute weighted aggregate score from dimension scores if available.""" if not self._judge_dimensions: return (pred.score, pred.feedback) # Parse per-dimension scores from LLM output dim_scores: dict[str, float] = {} try: raw = json.loads(pred.dimension_scores) if isinstance(raw, dict): for name in self._weights: val = raw.get(name) if val is not None: dim_scores[name] = max(0.0, min(1.0, float(val))) except (json.JSONDecodeError, ValueError, TypeError): logger.debug("Failed to parse dimension_scores, falling back to overall score") if not dim_scores: return (pred.score, pred.feedback) aggregate = weighted_aggregate(dim_scores, self._weights) # Enrich feedback with per-dimension breakdown dim_breakdown = ", ".join( f"{name}={dim_scores.get(name, 0.0):.2f}" for name in self._weights ) feedback = f"{pred.feedback} [{dim_breakdown}]" return (aggregate, feedback)