Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes 2 integration tests that broke when the codebase went async (DSPyLLMAdapter and full pipeline tests now properly await coroutines). 277 tests pass (260 unit + 17 integration). Co-Authored-By: Paperclip <noreply@paperclip.ing>
146 lines
4.9 KiB
Python
146 lines
4.9 KiB
Python
"""
|
|
Adapter: LLM-as-Judge.
|
|
|
|
Implements the JudgePort via the DSPy OutputJudge module.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
from typing import Any
|
|
|
|
import dspy
|
|
|
|
from prometheus.application.dto import JudgeDimension
|
|
from prometheus.domain.ports import JudgePort
|
|
from prometheus.domain.scoring import weighted_aggregate
|
|
from prometheus.infrastructure.dspy_modules import OutputJudge
|
|
from prometheus.infrastructure.retry import async_retry_with_backoff
|
|
|
|
# Module-level logger (named after this module) used to report judge-call
# failures and dimension-score parsing fallbacks.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class DSPyJudgeAdapter(JudgePort):
    """Evaluates a batch of (input, output) pairs by calling the Judge for each.

    Per-call isolation: a failure on one item returns a zero-score sentinel
    instead of crashing the whole batch.

    Judge calls run in parallel (bounded by *max_concurrency*).

    When *judge_criteria* or *judge_dimensions* are provided, the judge applies
    custom rubrics and/or multi-dimensional scoring with weighted aggregation.

    Attributes:
        call_count: Number of judge invocations made through ``_judge_single``
            that returned a prediction (retries that succeed are counted,
            calls that raise are not).
    """

    def __init__(
        self,
        lm: dspy.LM,
        max_retries: int = 3,
        retry_delay_base: float = 1.0,
        max_concurrency: int = 5,
        judge_criteria: str | None = None,
        judge_dimensions: list[JudgeDimension] | None = None,
    ) -> None:
        """Configure the adapter.

        Args:
            lm: Language model activated via ``dspy.context`` for each judge call.
            max_retries: Forwarded to ``async_retry_with_backoff``.
            retry_delay_base: Forwarded to ``async_retry_with_backoff``.
            max_concurrency: Maximum number of judge calls in flight at once.
            judge_criteria: Optional custom rubric text; ``None`` becomes ``""``.
            judge_dimensions: Optional scoring dimensions; their names are sent
                to the judge and their weights drive the aggregate score.

        Raises:
            ValueError: If *max_concurrency* is less than 1.
        """
        # Semaphore(0) would make every judge call wait forever; fail fast instead.
        if max_concurrency < 1:
            raise ValueError(f"max_concurrency must be >= 1, got {max_concurrency}")

        self._lm = lm
        self._judge = OutputJudge()
        self._max_retries = max_retries
        self._retry_delay_base = retry_delay_base
        # NOTE(review): asyncio primitives bind to a single event loop; confirm the
        # adapter is not reused across separate asyncio.run() invocations.
        self._semaphore = asyncio.Semaphore(max_concurrency)
        self._judge_criteria = judge_criteria or ""
        self._judge_dimensions = judge_dimensions or []
        # Joining / iterating an empty list already yields "" / {}.
        self._dimension_names = ",".join(d.name for d in self._judge_dimensions)
        self._weights: dict[str, float] = {
            d.name: d.weight for d in self._judge_dimensions
        }
        self.call_count: int = 0

    async def judge_batch(
        self,
        task_description: str,
        pairs: list[tuple[str, str]],
    ) -> list[tuple[float, str]]:
        """Judge every ``(input_text, output_text)`` pair concurrently.

        Returns:
            One ``(score, feedback)`` tuple per pair, in the same order as
            *pairs*. A pair whose judge call fails yields ``(0.0, "[judge error: ...]")``.
        """
        tasks = [
            self._judge_single_safe(task_description, input_text, output_text)
            for input_text, output_text in pairs
        ]
        return list(await asyncio.gather(*tasks))

    async def _judge_single_safe(
        self,
        task_description: str,
        input_text: str,
        output_text: str,
    ) -> tuple[float, str]:
        """Judge one pair under the concurrency limit, never raising ``Exception``.

        Any ``Exception`` from the judge call is logged and converted into a
        zero-score sentinel so that one bad item cannot fail the whole batch.
        """
        async with self._semaphore:
            try:
                return await self._judge_single(task_description, input_text, output_text)
            except Exception as exc:
                logger.warning("Judge call failed for input '%s…': %s", input_text[:40], exc)
                return (0.0, f"[judge error: {exc}]")

    async def _judge_single(
        self,
        task_description: str,
        input_text: str,
        output_text: str,
    ) -> tuple[float, str]:
        """Judge one pair, delegating retries to ``async_retry_with_backoff``.

        The blocking judge call runs in a worker thread; aggregation of the
        prediction happens back on the event loop.
        """

        async def _call() -> tuple[float, str]:
            pred = await asyncio.to_thread(
                self._sync_judge, task_description, input_text, output_text,
            )
            # Increment here, on the event-loop thread, rather than inside the
            # worker thread: several worker threads run concurrently and an
            # unsynchronised ``+=`` there can lose updates.
            self.call_count += 1
            return self._aggregate_result(pred)

        return await async_retry_with_backoff(
            _call,
            max_retries=self._max_retries,
            retry_delay_base=self._retry_delay_base,
        )

    def _sync_judge(self, task_description: str, input_text: str, output_text: str) -> Any:
        """Run the judge module synchronously with ``self._lm`` as the active LM.

        Called from a worker thread by ``_judge_single``; it must not mutate
        shared adapter state.
        """
        with dspy.context(lm=self._lm):
            return self._judge(
                task_description=task_description,
                input_text=input_text,
                output_text=output_text,
                judge_criteria=self._judge_criteria,
                dimension_names=self._dimension_names,
            )

    def _aggregate_result(self, pred: Any) -> tuple[float, str]:
        """Compute weighted aggregate score from dimension scores if available.

        Without configured dimensions, or when no usable dimension score can
        be parsed, the prediction's own ``score`` and ``feedback`` are returned
        unchanged. Otherwise the score is ``weighted_aggregate`` over the parsed
        dimensions and the feedback is suffixed with a per-dimension breakdown.
        """
        if not self._judge_dimensions:
            return (pred.score, pred.feedback)

        dim_scores = self._parse_dimension_scores(pred.dimension_scores)
        if not dim_scores:
            return (pred.score, pred.feedback)

        aggregate = weighted_aggregate(dim_scores, self._weights)
        # Enrich feedback with per-dimension breakdown (missing dimensions show 0.00).
        dim_breakdown = ", ".join(
            f"{name}={dim_scores.get(name, 0.0):.2f}"
            for name in self._weights
        )
        feedback = f"{pred.feedback} [{dim_breakdown}]"
        return (aggregate, feedback)

    def _parse_dimension_scores(self, raw_scores: Any) -> dict[str, float]:
        """Parse the judge's JSON dimension scores into ``{name: score in [0, 1]}``.

        Only configured dimension names are read. Each value is validated on
        its own, so one malformed entry does not discard the other dimensions.
        Returns an empty dict when the payload is not a JSON object.
        """
        try:
            raw = json.loads(raw_scores)
        except (json.JSONDecodeError, ValueError, TypeError):
            logger.debug("Failed to parse dimension_scores, falling back to overall score")
            return {}
        if not isinstance(raw, dict):
            return {}

        parsed: dict[str, float] = {}
        for name in self._weights:
            val = raw.get(name)
            if val is None:
                continue
            try:
                score = float(val)
            except (ValueError, TypeError, OverflowError):
                logger.debug("Ignoring non-numeric score %r for dimension '%s'", val, name)
                continue
            # NaN compares unequal to itself; clamping it would silently turn it
            # into a perfect 1.0, so skip it instead.
            if score != score:
                logger.debug("Ignoring NaN score for dimension '%s'", name)
                continue
            parsed[name] = max(0.0, min(1.0, score))
        return parsed
|