feat: error handling, retry with backoff, and circuit breaker
Add robust error handling to the evolution loop and LLM adapters:
- Retry utility with exponential backoff for transient errors (429, 5xx, timeouts)
- Per-call error isolation in evaluator and judge adapter
- Circuit breaker in EvolutionLoop (trips after N consecutive failures)
- CLI flags: --max-retries, --error-strategy (skip|retry|abort)
- Config fields: max_retries, retry_delay_base, circuit_breaker_threshold, error_strategy
- 16 new unit tests covering all error handling paths

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -5,21 +5,34 @@ Implements the JudgePort via the DSPy OutputJudge module.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
import dspy
|
||||
|
||||
from prometheus.domain.ports import JudgePort
|
||||
from prometheus.infrastructure.dspy_modules import OutputJudge
|
||||
from prometheus.infrastructure.retry import retry_with_backoff
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DSPyJudgeAdapter(JudgePort):
|
||||
"""Evaluates a batch of (input, output) pairs by calling the Judge for each.
|
||||
|
||||
Sequential for MVP. Future: parallelize via dspy.Parallel.
|
||||
Per-call isolation: a failure on one item returns a zero-score sentinel
|
||||
instead of crashing the whole batch.
|
||||
"""
|
||||
|
||||
def __init__(self, lm: dspy.LM) -> None:
|
||||
def __init__(
|
||||
self,
|
||||
lm: dspy.LM,
|
||||
max_retries: int = 3,
|
||||
retry_delay_base: float = 1.0,
|
||||
) -> None:
|
||||
self._lm = lm
|
||||
self._judge = OutputJudge()
|
||||
self._max_retries = max_retries
|
||||
self._retry_delay_base = retry_delay_base
|
||||
|
||||
def judge_batch(
|
||||
self,
|
||||
@@ -29,10 +42,26 @@ class DSPyJudgeAdapter(JudgePort):
|
||||
results: list[tuple[float, str]] = []
|
||||
with dspy.context(lm=self._lm):
|
||||
for input_text, output_text in pairs:
|
||||
pred = self._judge(
|
||||
results.append(self._judge_single(task_description, input_text, output_text))
|
||||
return results
|
||||
|
||||
def _judge_single(
|
||||
self,
|
||||
task_description: str,
|
||||
input_text: str,
|
||||
output_text: str,
|
||||
) -> tuple[float, str]:
|
||||
try:
|
||||
pred = retry_with_backoff(
|
||||
lambda: self._judge(
|
||||
task_description=task_description,
|
||||
input_text=input_text,
|
||||
output_text=output_text,
|
||||
)
|
||||
results.append((pred.score, pred.feedback))
|
||||
return results
|
||||
),
|
||||
max_retries=self._max_retries,
|
||||
retry_delay_base=self._retry_delay_base,
|
||||
)
|
||||
return (pred.score, pred.feedback)
|
||||
except Exception as exc:
|
||||
logger.warning("Judge call failed for input '%s…': %s", input_text[:40], exc)
|
||||
return (0.0, f"[judge error: {exc}]")
|
||||
|
||||
Reference in New Issue
Block a user