feat: error handling, retry with backoff, and circuit breaker

Add robust error handling to the evolution loop and LLM adapters:
- Retry utility with exponential backoff for transient errors (429, 5xx, timeouts)
- Per-call error isolation in evaluator and judge adapter
- Circuit breaker in EvolutionLoop (trips after N consecutive failures)
- CLI flags: --max-retries, --error-strategy (skip|retry|abort)
- Config fields: max_retries, retry_delay_base, circuit_breaker_threshold, error_strategy
- 16 new unit tests covering all error handling paths

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
FullStackDev
2026-03-29 12:47:55 +00:00
parent f516ca4be6
commit e2d111ce5b
10 changed files with 646 additions and 103 deletions

View File

@@ -5,21 +5,34 @@ Implements the JudgePort via the DSPy OutputJudge module.
"""
from __future__ import annotations
import logging
import dspy
from prometheus.domain.ports import JudgePort
from prometheus.infrastructure.dspy_modules import OutputJudge
from prometheus.infrastructure.retry import retry_with_backoff
logger = logging.getLogger(__name__)
class DSPyJudgeAdapter(JudgePort):
"""Evaluates a batch of (input, output) pairs by calling the Judge for each.
Sequential for MVP. Future: parallelize via dspy.Parallel.
Per-call isolation: a failure on one item returns a zero-score sentinel
instead of crashing the whole batch.
"""
def __init__(
    self,
    lm: dspy.LM,
    max_retries: int = 3,
    retry_delay_base: float = 1.0,
) -> None:
    """Store the language model, build the judge module, and keep retry settings.

    Args:
        lm: Language model that ``judge_batch`` activates through
            ``dspy.context(lm=...)`` while judging.
        max_retries: Passed as ``max_retries`` to ``retry_with_backoff``
            for every individual judge call.
        retry_delay_base: Passed as ``retry_delay_base`` to
            ``retry_with_backoff`` for every individual judge call.
    """
    self._lm = lm
    self._judge = OutputJudge()
    # Retry knobs are only stored here; they are consumed per call in
    # ``_judge_single``.
    self._max_retries = max_retries
    self._retry_delay_base = retry_delay_base
def judge_batch(
self,
@@ -29,10 +42,26 @@ class DSPyJudgeAdapter(JudgePort):
results: list[tuple[float, str]] = []
with dspy.context(lm=self._lm):
for input_text, output_text in pairs:
pred = self._judge(
results.append(self._judge_single(task_description, input_text, output_text))
return results
def _judge_single(
    self,
    task_description: str,
    input_text: str,
    output_text: str,
) -> tuple[float, str]:
    """Judge one (input, output) pair without letting a failure escape.

    The judge module is invoked through ``retry_with_backoff`` using the
    retry settings stored on the instance; the retry policy itself is
    defined by that helper.

    Args:
        task_description: Forwarded to the judge as ``task_description``.
        input_text: Forwarded to the judge as ``input_text``; its first
            40 characters are included in the warning log on failure.
        output_text: Forwarded to the judge as ``output_text``.

    Returns:
        ``(pred.score, pred.feedback)`` from the judge prediction on
        success. If any ``Exception`` is raised (including while reading
        those attributes), the sentinel ``(0.0, "[judge error: <exc>]")``.
    """
    try:
        pred = retry_with_backoff(
            lambda: self._judge(
                task_description=task_description,
                input_text=input_text,
                output_text=output_text,
            ),
            max_retries=self._max_retries,
            retry_delay_base=self._retry_delay_base,
        )
        return (pred.score, pred.feedback)
    except Exception as exc:
        # Deliberately broad: per-call isolation means one failing item
        # yields a zero-score sentinel instead of aborting the whole batch.
        logger.warning("Judge call failed for input '%s': %s", input_text[:40], exc)
        return (0.0, f"[judge error: {exc}]")