Files
Prompt-optimizer/tests/integration/test_evolution_integration.py
FullStackDev a5bf2ad59c feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage
Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes
2 integration tests that broke when the codebase went async (DSPyLLMAdapter
and full pipeline tests now properly await coroutines).

277 tests pass (260 unit + 17 integration).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-29 19:13:50 +00:00

301 lines
11 KiB
Python

"""Integration tests for multi-iteration evolution with mixed accept/reject."""
from __future__ import annotations
from unittest.mock import AsyncMock, MagicMock
import pytest
from prometheus.application.bootstrap import SyntheticBootstrap
from prometheus.application.evaluator import PromptEvaluator
from prometheus.application.evolution import EvolutionLoop
from prometheus.domain.entities import EvalResult, Prompt, SyntheticExample, Trajectory
from prometheus.domain.ports import JudgePort, LLMPort, ProposerPort
def _make_eval(scores: list[float]) -> EvalResult:
    """Build a canned ``EvalResult`` from per-example scores.

    Each score gets the placeholder feedback ``"feedback"`` and one
    ``Trajectory`` whose first two positional arguments are ``in<i>`` and
    ``out<i>`` for the example's index ``i``.

    Args:
        scores: One score per example in the simulated minibatch.

    Returns:
        An ``EvalResult`` whose ``scores``, ``feedbacks`` and ``trajectories``
        all have ``len(scores)`` entries.
    """
    example_count = len(scores)
    trajectories = []
    for index, score in enumerate(scores):
        # Positional order is kept as-is; the field names live in
        # prometheus.domain.entities and are not visible from this module.
        trajectory = Trajectory(
            f"in{index}", f"out{index}", score, "feedback", "prompt"
        )
        trajectories.append(trajectory)
    return EvalResult(
        scores=scores,
        feedbacks=["feedback"] * example_count,
        trajectories=trajectories,
    )
class TestMultiIterationEvolution:
    """Tests for the evolution loop across multiple iterations.

    Every test drives ``EvolutionLoop.run`` with mocked collaborators:
    ``PromptEvaluator.evaluate`` is replaced by an ``AsyncMock`` that yields
    canned ``EvalResult`` objects, the proposer follows a scripted sequence,
    and the bootstrap always hands back the same minibatch.
    """

    @pytest.fixture
    def seed_prompt(self) -> Prompt:
        """Return the starting prompt passed to the loop."""
        return Prompt(text="You are a helpful assistant.")

    @pytest.fixture
    def task_description(self) -> str:
        """Return the task description passed to the loop."""
        return "Answer factual questions."

    @pytest.fixture
    def synthetic_pool(self) -> list[SyntheticExample]:
        """Return a pool of 20 synthetic examples with ids 0..19."""
        return [SyntheticExample(input_text=f"input {i}", id=i) for i in range(20)]

    @staticmethod
    def _build_loop(
        evaluate: AsyncMock,
        minibatch: list[SyntheticExample],
        max_iterations: int,
        proposals: list[Prompt | Exception] | None = None,
        **loop_kwargs: object,
    ) -> tuple[EvolutionLoop, MagicMock]:
        """Wire an ``EvolutionLoop`` around mocked ports.

        Args:
            evaluate: Mock installed as ``evaluator.evaluate``; its
                ``side_effect``/``return_value`` scripts the eval results.
            minibatch: Examples the mocked bootstrap returns from
                ``sample_minibatch``; ``minibatch_size`` is set to its length.
            max_iterations: Iteration budget handed to the loop.
            proposals: Scripted ``propose`` outcomes, in call order. An
                exception instance in the list is raised on that call.
                ``None`` leaves the proposer mock unscripted.
            **loop_kwargs: Extra keyword arguments forwarded to
                ``EvolutionLoop`` (e.g. ``perfect_score``).

        Returns:
            The configured loop and the proposer mock, so tests can assert on
            proposer calls.
        """
        evaluator = PromptEvaluator(MagicMock(spec=LLMPort), MagicMock(spec=JudgePort))
        # Replace the real evaluation so no LLM/judge call is ever attempted.
        evaluator.evaluate = evaluate

        proposer = MagicMock(spec=ProposerPort)
        if proposals is not None:
            proposer.propose.side_effect = proposals

        bootstrap = MagicMock(spec=SyntheticBootstrap)
        bootstrap.sample_minibatch.return_value = minibatch

        loop = EvolutionLoop(
            evaluator=evaluator,
            proposer=proposer,
            bootstrap=bootstrap,
            max_iterations=max_iterations,
            minibatch_size=len(minibatch),
            **loop_kwargs,
        )
        return loop, proposer

    @pytest.mark.asyncio
    async def test_mixed_accept_reject(
        self,
        seed_prompt: Prompt,
        task_description: str,
        synthetic_pool: list[SyntheticExample],
    ) -> None:
        """Iteration 1: accept, iteration 2: reject, iteration 3: accept."""
        # Build eval sequence: initial, then per-iteration (current, new)
        evals = [
            _make_eval([0.3, 0.3, 0.3]),  # initial seed eval
            # Iter 1: accept (old=0.4, new=0.8)
            _make_eval([0.4, 0.4, 0.4]),
            _make_eval([0.8, 0.8, 0.8]),
            # Iter 2: reject (old=0.7, new=0.2)
            _make_eval([0.7, 0.7, 0.7]),
            _make_eval([0.2, 0.2, 0.2]),
            # Iter 3: accept (old=0.5, new=0.9)
            _make_eval([0.5, 0.5, 0.5]),
            _make_eval([0.9, 0.9, 0.9]),
        ]
        loop, _ = self._build_loop(
            evaluate=AsyncMock(side_effect=evals),
            minibatch=synthetic_pool[:3],
            max_iterations=3,
            proposals=[
                Prompt(text="Better prompt v1"),
                Prompt(text="Worse prompt v2"),
                Prompt(text="Best prompt v3"),
            ],
        )

        state = await loop.run(seed_prompt, synthetic_pool, task_description)

        assert state.iteration == 3
        assert state.best_candidate is not None
        assert state.best_candidate.best_score == pytest.approx(2.7)  # 0.9*3
        assert len(state.history) == 3
        assert state.history[0]["event"] == "accepted"
        assert state.history[1]["event"] == "rejected"
        assert state.history[2]["event"] == "accepted"

    @pytest.mark.asyncio
    async def test_all_rejected_keeps_seed(
        self,
        seed_prompt: Prompt,
        task_description: str,
        synthetic_pool: list[SyntheticExample],
    ) -> None:
        """When all proposals are rejected, the seed prompt stays as best."""
        evals = [
            _make_eval([0.5, 0.5, 0.5]),  # initial
        ]
        for _ in range(3):
            evals.append(_make_eval([0.5, 0.5, 0.5]))  # current
            evals.append(_make_eval([0.1, 0.1, 0.1]))  # worse proposal
        loop, _ = self._build_loop(
            evaluate=AsyncMock(side_effect=evals),
            minibatch=synthetic_pool[:3],
            max_iterations=3,
            proposals=[
                Prompt(text="bad v1"),
                Prompt(text="bad v2"),
                Prompt(text="bad v3"),
            ],
        )

        state = await loop.run(seed_prompt, synthetic_pool, task_description)

        # Fail with a clear assertion rather than an AttributeError on None.
        assert state.best_candidate is not None
        assert state.best_candidate.prompt.text == seed_prompt.text
        assert state.best_candidate.best_score == pytest.approx(1.5)  # 0.5*3

    @pytest.mark.asyncio
    async def test_all_accepted_chain(
        self,
        seed_prompt: Prompt,
        task_description: str,
        synthetic_pool: list[SyntheticExample],
    ) -> None:
        """All iterations accept, forming an improvement chain."""
        evals = [
            _make_eval([0.2, 0.2]),  # initial
        ]
        for i in range(1, 5):
            score = 0.2 + i * 0.15
            evals.append(_make_eval([score, score]))  # current
            evals.append(_make_eval([score + 0.1, score + 0.1]))  # new (accepted)
        loop, _ = self._build_loop(
            evaluate=AsyncMock(side_effect=evals),
            minibatch=synthetic_pool[:2],
            max_iterations=4,
            proposals=[Prompt(text=f"Improved v{i}") for i in range(4)],
        )

        state = await loop.run(seed_prompt, synthetic_pool, task_description)

        assert len(state.candidates) == 5  # seed + 4 accepted
        # all() is vacuously true on an empty history, so require entries first.
        assert state.history
        assert all(h["event"] == "accepted" for h in state.history)

    @pytest.mark.asyncio
    async def test_error_recovery_continues_loop(
        self,
        seed_prompt: Prompt,
        task_description: str,
        synthetic_pool: list[SyntheticExample],
    ) -> None:
        """When an iteration errors, the loop continues."""
        # Eval sequence for 3 iterations:
        # - iter 1: evaluate current → propose → evaluate new (accepted)
        # - iter 2: evaluate current → propose (ERROR, no new eval)
        # - iter 3: evaluate current → propose → evaluate new (accepted)
        evals = [
            _make_eval([0.3, 0.3]),  # initial
            _make_eval([0.5, 0.5]),  # iter 1 current
            _make_eval([0.9, 0.9]),  # iter 1 new (accepted)
            _make_eval([0.5, 0.5]),  # iter 2 current (proposer errors after this)
            _make_eval([0.5, 0.5]),  # iter 3 current
            _make_eval([0.8, 0.8]),  # iter 3 new (accepted)
        ]
        loop, _ = self._build_loop(
            evaluate=AsyncMock(side_effect=evals),
            minibatch=synthetic_pool[:2],
            max_iterations=3,
            # Proposer raises on iter 2
            proposals=[
                Prompt(text="good v1"),
                RuntimeError("LLM timeout"),
                Prompt(text="good v3"),
            ],
        )

        state = await loop.run(seed_prompt, synthetic_pool, task_description)

        assert state.iteration == 3
        assert state.history[1]["event"] == "error"
        assert "LLM timeout" in state.history[1]["error"]
        assert state.history[0]["event"] == "accepted"
        assert state.history[2]["event"] == "accepted"

    @pytest.mark.asyncio
    async def test_perfect_score_skips_proposer(
        self,
        seed_prompt: Prompt,
        task_description: str,
        synthetic_pool: list[SyntheticExample],
    ) -> None:
        """When all scores are perfect, no proposition is made."""
        perfect_eval = _make_eval([1.0, 1.0])
        loop, mock_proposer = self._build_loop(
            # return_value (not side_effect): every evaluation is perfect.
            evaluate=AsyncMock(return_value=perfect_eval),
            minibatch=synthetic_pool[:2],
            max_iterations=5,
            perfect_score=1.0,
        )

        state = await loop.run(seed_prompt, synthetic_pool, task_description)

        mock_proposer.propose.assert_not_called()
        # all() is vacuously true on an empty history, so require entries first.
        assert state.history
        assert all(h["event"] == "skip_perfect" for h in state.history)

    @pytest.mark.asyncio
    async def test_llm_call_counting(
        self,
        seed_prompt: Prompt,
        task_description: str,
        synthetic_pool: list[SyntheticExample],
    ) -> None:
        """Verify LLM call counting: 2*N per eval (execute + judge) + 1 per propose."""
        evals = [_make_eval([0.3, 0.3, 0.3])]  # initial
        for _ in range(2):
            evals.append(_make_eval([0.4, 0.4, 0.4]))
            evals.append(_make_eval([0.6, 0.6, 0.6]))
        loop, _ = self._build_loop(
            evaluate=AsyncMock(side_effect=evals),
            minibatch=synthetic_pool[:3],
            max_iterations=2,
            proposals=[
                Prompt(text="v1"),
                Prompt(text="v2"),
            ],
        )

        state = await loop.run(seed_prompt, synthetic_pool, task_description)

        # Initial: 2*3=6, Iter1: 2*3 + 1 + 2*3 = 13, Iter2: same = 13
        # Total: 6 + 13 + 13 = 32
        assert state.total_llm_calls == 32