"""Integration tests for multi-iteration evolution with mixed accept/reject.""" from __future__ import annotations from unittest.mock import AsyncMock, MagicMock import pytest from prometheus.application.bootstrap import SyntheticBootstrap from prometheus.application.evaluator import PromptEvaluator from prometheus.application.evolution import EvolutionLoop from prometheus.domain.entities import EvalResult, Prompt, SyntheticExample, Trajectory from prometheus.domain.ports import JudgePort, LLMPort, ProposerPort def _make_eval(scores: list[float]) -> EvalResult: return EvalResult( scores=scores, feedbacks=["feedback"] * len(scores), trajectories=[ Trajectory(f"in{i}", f"out{i}", s, "feedback", "prompt") for i, s in enumerate(scores) ], ) class TestMultiIterationEvolution: """Tests for the evolution loop across multiple iterations.""" @pytest.fixture def seed_prompt(self) -> Prompt: return Prompt(text="You are a helpful assistant.") @pytest.fixture def task_description(self) -> str: return "Answer factual questions." @pytest.fixture def synthetic_pool(self) -> list[SyntheticExample]: return [SyntheticExample(input_text=f"input {i}", id=i) for i in range(20)] @pytest.mark.asyncio async def test_mixed_accept_reject( self, seed_prompt: Prompt, task_description: str, synthetic_pool: list[SyntheticExample], ) -> None: """Iteration 1: accept, iteration 2: reject, iteration 3: accept.""" mock_llm = MagicMock(spec=LLMPort) mock_judge = MagicMock(spec=JudgePort) mock_proposer = MagicMock(spec=ProposerPort) evaluator = PromptEvaluator(mock_llm, mock_judge) bootstrap = MagicMock(spec=SyntheticBootstrap) bootstrap.sample_minibatch.return_value = synthetic_pool[:3] # Build eval sequence: initial, then per-iteration (current, new) evals = [ _make_eval([0.3, 0.3, 0.3]), # initial seed eval # Iter 1: accept (old=0.4, new=0.8) _make_eval([0.4, 0.4, 0.4]), _make_eval([0.8, 0.8, 0.8]), # Iter 2: reject (old=0.7, new=0.2) _make_eval([0.7, 0.7, 0.7]), _make_eval([0.2, 0.2, 0.2]), # Iter 3: accept (old=0.5, new=0.9) _make_eval([0.5, 0.5, 0.5]), _make_eval([0.9, 0.9, 0.9]), ] evaluator.evaluate = AsyncMock(side_effect=evals) mock_proposer.propose.side_effect = [ Prompt(text="Better prompt v1"), Prompt(text="Worse prompt v2"), Prompt(text="Best prompt v3"), ] loop = EvolutionLoop( evaluator=evaluator, proposer=mock_proposer, bootstrap=bootstrap, max_iterations=3, minibatch_size=3, ) state = await loop.run(seed_prompt, synthetic_pool, task_description) assert state.iteration == 3 assert state.best_candidate is not None assert state.best_candidate.best_score == pytest.approx(2.7) # 0.9*3 assert len(state.history) == 3 assert state.history[0]["event"] == "accepted" assert state.history[1]["event"] == "rejected" assert state.history[2]["event"] == "accepted" @pytest.mark.asyncio async def test_all_rejected_keeps_seed( self, seed_prompt: Prompt, task_description: str, synthetic_pool: list[SyntheticExample], ) -> None: """When all proposals are rejected, the seed prompt stays as best.""" mock_llm = MagicMock(spec=LLMPort) mock_judge = MagicMock(spec=JudgePort) mock_proposer = MagicMock(spec=ProposerPort) evaluator = PromptEvaluator(mock_llm, mock_judge) bootstrap = MagicMock(spec=SyntheticBootstrap) bootstrap.sample_minibatch.return_value = synthetic_pool[:3] evals = [ _make_eval([0.5, 0.5, 0.5]), # initial ] for _ in range(3): evals.append(_make_eval([0.5, 0.5, 0.5])) # current evals.append(_make_eval([0.1, 0.1, 0.1])) # worse proposal evaluator.evaluate = AsyncMock(side_effect=evals) mock_proposer.propose.side_effect = [ Prompt(text="bad v1"), Prompt(text="bad v2"), Prompt(text="bad v3"), ] loop = EvolutionLoop( evaluator=evaluator, proposer=mock_proposer, bootstrap=bootstrap, max_iterations=3, minibatch_size=3, ) state = await loop.run(seed_prompt, synthetic_pool, task_description) assert state.best_candidate.prompt.text == seed_prompt.text assert state.best_candidate.best_score == pytest.approx(1.5) # 0.5*3 @pytest.mark.asyncio async def test_all_accepted_chain( self, seed_prompt: Prompt, task_description: str, synthetic_pool: list[SyntheticExample], ) -> None: """All iterations accept, forming an improvement chain.""" mock_llm = MagicMock(spec=LLMPort) mock_judge = MagicMock(spec=JudgePort) mock_proposer = MagicMock(spec=ProposerPort) evaluator = PromptEvaluator(mock_llm, mock_judge) bootstrap = MagicMock(spec=SyntheticBootstrap) bootstrap.sample_minibatch.return_value = synthetic_pool[:2] evals = [ _make_eval([0.2, 0.2]), # initial ] for i in range(1, 5): score = 0.2 + i * 0.15 evals.append(_make_eval([score, score])) # current evals.append(_make_eval([score + 0.1, score + 0.1])) # new (accepted) evaluator.evaluate = AsyncMock(side_effect=evals) mock_proposer.propose.side_effect = [ Prompt(text=f"Improved v{i}") for i in range(4) ] loop = EvolutionLoop( evaluator=evaluator, proposer=mock_proposer, bootstrap=bootstrap, max_iterations=4, minibatch_size=2, ) state = await loop.run(seed_prompt, synthetic_pool, task_description) assert len(state.candidates) == 5 # seed + 4 accepted assert all(h["event"] == "accepted" for h in state.history) @pytest.mark.asyncio async def test_error_recovery_continues_loop( self, seed_prompt: Prompt, task_description: str, synthetic_pool: list[SyntheticExample], ) -> None: """When an iteration errors, the loop continues.""" mock_llm = MagicMock(spec=LLMPort) mock_judge = MagicMock(spec=JudgePort) mock_proposer = MagicMock(spec=ProposerPort) evaluator = PromptEvaluator(mock_llm, mock_judge) bootstrap = MagicMock(spec=SyntheticBootstrap) bootstrap.sample_minibatch.return_value = synthetic_pool[:2] # Eval sequence for 3 iterations: # - iter 1: evaluate current → propose → evaluate new (accepted) # - iter 2: evaluate current → propose (ERROR, no new eval) # - iter 3: evaluate current → propose → evaluate new (accepted) evals = [ _make_eval([0.3, 0.3]), # initial _make_eval([0.5, 0.5]), # iter 1 current _make_eval([0.9, 0.9]), # iter 1 new (accepted) _make_eval([0.5, 0.5]), # iter 2 current (proposer errors after this) _make_eval([0.5, 0.5]), # iter 3 current _make_eval([0.8, 0.8]), # iter 3 new (accepted) ] evaluator.evaluate = AsyncMock(side_effect=evals) # Proposer raises on iter 2 mock_proposer.propose.side_effect = [ Prompt(text="good v1"), RuntimeError("LLM timeout"), Prompt(text="good v3"), ] loop = EvolutionLoop( evaluator=evaluator, proposer=mock_proposer, bootstrap=bootstrap, max_iterations=3, minibatch_size=2, ) state = await loop.run(seed_prompt, synthetic_pool, task_description) assert state.iteration == 3 assert state.history[1]["event"] == "error" assert "LLM timeout" in state.history[1]["error"] assert state.history[0]["event"] == "accepted" assert state.history[2]["event"] == "accepted" @pytest.mark.asyncio async def test_perfect_score_skips_proposer( self, seed_prompt: Prompt, task_description: str, synthetic_pool: list[SyntheticExample], ) -> None: """When all scores are perfect, no proposition is made.""" mock_llm = MagicMock(spec=LLMPort) mock_judge = MagicMock(spec=JudgePort) mock_proposer = MagicMock(spec=ProposerPort) evaluator = PromptEvaluator(mock_llm, mock_judge) bootstrap = MagicMock(spec=SyntheticBootstrap) bootstrap.sample_minibatch.return_value = synthetic_pool[:2] perfect_eval = _make_eval([1.0, 1.0]) evaluator.evaluate = AsyncMock(return_value=perfect_eval) loop = EvolutionLoop( evaluator=evaluator, proposer=mock_proposer, bootstrap=bootstrap, max_iterations=5, minibatch_size=2, perfect_score=1.0, ) state = await loop.run(seed_prompt, synthetic_pool, task_description) mock_proposer.propose.assert_not_called() assert all(h["event"] == "skip_perfect" for h in state.history) @pytest.mark.asyncio async def test_llm_call_counting( self, seed_prompt: Prompt, task_description: str, synthetic_pool: list[SyntheticExample], ) -> None: """Verify LLM call counting: 2*N per eval (execute + judge) + 1 per propose.""" mock_llm = MagicMock(spec=LLMPort) mock_judge = MagicMock(spec=JudgePort) mock_proposer = MagicMock(spec=ProposerPort) evaluator = PromptEvaluator(mock_llm, mock_judge) bootstrap = MagicMock(spec=SyntheticBootstrap) bootstrap.sample_minibatch.return_value = synthetic_pool[:3] evals = [_make_eval([0.3, 0.3, 0.3])] # initial for _ in range(2): evals.append(_make_eval([0.4, 0.4, 0.4])) evals.append(_make_eval([0.6, 0.6, 0.6])) evaluator.evaluate = AsyncMock(side_effect=evals) mock_proposer.propose.side_effect = [ Prompt(text="v1"), Prompt(text="v2"), ] loop = EvolutionLoop( evaluator=evaluator, proposer=mock_proposer, bootstrap=bootstrap, max_iterations=2, minibatch_size=3, ) state = await loop.run(seed_prompt, synthetic_pool, task_description) # Initial: 2*3=6, Iter1: 2*3 + 1 + 2*3 = 13, Iter2: same = 13 # Total: 6 + 13 + 13 = 32 assert state.total_llm_calls == 32