"""Unit tests for the evolution loop — with full mocking."""

from __future__ import annotations

from unittest.mock import MagicMock, patch

from prometheus.application.bootstrap import SyntheticBootstrap
from prometheus.application.evaluator import PromptEvaluator
from prometheus.application.evolution import EvolutionLoop
from prometheus.domain.entities import EvalResult, Prompt, SyntheticExample, Trajectory


class TestEvolutionLoop:
    """Drive ``EvolutionLoop.run`` with scripted evaluation results.

    In every test the evaluator's ``evaluate`` method is replaced by a
    ``MagicMock``, the bootstrap is a ``MagicMock`` specced on
    ``SyntheticBootstrap`` whose ``sample_minibatch`` returns the first five
    pool examples, and the loop's ``_log`` attribute is patched out for the
    duration of the run.
    """

    @staticmethod
    def _scripted_eval(
        scores: list[float],
        feedback: str,
        *,
        with_trajectories: bool = True,
    ) -> EvalResult:
        """Build an ``EvalResult`` carrying ``scores`` and a repeated feedback.

        Args:
            scores: Per-example scores, copied into the result.
            feedback: Feedback string repeated once per score.
            with_trajectories: When true, attach one ``Trajectory`` per score
                (``"input<i>"``, ``"output<i>"``, the score, the feedback and
                the literal ``"prompt"``); when false, attach an empty list.

        Returns:
            A freshly constructed ``EvalResult``; nothing is shared between
            two calls.
        """
        trajectories = []
        if with_trajectories:
            for index, score in enumerate(scores):
                trajectories.append(
                    Trajectory(
                        f"input{index}",
                        f"output{index}",
                        score,
                        feedback,
                        "prompt",
                    )
                )
        return EvalResult(
            scores=list(scores),
            feedbacks=[feedback] * len(scores),
            trajectories=trajectories,
        )

    @staticmethod
    def _run_loop(
        evaluate_mock: MagicMock,
        *,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        llm: MagicMock,
        judge: MagicMock,
        proposer: MagicMock,
        max_iterations: int,
    ):
        """Wire up an ``EvolutionLoop`` around mocks and execute it once.

        Args:
            evaluate_mock: Mock installed as the evaluator's ``evaluate``.
            seed_prompt: Prompt handed to ``loop.run``.
            synthetic_pool: Example pool handed to ``loop.run``; its first
                five items are what the mocked bootstrap samples.
            task_description: Task description handed to ``loop.run``.
            llm: Mock LLM port used to construct the evaluator.
            judge: Mock judge port used to construct the evaluator.
            proposer: Mock proposer port given to the loop.
            max_iterations: Iteration budget given to the loop.

        Returns:
            Whatever ``loop.run`` returns (the tests read ``best_candidate``
            from it).
        """
        evaluator = PromptEvaluator(llm, judge)
        evaluator.evaluate = evaluate_mock

        sampler = MagicMock(spec=SyntheticBootstrap)
        sampler.sample_minibatch.return_value = synthetic_pool[:5]

        loop = EvolutionLoop(
            evaluator=evaluator,
            proposer=proposer,
            bootstrap=sampler,
            max_iterations=max_iterations,
            minibatch_size=5,
        )
        # Logging is patched out so the run has no logging side effects.
        with patch.object(loop, "_log"):
            return loop.run(seed_prompt, synthetic_pool, task_description)

    def test_accepts_improvement(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: MagicMock,
        mock_judge_port: MagicMock,
        mock_proposer_port: MagicMock,
    ) -> None:
        """A higher-scoring proposal should leave a best candidate in place."""
        low_scores = [0.3, 0.4, 0.3, 0.5, 0.2]
        high_scores = [0.8, 0.9, 0.7, 0.8, 0.9]

        # Results are served in this order on successive ``evaluate`` calls;
        # presumably: initial evaluation, current prompt, proposed prompt —
        # verify against EvolutionLoop.
        baseline = self._scripted_eval(low_scores, "bad")
        before = self._scripted_eval(low_scores, "bad")
        after = self._scripted_eval(high_scores, "good", with_trajectories=False)

        state = self._run_loop(
            MagicMock(side_effect=[baseline, before, after]),
            seed_prompt=seed_prompt,
            synthetic_pool=synthetic_pool,
            task_description=task_description,
            llm=mock_llm_port,
            judge=mock_judge_port,
            proposer=mock_proposer_port,
            max_iterations=1,
        )

        # NOTE(review): this only requires a positive best score; it does not
        # pin which prompt ended up as the best candidate — confirm whether a
        # stricter check is wanted.
        assert state.best_candidate is not None
        assert state.best_candidate.best_score > 0

    def test_rejects_regression(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: MagicMock,
        mock_judge_port: MagicMock,
        mock_proposer_port: MagicMock,
    ) -> None:
        """A lower-scoring proposal must not displace the seed prompt."""
        strong_scores = [0.7, 0.8, 0.7, 0.8, 0.9]
        weak_scores = [0.2, 0.1, 0.3, 0.2, 0.1]

        # Same call ordering as in the improvement test, with the final
        # (proposal) result scoring worse than the earlier two.
        baseline = self._scripted_eval(strong_scores, "ok")
        before = self._scripted_eval(strong_scores, "ok")
        after = self._scripted_eval(weak_scores, "bad", with_trajectories=False)

        state = self._run_loop(
            MagicMock(side_effect=[baseline, before, after]),
            seed_prompt=seed_prompt,
            synthetic_pool=synthetic_pool,
            task_description=task_description,
            llm=mock_llm_port,
            judge=mock_judge_port,
            proposer=mock_proposer_port,
            max_iterations=1,
        )

        assert state.best_candidate is not None
        # The best candidate still carries the seed prompt's text.
        assert state.best_candidate.prompt.text == seed_prompt.text

    def test_skips_perfect_scores(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: MagicMock,
        mock_judge_port: MagicMock,
        mock_proposer_port: MagicMock,
    ) -> None:
        """With every score at 1.0 the proposer must never be invoked."""
        flawless = self._scripted_eval([1.0, 1.0, 1.0, 1.0, 1.0], "perfect")

        # ``return_value`` (not ``side_effect``): every evaluation, across all
        # three allowed iterations, yields the same perfect result.
        self._run_loop(
            MagicMock(return_value=flawless),
            seed_prompt=seed_prompt,
            synthetic_pool=synthetic_pool,
            task_description=task_description,
            llm=mock_llm_port,
            judge=mock_judge_port,
            proposer=mock_proposer_port,
            max_iterations=3,
        )

        mock_proposer_port.propose.assert_not_called()