"""Unit tests for the evolution loop — with full mocking."""

from __future__ import annotations

from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from prometheus.application.bootstrap import SyntheticBootstrap
from prometheus.application.evaluator import PromptEvaluator
from prometheus.application.evolution import EvolutionLoop
from prometheus.domain.entities import EvalResult, Prompt, SyntheticExample, Trajectory

# Number of examples per minibatch; every canned EvalResult below carries
# exactly this many scores so the mocked data stays mutually consistent.
_MINIBATCH_SIZE = 5


def _make_eval(
    scores: list[float],
    feedback: str,
    *,
    with_trajectories: bool = True,
) -> EvalResult:
    """Build a canned ``EvalResult`` where every example shares one feedback.

    Args:
        scores: Per-example scores; copied so each result owns its own list.
        feedback: Feedback string repeated once per score.
        with_trajectories: When true, attach one ``Trajectory`` per score built
            from ``input{i}`` / ``output{i}`` placeholders; when false, the
            result carries an empty trajectory list.

    Returns:
        A freshly constructed ``EvalResult``.
    """
    trajectories = (
        [
            Trajectory(f"input{i}", f"output{i}", score, feedback, "prompt")
            for i, score in enumerate(scores)
        ]
        if with_trajectories
        else []
    )
    return EvalResult(
        scores=list(scores),
        feedbacks=[feedback] * len(scores),
        trajectories=trajectories,
    )


def _make_bootstrap(pool: list[SyntheticExample]) -> MagicMock:
    """Return a ``SyntheticBootstrap`` mock that always samples the pool's head."""
    bootstrap = MagicMock(spec=SyntheticBootstrap)
    bootstrap.sample_minibatch.return_value = pool[:_MINIBATCH_SIZE]
    return bootstrap


def _make_evaluator(
    llm_port: AsyncMock,
    judge_port: AsyncMock,
    **evaluate_mock_kwargs: object,
) -> PromptEvaluator:
    """Return a real ``PromptEvaluator`` whose ``evaluate`` is an ``AsyncMock``.

    ``evaluate_mock_kwargs`` are forwarded to ``AsyncMock`` (for example
    ``side_effect=[...]`` or ``return_value=...``) to script the results.
    """
    evaluator = PromptEvaluator(llm_port, judge_port)
    evaluator.evaluate = AsyncMock(**evaluate_mock_kwargs)
    return evaluator


def _make_loop(
    evaluator: PromptEvaluator,
    proposer: AsyncMock,
    bootstrap: MagicMock,
    *,
    max_iterations: int,
) -> EvolutionLoop:
    """Assemble an ``EvolutionLoop`` wired to the given test doubles."""
    return EvolutionLoop(
        evaluator=evaluator,
        proposer=proposer,
        bootstrap=bootstrap,
        max_iterations=max_iterations,
        minibatch_size=_MINIBATCH_SIZE,
    )


class TestEvolutionLoop:
    """Behavioural tests for ``EvolutionLoop.run`` using scripted evaluations."""

    @pytest.mark.asyncio
    async def test_accepts_improvement(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: AsyncMock,
        mock_judge_port: AsyncMock,
        mock_proposer_port: AsyncMock,
    ) -> None:
        """When the new prompt improves the score, the best candidate is updated."""
        low_scores = [0.3, 0.4, 0.3, 0.5, 0.2]
        # Successive evaluate() calls yield these results in order; the two
        # low-score results are distinct but equal objects.
        initial_eval = _make_eval(low_scores, "bad")
        old_eval = _make_eval(low_scores, "bad")
        new_eval = _make_eval(
            [0.8, 0.9, 0.7, 0.8, 0.9], "good", with_trajectories=False
        )

        evaluator = _make_evaluator(
            mock_llm_port,
            mock_judge_port,
            side_effect=[initial_eval, old_eval, new_eval],
        )
        loop = _make_loop(
            evaluator,
            mock_proposer_port,
            _make_bootstrap(synthetic_pool),
            max_iterations=1,
        )

        # Silence the loop's logging hook for the duration of the run.
        with patch.object(loop, "_log"):
            state = await loop.run(seed_prompt, synthetic_pool, task_description)

        assert state.best_candidate is not None
        # NOTE(review): this only checks that a positive score was recorded; it
        # does not by itself prove the candidate was replaced — consider a
        # stricter assertion once the semantics of best_score are confirmed.
        assert state.best_candidate.best_score > 0

    @pytest.mark.asyncio
    async def test_rejects_regression(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: AsyncMock,
        mock_judge_port: AsyncMock,
        mock_proposer_port: AsyncMock,
    ) -> None:
        """When the new prompt degrades the score, the best candidate stays unchanged."""
        high_scores = [0.7, 0.8, 0.7, 0.8, 0.9]
        # Successive evaluate() calls yield these results in order.
        initial_eval = _make_eval(high_scores, "ok")
        old_eval = _make_eval(high_scores, "ok")
        new_eval = _make_eval(
            [0.2, 0.1, 0.3, 0.2, 0.1], "bad", with_trajectories=False
        )

        evaluator = _make_evaluator(
            mock_llm_port,
            mock_judge_port,
            side_effect=[initial_eval, old_eval, new_eval],
        )
        loop = _make_loop(
            evaluator,
            mock_proposer_port,
            _make_bootstrap(synthetic_pool),
            max_iterations=1,
        )

        # Silence the loop's logging hook for the duration of the run.
        with patch.object(loop, "_log"):
            state = await loop.run(seed_prompt, synthetic_pool, task_description)

        assert state.best_candidate is not None
        assert state.best_candidate.prompt.text == seed_prompt.text

    @pytest.mark.asyncio
    async def test_skips_perfect_scores(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: AsyncMock,
        mock_judge_port: AsyncMock,
        mock_proposer_port: AsyncMock,
    ) -> None:
        """When all scores are perfect, no proposition is made."""
        perfect_eval = _make_eval([1.0] * _MINIBATCH_SIZE, "perfect")

        # return_value (not side_effect): every evaluate() call gets the same
        # perfect result, however many iterations the loop performs.
        evaluator = _make_evaluator(
            mock_llm_port, mock_judge_port, return_value=perfect_eval
        )
        loop = _make_loop(
            evaluator,
            mock_proposer_port,
            _make_bootstrap(synthetic_pool),
            max_iterations=3,
        )

        # Silence the loop's logging hook for the duration of the run.
        with patch.object(loop, "_log"):
            await loop.run(seed_prompt, synthetic_pool, task_description)

        mock_proposer_port.propose.assert_not_called()