"""Unit tests for the evolution loop — with full mocking.""" from __future__ import annotations from unittest.mock import AsyncMock, MagicMock, patch import pytest from prometheus.application.bootstrap import SyntheticBootstrap from prometheus.application.evaluator import PromptEvaluator from prometheus.application.evolution import EvolutionLoop from prometheus.domain.entities import ( Candidate, EvalResult, Prompt, SyntheticExample, Trajectory, ) def _make_eval(scores: list[float], label: str = "ok") -> EvalResult: """Helper to build an EvalResult from a list of scores.""" return EvalResult( scores=scores, feedbacks=[label] * len(scores), trajectories=[ Trajectory(f"input{i}", f"output{i}", s, label, "prompt") for i, s in enumerate(scores) ], ) class TestEvolutionLoop: """Tests for the original single-candidate hill-climbing mode (population_size=1).""" @pytest.mark.asyncio async def test_accepts_improvement( self, seed_prompt: Prompt, synthetic_pool: list[SyntheticExample], task_description: str, mock_llm_port: AsyncMock, mock_judge_port: AsyncMock, mock_proposer_port: AsyncMock, ) -> None: """When the new prompt improves the score, the best candidate is updated.""" evaluator = PromptEvaluator(mock_llm_port, mock_judge_port) bootstrap = MagicMock(spec=SyntheticBootstrap) bootstrap.sample_minibatch.return_value = synthetic_pool[:5] low_eval = _make_eval([0.3, 0.4, 0.3, 0.5, 0.2], "bad") high_eval = _make_eval([0.8, 0.9, 0.7, 0.8, 0.9], "good") evaluator.evaluate = AsyncMock(side_effect=[low_eval, low_eval, high_eval]) loop = EvolutionLoop( evaluator=evaluator, proposer=mock_proposer_port, bootstrap=bootstrap, max_iterations=1, minibatch_size=5, ) state = await loop.run(seed_prompt, synthetic_pool, task_description) assert state.best_candidate is not None assert state.best_candidate.best_score > 0 @pytest.mark.asyncio async def test_rejects_regression( self, seed_prompt: Prompt, synthetic_pool: list[SyntheticExample], task_description: str, mock_llm_port: AsyncMock, mock_judge_port: AsyncMock, mock_proposer_port: AsyncMock, ) -> None: """When the new prompt degrades the score, the best candidate stays unchanged.""" evaluator = PromptEvaluator(mock_llm_port, mock_judge_port) bootstrap = MagicMock(spec=SyntheticBootstrap) bootstrap.sample_minibatch.return_value = synthetic_pool[:5] high_eval = _make_eval([0.7, 0.8, 0.7, 0.8, 0.9], "ok") low_eval = _make_eval([0.2, 0.1, 0.3, 0.2, 0.1], "bad") evaluator.evaluate = AsyncMock(side_effect=[high_eval, high_eval, low_eval]) loop = EvolutionLoop( evaluator=evaluator, proposer=mock_proposer_port, bootstrap=bootstrap, max_iterations=1, minibatch_size=5, ) state = await loop.run(seed_prompt, synthetic_pool, task_description) assert state.best_candidate is not None assert state.best_candidate.prompt.text == seed_prompt.text @pytest.mark.asyncio async def test_skips_perfect_scores( self, seed_prompt: Prompt, synthetic_pool: list[SyntheticExample], task_description: str, mock_llm_port: AsyncMock, mock_judge_port: AsyncMock, mock_proposer_port: AsyncMock, ) -> None: """When all scores are perfect, no proposition is made.""" evaluator = PromptEvaluator(mock_llm_port, mock_judge_port) bootstrap = MagicMock(spec=SyntheticBootstrap) bootstrap.sample_minibatch.return_value = synthetic_pool[:5] perfect_eval = _make_eval([1.0, 1.0, 1.0, 1.0, 1.0], "perfect") evaluator.evaluate = AsyncMock(return_value=perfect_eval) loop = EvolutionLoop( evaluator=evaluator, proposer=mock_proposer_port, bootstrap=bootstrap, max_iterations=3, minibatch_size=5, ) await loop.run(seed_prompt, synthetic_pool, task_description) mock_proposer_port.propose.assert_not_called() class TestPopulationEvolution: """Tests for population-based evolution (population_size > 1).""" @pytest.mark.asyncio async def test_population_initialization( self, seed_prompt: Prompt, synthetic_pool: list[SyntheticExample], task_description: str, mock_llm_port: AsyncMock, mock_judge_port: AsyncMock, mock_proposer_port: AsyncMock, mock_mutation_port: AsyncMock, ) -> None: """Population is initialized with the right number of candidates.""" evaluator = PromptEvaluator(mock_llm_port, mock_judge_port) evaluator.evaluate = AsyncMock( return_value=_make_eval([0.5] * 5, "ok") ) bootstrap = MagicMock(spec=SyntheticBootstrap) bootstrap.sample_minibatch.return_value = synthetic_pool[:5] loop = EvolutionLoop( evaluator=evaluator, proposer=mock_proposer_port, bootstrap=bootstrap, max_iterations=0, # no iterations, just initialization minibatch_size=5, population_size=4, mutation_port=mock_mutation_port, ) state = await loop.run(seed_prompt, synthetic_pool, task_description) # 1 seed + 3 mutations = 4 candidates assert len(state.candidates) == 4 assert mock_mutation_port.mutate.call_count == 3 @pytest.mark.asyncio async def test_population_initialization_uses_proposer_fallback( self, seed_prompt: Prompt, synthetic_pool: list[SyntheticExample], task_description: str, mock_llm_port: AsyncMock, mock_judge_port: AsyncMock, mock_proposer_port: AsyncMock, ) -> None: """When no mutation_port is provided, population init falls back to proposer.""" evaluator = PromptEvaluator(mock_llm_port, mock_judge_port) evaluator.evaluate = AsyncMock( return_value=_make_eval([0.5] * 5, "ok") ) bootstrap = MagicMock(spec=SyntheticBootstrap) bootstrap.sample_minibatch.return_value = synthetic_pool[:5] loop = EvolutionLoop( evaluator=evaluator, proposer=mock_proposer_port, bootstrap=bootstrap, max_iterations=0, minibatch_size=5, population_size=3, # mutation_port intentionally omitted ) state = await loop.run(seed_prompt, synthetic_pool, task_description) assert len(state.candidates) == 3 assert mock_proposer_port.propose.call_count == 2 # 3-1 = 2 init mutations @pytest.mark.asyncio async def test_population_iteration_replaces_worst( self, seed_prompt: Prompt, synthetic_pool: list[SyntheticExample], task_description: str, mock_llm_port: AsyncMock, mock_judge_port: AsyncMock, mock_proposer_port: AsyncMock, mock_crossover_port: AsyncMock, mock_mutation_port: AsyncMock, ) -> None: """Crossover child replaces worst candidate when its fitness is higher.""" evaluator = PromptEvaluator(mock_llm_port, mock_judge_port) bootstrap = MagicMock(spec=SyntheticBootstrap) bootstrap.sample_minibatch.return_value = synthetic_pool[:5] # Sequence: # 1. Initial eval (seed) # 2. Population init: 3 mutation calls use proposer.propose(), NOT evaluator.evaluate # 3. Population iteration: crossover produces child → eval child # Only 2 evaluator.evaluate calls total seed_eval = _make_eval([0.5] * 5, "ok") # Crossover child eval - high score to beat worst child_eval = _make_eval([0.9, 0.9, 0.8, 0.9, 0.8], "great") all_evals = [seed_eval, child_eval] evaluator.evaluate = AsyncMock(side_effect=all_evals) loop = EvolutionLoop( evaluator=evaluator, proposer=mock_proposer_port, bootstrap=bootstrap, max_iterations=1, minibatch_size=5, population_size=4, crossover_rate=1.0, crossover_port=mock_crossover_port, mutation_rate=0.0, # disable post-crossover mutation for determinism ) state = await loop.run(seed_prompt, synthetic_pool, task_description) accepted_events = [h for h in state.history if h.get("event") == "pop_accepted"] assert len(accepted_events) >= 1 @pytest.mark.asyncio async def test_population_iteration_rejects_inferior_child( self, seed_prompt: Prompt, synthetic_pool: list[SyntheticExample], task_description: str, mock_llm_port: AsyncMock, mock_judge_port: AsyncMock, mock_proposer_port: AsyncMock, mock_crossover_port: AsyncMock, ) -> None: """Inferior child is rejected and doesn't replace any candidate.""" evaluator = PromptEvaluator(mock_llm_port, mock_judge_port) bootstrap = MagicMock(spec=SyntheticBootstrap) bootstrap.sample_minibatch.return_value = synthetic_pool[:5] seed_eval = _make_eval([0.8] * 5, "ok") # Crossover produces very LOW-scoring child child_eval = _make_eval([0.1] * 5, "terrible") all_evals = [seed_eval, child_eval] evaluator.evaluate = AsyncMock(side_effect=all_evals) loop = EvolutionLoop( evaluator=evaluator, proposer=mock_proposer_port, bootstrap=bootstrap, max_iterations=1, minibatch_size=5, population_size=4, crossover_rate=1.0, crossover_port=mock_crossover_port, mutation_rate=0.0, ) state = await loop.run(seed_prompt, synthetic_pool, task_description) rejected_events = [h for h in state.history if h.get("event") == "pop_rejected"] assert len(rejected_events) >= 1 class TestDiversityScore: """Tests for the diversity/similarity scoring logic.""" def test_identical_prompts_have_high_similarity(self) -> None: """Identical prompts should have very high similarity.""" identical = Prompt(text="You are a helpful assistant. Answer the question.") pop_a = Candidate(prompt=identical, best_score=4.0, generation=0) pop_b = Candidate( prompt=Prompt(text="Completely different prompt about data analysis."), best_score=3.0, generation=0, ) sim_same = EvolutionLoop._compute_diversity_score(identical, [pop_a, pop_b]) # Average includes similarity to the different member, so ~0.5 not 0.9+ assert sim_same > 0.3 def test_different_prompts_have_lower_similarity(self) -> None: """Different prompts should have lower similarity than identical ones.""" prompt_a = Prompt(text="You are a helpful assistant. Answer the question.") prompt_b = Prompt(text="Provide detailed analysis of complex data patterns with precision.") pop_a = Candidate(prompt=prompt_a, best_score=4.0, generation=0) pop_b = Candidate(prompt=prompt_b, best_score=3.0, generation=0) sim_a = EvolutionLoop._compute_diversity_score(prompt_a, [pop_a, pop_b]) sim_b = EvolutionLoop._compute_diversity_score(prompt_b, [pop_a, pop_b]) # Both should be < 1.0 since they're different assert sim_a < 1.0 assert sim_b < 1.0 def test_single_member_population_returns_1(self) -> None: """Single-member population always returns 1.0 (no penalty).""" prompt = Prompt(text="Any prompt text here.") pop = [Candidate(prompt=prompt, best_score=1.0, generation=0)] sim = EvolutionLoop._compute_diversity_score(prompt, pop) assert sim == 1.0 def test_empty_prompt_returns_zero(self) -> None: """Empty prompt text returns 0.0 when population has >1 member.""" prompt = Prompt(text="") pop = [ Candidate(prompt=Prompt(text="some text"), best_score=1.0, generation=0), Candidate(prompt=Prompt(text="other text"), best_score=2.0, generation=0), ] sim = EvolutionLoop._compute_diversity_score(prompt, pop) assert sim == 0.0 class TestPromptDiff: """Tests for the static _compute_prompt_diff helper.""" def test_identical_prompts(self) -> None: result = EvolutionLoop._compute_prompt_diff("hello\nworld", "hello\nworld") assert result["lines_added"] == 0 assert result["lines_removed"] == 0 assert result["chars_delta"] == 0 def test_added_lines(self) -> None: result = EvolutionLoop._compute_prompt_diff("hello", "hello\nworld") assert result["lines_added"] == 1 assert result["lines_removed"] == 0 assert result["chars_delta"] == 6 # "\nworld" def test_removed_lines(self) -> None: result = EvolutionLoop._compute_prompt_diff("hello\nworld", "hello") assert result["lines_added"] == 0 assert result["lines_removed"] == 1