Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes 2 integration tests that broke when the codebase went async (DSPyLLMAdapter and full pipeline tests now properly await coroutines). 277 tests pass (260 unit + 17 integration). Co-Authored-By: Paperclip <noreply@paperclip.ing>
345 lines
13 KiB
Python
345 lines
13 KiB
Python
"""Unit tests for the evolution loop — with full mocking."""
|
|
from __future__ import annotations
|
|
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from prometheus.application.bootstrap import SyntheticBootstrap
|
|
from prometheus.application.evaluator import PromptEvaluator
|
|
from prometheus.application.evolution import EvolutionLoop
|
|
from prometheus.domain.entities import (
|
|
Candidate,
|
|
EvalResult,
|
|
Prompt,
|
|
SyntheticExample,
|
|
Trajectory,
|
|
)
|
|
|
|
|
|
def _make_eval(scores: list[float], label: str = "ok") -> EvalResult:
    """Assemble an ``EvalResult`` in which every entry carries the same label.

    Args:
        scores: Per-example scores; forwarded unchanged as ``scores``.
        label: Text repeated once per score as feedback and also handed to
            each generated ``Trajectory``.

    Returns:
        An ``EvalResult`` holding one feedback and one ``Trajectory`` per score.
    """
    trajectories = []
    for index, score in enumerate(scores):
        # The last positional argument is the constant string "prompt" for every entry.
        trajectories.append(
            Trajectory(f"input{index}", f"output{index}", score, label, "prompt")
        )
    feedbacks = [label for _ in scores]
    return EvalResult(scores=scores, feedbacks=feedbacks, trajectories=trajectories)
|
|
|
|
|
|
class TestEvolutionLoop:
    """Scenarios for the single-candidate hill-climbing mode.

    Every loop here is built without a ``population_size`` argument, and
    ``PromptEvaluator.evaluate`` is swapped for an ``AsyncMock`` that returns
    scripted results.
    """

    @pytest.mark.asyncio
    async def test_accepts_improvement(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: AsyncMock,
        mock_judge_port: AsyncMock,
        mock_proposer_port: AsyncMock,
    ) -> None:
        """Script two low evaluations then a high one; a scored best candidate must exist."""
        weak_result = _make_eval([0.3, 0.4, 0.3, 0.5, 0.2], "bad")
        strong_result = _make_eval([0.8, 0.9, 0.7, 0.8, 0.9], "good")

        scripted_evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
        scripted_evaluator.evaluate = AsyncMock(
            side_effect=[weak_result, weak_result, strong_result]
        )

        # The sampler always hands back the first five pool entries.
        sampler = MagicMock(spec=SyntheticBootstrap)
        sampler.sample_minibatch.return_value = synthetic_pool[:5]

        hill_climber = EvolutionLoop(
            evaluator=scripted_evaluator,
            proposer=mock_proposer_port,
            bootstrap=sampler,
            max_iterations=1,
            minibatch_size=5,
        )
        final_state = await hill_climber.run(seed_prompt, synthetic_pool, task_description)

        winner = final_state.best_candidate
        assert winner is not None
        assert winner.best_score > 0

    @pytest.mark.asyncio
    async def test_rejects_regression(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: AsyncMock,
        mock_judge_port: AsyncMock,
        mock_proposer_port: AsyncMock,
    ) -> None:
        """Script two high evaluations then a low one; the seed prompt must remain the best."""
        strong_result = _make_eval([0.7, 0.8, 0.7, 0.8, 0.9], "ok")
        weak_result = _make_eval([0.2, 0.1, 0.3, 0.2, 0.1], "bad")

        scripted_evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
        scripted_evaluator.evaluate = AsyncMock(
            side_effect=[strong_result, strong_result, weak_result]
        )

        sampler = MagicMock(spec=SyntheticBootstrap)
        sampler.sample_minibatch.return_value = synthetic_pool[:5]

        hill_climber = EvolutionLoop(
            evaluator=scripted_evaluator,
            proposer=mock_proposer_port,
            bootstrap=sampler,
            max_iterations=1,
            minibatch_size=5,
        )
        final_state = await hill_climber.run(seed_prompt, synthetic_pool, task_description)

        winner = final_state.best_candidate
        assert winner is not None
        assert winner.prompt.text == seed_prompt.text

    @pytest.mark.asyncio
    async def test_skips_perfect_scores(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: AsyncMock,
        mock_judge_port: AsyncMock,
        mock_proposer_port: AsyncMock,
    ) -> None:
        """With every evaluation scoring 1.0, the proposer must never be invoked."""
        flawless_result = _make_eval([1.0, 1.0, 1.0, 1.0, 1.0], "perfect")

        scripted_evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
        scripted_evaluator.evaluate = AsyncMock(return_value=flawless_result)

        sampler = MagicMock(spec=SyntheticBootstrap)
        sampler.sample_minibatch.return_value = synthetic_pool[:5]

        hill_climber = EvolutionLoop(
            evaluator=scripted_evaluator,
            proposer=mock_proposer_port,
            bootstrap=sampler,
            max_iterations=3,
            minibatch_size=5,
        )
        await hill_climber.run(seed_prompt, synthetic_pool, task_description)

        mock_proposer_port.propose.assert_not_called()
|
|
|
|
|
|
class TestPopulationEvolution:
    """Scenarios that run ``EvolutionLoop`` with ``population_size`` greater than one."""

    @pytest.mark.asyncio
    async def test_population_initialization(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: AsyncMock,
        mock_judge_port: AsyncMock,
        mock_proposer_port: AsyncMock,
        mock_mutation_port: AsyncMock,
    ) -> None:
        """A population of four yields four candidates and three ``mutate`` calls."""
        sampler = MagicMock(spec=SyntheticBootstrap)
        sampler.sample_minibatch.return_value = synthetic_pool[:5]

        scripted_evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
        scripted_evaluator.evaluate = AsyncMock(return_value=_make_eval([0.5] * 5, "ok"))

        population_loop = EvolutionLoop(
            evaluator=scripted_evaluator,
            proposer=mock_proposer_port,
            bootstrap=sampler,
            max_iterations=0,  # initialization only; zero evolution steps
            minibatch_size=5,
            population_size=4,
            mutation_port=mock_mutation_port,
        )
        final_state = await population_loop.run(seed_prompt, synthetic_pool, task_description)

        # Expected make-up: the seed plus three mutated variants.
        assert len(final_state.candidates) == 4
        assert mock_mutation_port.mutate.call_count == 3

    @pytest.mark.asyncio
    async def test_population_initialization_uses_proposer_fallback(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: AsyncMock,
        mock_judge_port: AsyncMock,
        mock_proposer_port: AsyncMock,
    ) -> None:
        """Without a ``mutation_port``, the proposer is called to fill the population."""
        sampler = MagicMock(spec=SyntheticBootstrap)
        sampler.sample_minibatch.return_value = synthetic_pool[:5]

        scripted_evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
        scripted_evaluator.evaluate = AsyncMock(return_value=_make_eval([0.5] * 5, "ok"))

        # No mutation_port is supplied on purpose.
        population_loop = EvolutionLoop(
            evaluator=scripted_evaluator,
            proposer=mock_proposer_port,
            bootstrap=sampler,
            max_iterations=0,
            minibatch_size=5,
            population_size=3,
        )
        final_state = await population_loop.run(seed_prompt, synthetic_pool, task_description)

        assert len(final_state.candidates) == 3
        # Three slots minus the seed leaves two proposer-generated members.
        assert mock_proposer_port.propose.call_count == 2

    @pytest.mark.asyncio
    async def test_population_iteration_replaces_worst(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: AsyncMock,
        mock_judge_port: AsyncMock,
        mock_proposer_port: AsyncMock,
        mock_crossover_port: AsyncMock,
        mock_mutation_port: AsyncMock,
    ) -> None:
        """A high-scoring crossover child must produce a ``pop_accepted`` history event.

        ``mock_mutation_port`` is requested as a fixture but is not handed to the loop.
        """
        # Only two evaluations are scripted: the seed first, then the crossover
        # child, whose scores are high so it can beat the worst member.
        # Population init is expected to go through proposer.propose() rather
        # than evaluator.evaluate (per the original author's note; confirm
        # against EvolutionLoop).
        seed_result = _make_eval([0.5] * 5, "ok")
        child_result = _make_eval([0.9, 0.9, 0.8, 0.9, 0.8], "great")

        scripted_evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
        scripted_evaluator.evaluate = AsyncMock(side_effect=[seed_result, child_result])

        sampler = MagicMock(spec=SyntheticBootstrap)
        sampler.sample_minibatch.return_value = synthetic_pool[:5]

        population_loop = EvolutionLoop(
            evaluator=scripted_evaluator,
            proposer=mock_proposer_port,
            bootstrap=sampler,
            max_iterations=1,
            minibatch_size=5,
            population_size=4,
            crossover_rate=1.0,
            crossover_port=mock_crossover_port,
            mutation_rate=0.0,  # no post-crossover mutation, keeps the run deterministic
        )
        final_state = await population_loop.run(seed_prompt, synthetic_pool, task_description)

        assert any(entry.get("event") == "pop_accepted" for entry in final_state.history)

    @pytest.mark.asyncio
    async def test_population_iteration_rejects_inferior_child(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: AsyncMock,
        mock_judge_port: AsyncMock,
        mock_proposer_port: AsyncMock,
        mock_crossover_port: AsyncMock,
    ) -> None:
        """A low-scoring crossover child must produce a ``pop_rejected`` history event."""
        seed_result = _make_eval([0.8] * 5, "ok")
        # The crossover child is scripted to score far below the seed.
        child_result = _make_eval([0.1] * 5, "terrible")

        scripted_evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
        scripted_evaluator.evaluate = AsyncMock(side_effect=[seed_result, child_result])

        sampler = MagicMock(spec=SyntheticBootstrap)
        sampler.sample_minibatch.return_value = synthetic_pool[:5]

        population_loop = EvolutionLoop(
            evaluator=scripted_evaluator,
            proposer=mock_proposer_port,
            bootstrap=sampler,
            max_iterations=1,
            minibatch_size=5,
            population_size=4,
            crossover_rate=1.0,
            crossover_port=mock_crossover_port,
            mutation_rate=0.0,
        )
        final_state = await population_loop.run(seed_prompt, synthetic_pool, task_description)

        assert any(entry.get("event") == "pop_rejected" for entry in final_state.history)
|
|
|
|
|
|
class TestDiversityScore:
    """Checks on the static ``EvolutionLoop._compute_diversity_score`` helper."""

    def test_identical_prompts_have_high_similarity(self) -> None:
        """A prompt that is itself a population member scores above 0.3."""
        shared_prompt = Prompt(text="You are a helpful assistant. Answer the question.")
        unrelated_prompt = Prompt(text="Completely different prompt about data analysis.")
        population = [
            Candidate(prompt=shared_prompt, best_score=4.0, generation=0),
            Candidate(prompt=unrelated_prompt, best_score=3.0, generation=0),
        ]

        score = EvolutionLoop._compute_diversity_score(shared_prompt, population)

        # The unrelated member drags the average down, hence the loose bound
        # (roughly 0.5 expected, not 0.9+).
        assert score > 0.3

    def test_different_prompts_have_lower_similarity(self) -> None:
        """With two distinct members, each member's score stays strictly below 1.0."""
        first_prompt = Prompt(text="You are a helpful assistant. Answer the question.")
        second_prompt = Prompt(
            text="Provide detailed analysis of complex data patterns with precision."
        )
        population = [
            Candidate(prompt=first_prompt, best_score=4.0, generation=0),
            Candidate(prompt=second_prompt, best_score=3.0, generation=0),
        ]

        first_score = EvolutionLoop._compute_diversity_score(first_prompt, population)
        second_score = EvolutionLoop._compute_diversity_score(second_prompt, population)

        # Neither can reach 1.0 because the two texts differ.
        assert first_score < 1.0
        assert second_score < 1.0

    def test_single_member_population_returns_1(self) -> None:
        """A one-member population gives exactly 1.0 (no penalty)."""
        only_prompt = Prompt(text="Any prompt text here.")
        population = [Candidate(prompt=only_prompt, best_score=1.0, generation=0)]

        score = EvolutionLoop._compute_diversity_score(only_prompt, population)

        assert score == 1.0

    def test_empty_prompt_returns_zero(self) -> None:
        """An empty prompt text scores exactly 0.0 against a two-member population."""
        blank_prompt = Prompt(text="")
        member_texts = ["some text", "other text"]
        member_scores = [1.0, 2.0]
        population = [
            Candidate(prompt=Prompt(text=text), best_score=best, generation=0)
            for text, best in zip(member_texts, member_scores)
        ]

        score = EvolutionLoop._compute_diversity_score(blank_prompt, population)

        assert score == 0.0
|
|
|
|
|
|
class TestPromptDiff:
|
|
"""Tests for the static _compute_prompt_diff helper."""
|
|
|
|
def test_identical_prompts(self) -> None:
    """Diffing a text against itself reports no added/removed lines and zero delta."""
    text = "hello\nworld"
    diff = EvolutionLoop._compute_prompt_diff(text, text)
    assert diff["lines_added"] == 0
    assert diff["lines_removed"] == 0
    assert diff["chars_delta"] == 0
|
|
|
|
def test_added_lines(self) -> None:
    """Appending one line is reported as one addition and six extra characters."""
    before = "hello"
    after = "hello\nworld"
    diff = EvolutionLoop._compute_prompt_diff(before, after)
    assert diff["lines_added"] == 1
    assert diff["lines_removed"] == 0
    # The six extra characters are the newline plus "world".
    assert diff["chars_delta"] == 6
|
|
|
|
def test_removed_lines(self) -> None:
|
|
result = EvolutionLoop._compute_prompt_diff("hello\nworld", "hello")
|
|
assert result["lines_added"] == 0
|
|
assert result["lines_removed"] == 1
|