feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage
Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes 2 integration tests that broke when the codebase went async (DSPyLLMAdapter and full pipeline tests now properly await coroutines). 277 tests pass (260 unit + 17 integration). Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
300
tests/integration/test_evolution_integration.py
Normal file
300
tests/integration/test_evolution_integration.py
Normal file
@@ -0,0 +1,300 @@
|
||||
"""Integration tests for multi-iteration evolution with mixed accept/reject."""
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from prometheus.application.bootstrap import SyntheticBootstrap
|
||||
from prometheus.application.evaluator import PromptEvaluator
|
||||
from prometheus.application.evolution import EvolutionLoop
|
||||
from prometheus.domain.entities import EvalResult, Prompt, SyntheticExample, Trajectory
|
||||
from prometheus.domain.ports import JudgePort, LLMPort, ProposerPort
|
||||
|
||||
|
||||
def _make_eval(scores: list[float]) -> EvalResult:
    """Build a canned ``EvalResult`` for the given per-example scores.

    Every example gets the literal feedback string ``"feedback"`` and one
    ``Trajectory`` whose first two fields are ``"in<i>"`` / ``"out<i>"``.

    Args:
        scores: One score per minibatch example.

    Returns:
        An ``EvalResult`` with ``len(scores)`` feedbacks and trajectories.
    """
    feedbacks = ["feedback" for _ in scores]

    trajectories = []
    for index, score in enumerate(scores):
        # NOTE(review): positional order presumably is
        # (input, output, score, feedback, prompt) — confirm against Trajectory.
        trajectories.append(
            Trajectory(f"in{index}", f"out{index}", score, "feedback", "prompt")
        )

    return EvalResult(
        scores=scores,
        feedbacks=feedbacks,
        trajectories=trajectories,
    )
|
||||
|
||||
|
||||
class TestMultiIterationEvolution:
    """Tests for the evolution loop across multiple iterations.

    ``PromptEvaluator.evaluate`` and the proposer are replaced with scripted
    mocks, so each test dictates the exact sequence of evaluation results and
    proposals that ``EvolutionLoop.run`` observes.
    """

    @pytest.fixture
    def seed_prompt(self) -> Prompt:
        """Prompt the loop starts from."""
        return Prompt(text="You are a helpful assistant.")

    @pytest.fixture
    def task_description(self) -> str:
        """Task description passed through to ``EvolutionLoop.run``."""
        return "Answer factual questions."

    @pytest.fixture
    def synthetic_pool(self) -> list[SyntheticExample]:
        """Twenty synthetic examples with ids 0..19."""
        return [SyntheticExample(input_text=f"input {i}", id=i) for i in range(20)]

    @staticmethod
    def _build_loop(
        minibatch: list[SyntheticExample],
        evaluate: AsyncMock,
        proposals: list[Prompt | Exception] | None = None,
        **loop_kwargs,
    ) -> tuple[EvolutionLoop, MagicMock]:
        """Wire an ``EvolutionLoop`` to scripted mocks.

        Args:
            minibatch: Examples returned by every ``sample_minibatch`` call.
            evaluate: Replacement for ``PromptEvaluator.evaluate``.
            proposals: Successive results of ``proposer.propose``; exception
                instances in the list are raised by the mock instead of
                returned. ``None`` leaves the proposer unscripted.
            **loop_kwargs: Extra keyword arguments for ``EvolutionLoop``
                (e.g. ``max_iterations``, ``minibatch_size``).

        Returns:
            The loop and the proposer mock, so tests can inspect its calls.
        """
        mock_proposer = MagicMock(spec=ProposerPort)
        if proposals is not None:
            mock_proposer.propose.side_effect = proposals

        evaluator = PromptEvaluator(MagicMock(spec=LLMPort), MagicMock(spec=JudgePort))
        # Bypass the real LLM/judge round trips with the scripted results.
        evaluator.evaluate = evaluate

        bootstrap = MagicMock(spec=SyntheticBootstrap)
        bootstrap.sample_minibatch.return_value = minibatch

        loop = EvolutionLoop(
            evaluator=evaluator,
            proposer=mock_proposer,
            bootstrap=bootstrap,
            **loop_kwargs,
        )
        return loop, mock_proposer

    @pytest.mark.asyncio
    async def test_mixed_accept_reject(
        self,
        seed_prompt: Prompt,
        task_description: str,
        synthetic_pool: list[SyntheticExample],
    ) -> None:
        """Iteration 1: accept, iteration 2: reject, iteration 3: accept."""
        # Eval sequence: initial, then per-iteration (current, new).
        evaluate = AsyncMock(
            side_effect=[
                _make_eval([0.3, 0.3, 0.3]),  # initial seed eval
                # Iter 1: accept (old=0.4, new=0.8)
                _make_eval([0.4, 0.4, 0.4]),
                _make_eval([0.8, 0.8, 0.8]),
                # Iter 2: reject (old=0.7, new=0.2)
                _make_eval([0.7, 0.7, 0.7]),
                _make_eval([0.2, 0.2, 0.2]),
                # Iter 3: accept (old=0.5, new=0.9)
                _make_eval([0.5, 0.5, 0.5]),
                _make_eval([0.9, 0.9, 0.9]),
            ]
        )
        loop, _ = self._build_loop(
            synthetic_pool[:3],
            evaluate,
            [
                Prompt(text="Better prompt v1"),
                Prompt(text="Worse prompt v2"),
                Prompt(text="Best prompt v3"),
            ],
            max_iterations=3,
            minibatch_size=3,
        )

        state = await loop.run(seed_prompt, synthetic_pool, task_description)

        assert state.iteration == 3
        assert state.best_candidate is not None
        assert state.best_candidate.best_score == pytest.approx(2.7)  # 0.9*3
        assert len(state.history) == 3
        assert state.history[0]["event"] == "accepted"
        assert state.history[1]["event"] == "rejected"
        assert state.history[2]["event"] == "accepted"

    @pytest.mark.asyncio
    async def test_all_rejected_keeps_seed(
        self,
        seed_prompt: Prompt,
        task_description: str,
        synthetic_pool: list[SyntheticExample],
    ) -> None:
        """When all proposals are rejected, the seed prompt stays as best."""
        evals = [_make_eval([0.5, 0.5, 0.5])]  # initial
        for _ in range(3):
            evals.append(_make_eval([0.5, 0.5, 0.5]))  # current
            evals.append(_make_eval([0.1, 0.1, 0.1]))  # worse proposal

        loop, _ = self._build_loop(
            synthetic_pool[:3],
            AsyncMock(side_effect=evals),
            [Prompt(text="bad v1"), Prompt(text="bad v2"), Prompt(text="bad v3")],
            max_iterations=3,
            minibatch_size=3,
        )

        state = await loop.run(seed_prompt, synthetic_pool, task_description)

        # Fail with a clear assertion rather than an AttributeError below.
        assert state.best_candidate is not None
        assert state.best_candidate.prompt.text == seed_prompt.text
        assert state.best_candidate.best_score == pytest.approx(1.5)  # 0.5*3

    @pytest.mark.asyncio
    async def test_all_accepted_chain(
        self,
        seed_prompt: Prompt,
        task_description: str,
        synthetic_pool: list[SyntheticExample],
    ) -> None:
        """All iterations accept, forming an improvement chain."""
        evals = [_make_eval([0.2, 0.2])]  # initial
        for i in range(1, 5):
            score = 0.2 + i * 0.15
            evals.append(_make_eval([score, score]))  # current
            evals.append(_make_eval([score + 0.1, score + 0.1]))  # new (accepted)

        loop, _ = self._build_loop(
            synthetic_pool[:2],
            AsyncMock(side_effect=evals),
            [Prompt(text=f"Improved v{i}") for i in range(4)],
            max_iterations=4,
            minibatch_size=2,
        )

        state = await loop.run(seed_prompt, synthetic_pool, task_description)

        assert len(state.candidates) == 5  # seed + 4 accepted
        # Pin the length first: all() over an empty history is vacuously true.
        assert len(state.history) == 4
        assert all(h["event"] == "accepted" for h in state.history)

    @pytest.mark.asyncio
    async def test_error_recovery_continues_loop(
        self,
        seed_prompt: Prompt,
        task_description: str,
        synthetic_pool: list[SyntheticExample],
    ) -> None:
        """When an iteration errors, the loop continues."""
        # Eval sequence for 3 iterations:
        # - iter 1: evaluate current → propose → evaluate new (accepted)
        # - iter 2: evaluate current → propose (ERROR, no new eval)
        # - iter 3: evaluate current → propose → evaluate new (accepted)
        evaluate = AsyncMock(
            side_effect=[
                _make_eval([0.3, 0.3]),  # initial
                _make_eval([0.5, 0.5]),  # iter 1 current
                _make_eval([0.9, 0.9]),  # iter 1 new (accepted)
                _make_eval([0.5, 0.5]),  # iter 2 current (proposer errors after this)
                _make_eval([0.5, 0.5]),  # iter 3 current
                _make_eval([0.8, 0.8]),  # iter 3 new (accepted)
            ]
        )
        loop, _ = self._build_loop(
            synthetic_pool[:2],
            evaluate,
            # The exception instance makes the proposer raise on iter 2.
            [
                Prompt(text="good v1"),
                RuntimeError("LLM timeout"),
                Prompt(text="good v3"),
            ],
            max_iterations=3,
            minibatch_size=2,
        )

        state = await loop.run(seed_prompt, synthetic_pool, task_description)

        assert state.iteration == 3
        assert state.history[1]["event"] == "error"
        assert "LLM timeout" in state.history[1]["error"]
        assert state.history[0]["event"] == "accepted"
        assert state.history[2]["event"] == "accepted"

    @pytest.mark.asyncio
    async def test_perfect_score_skips_proposer(
        self,
        seed_prompt: Prompt,
        task_description: str,
        synthetic_pool: list[SyntheticExample],
    ) -> None:
        """When all scores are perfect, no proposition is made."""
        loop, mock_proposer = self._build_loop(
            synthetic_pool[:2],
            AsyncMock(return_value=_make_eval([1.0, 1.0])),
            max_iterations=5,
            minibatch_size=2,
            perfect_score=1.0,
        )

        state = await loop.run(seed_prompt, synthetic_pool, task_description)

        mock_proposer.propose.assert_not_called()
        # Guard against a vacuous pass: all() is True for an empty history.
        assert state.history, "expected at least one recorded iteration"
        assert all(h["event"] == "skip_perfect" for h in state.history)

    @pytest.mark.asyncio
    async def test_llm_call_counting(
        self,
        seed_prompt: Prompt,
        task_description: str,
        synthetic_pool: list[SyntheticExample],
    ) -> None:
        """Verify LLM call counting: 2*N per eval (execute + judge) + 1 per propose."""
        evals = [_make_eval([0.3, 0.3, 0.3])]  # initial
        for _ in range(2):
            evals.append(_make_eval([0.4, 0.4, 0.4]))
            evals.append(_make_eval([0.6, 0.6, 0.6]))

        loop, _ = self._build_loop(
            synthetic_pool[:3],
            AsyncMock(side_effect=evals),
            [Prompt(text="v1"), Prompt(text="v2")],
            max_iterations=2,
            minibatch_size=3,
        )

        state = await loop.run(seed_prompt, synthetic_pool, task_description)

        # Initial: 2*3=6, Iter1: 2*3 + 1 + 2*3 = 13, Iter2: same = 13
        # Total: 6 + 13 + 13 = 32
        assert state.total_llm_calls == 32
|
||||
Reference in New Issue
Block a user