feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage

Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes
2 integration tests that broke when the codebase went async (DSPyLLMAdapter
and full pipeline tests now properly await coroutines).

277 tests pass (260 unit + 17 integration).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
FullStackDev
2026-03-29 19:13:50 +00:00
parent b9745566c8
commit a5bf2ad59c
43 changed files with 5007 additions and 358 deletions

View File

@@ -20,9 +20,10 @@ def mock_lm() -> dspy.LM:
class TestDSPyLLMAdapter:
def test_execute_returns_response(self, mock_lm: dspy.LM) -> None:
@pytest.mark.asyncio
async def test_execute_returns_response(self, mock_lm: dspy.LM) -> None:
adapter = DSPyLLMAdapter(lm=mock_lm)
prompt = Prompt(text="Answer the question.")
result = adapter.execute(prompt, "What is 2+2?")
result = await adapter.execute(prompt, "What is 2+2?")
assert isinstance(result, str)
assert len(result) > 0

View File

@@ -0,0 +1,300 @@
"""Integration tests for multi-iteration evolution with mixed accept/reject."""
from __future__ import annotations
from unittest.mock import AsyncMock, MagicMock
import pytest
from prometheus.application.bootstrap import SyntheticBootstrap
from prometheus.application.evaluator import PromptEvaluator
from prometheus.application.evolution import EvolutionLoop
from prometheus.domain.entities import EvalResult, Prompt, SyntheticExample, Trajectory
from prometheus.domain.ports import JudgePort, LLMPort, ProposerPort
def _make_eval(scores: list[float]) -> EvalResult:
    """Build an ``EvalResult`` carrying one placeholder trajectory per score.

    Each score gets the constant feedback string ``"feedback"`` and a
    trajectory whose input/output labels are ``in<i>`` / ``out<i>``.
    """
    trajectories = []
    for index, score in enumerate(scores):
        trajectories.append(
            Trajectory(f"in{index}", f"out{index}", score, "feedback", "prompt")
        )
    return EvalResult(
        scores=scores,
        feedbacks=["feedback" for _ in scores],
        trajectories=trajectories,
    )
class TestMultiIterationEvolution:
    """Tests for the evolution loop across multiple iterations.

    Every test swaps ``evaluator.evaluate`` for an ``AsyncMock`` that replays
    a scripted list of ``EvalResult`` objects, so no LLM or judge is actually
    invoked.  The scripted order is: one evaluation of the seed prompt, then
    for each iteration an evaluation of the current prompt followed (when a
    proposal is produced) by an evaluation of the proposed prompt.
    """

    @pytest.fixture
    def seed_prompt(self) -> Prompt:
        """Seed prompt every test starts the loop from."""
        return Prompt(text="You are a helpful assistant.")

    @pytest.fixture
    def task_description(self) -> str:
        """Task description passed to ``EvolutionLoop.run``."""
        return "Answer factual questions."

    @pytest.fixture
    def synthetic_pool(self) -> list[SyntheticExample]:
        """Pool of 20 synthetic examples; minibatches are sliced from it."""
        return [SyntheticExample(input_text=f"input {i}", id=i) for i in range(20)]

    @pytest.mark.asyncio
    async def test_mixed_accept_reject(
        self,
        seed_prompt: Prompt,
        task_description: str,
        synthetic_pool: list[SyntheticExample],
    ) -> None:
        """Iteration 1: accept, iteration 2: reject, iteration 3: accept."""
        mock_llm = MagicMock(spec=LLMPort)
        mock_judge = MagicMock(spec=JudgePort)
        mock_proposer = MagicMock(spec=ProposerPort)
        evaluator = PromptEvaluator(mock_llm, mock_judge)
        bootstrap = MagicMock(spec=SyntheticBootstrap)
        bootstrap.sample_minibatch.return_value = synthetic_pool[:3]

        # Build eval sequence: initial, then per-iteration (current, new)
        evals = [
            _make_eval([0.3, 0.3, 0.3]),  # initial seed eval
            # Iter 1: accept (old=0.4, new=0.8)
            _make_eval([0.4, 0.4, 0.4]),
            _make_eval([0.8, 0.8, 0.8]),
            # Iter 2: reject (old=0.7, new=0.2)
            _make_eval([0.7, 0.7, 0.7]),
            _make_eval([0.2, 0.2, 0.2]),
            # Iter 3: accept (old=0.5, new=0.9)
            _make_eval([0.5, 0.5, 0.5]),
            _make_eval([0.9, 0.9, 0.9]),
        ]
        evaluator.evaluate = AsyncMock(side_effect=evals)
        mock_proposer.propose.side_effect = [
            Prompt(text="Better prompt v1"),
            Prompt(text="Worse prompt v2"),
            Prompt(text="Best prompt v3"),
        ]

        loop = EvolutionLoop(
            evaluator=evaluator,
            proposer=mock_proposer,
            bootstrap=bootstrap,
            max_iterations=3,
            minibatch_size=3,
        )
        state = await loop.run(seed_prompt, synthetic_pool, task_description)

        assert state.iteration == 3
        assert state.best_candidate is not None
        assert state.best_candidate.best_score == pytest.approx(2.7)  # 0.9*3
        assert len(state.history) == 3
        assert state.history[0]["event"] == "accepted"
        assert state.history[1]["event"] == "rejected"
        assert state.history[2]["event"] == "accepted"

    @pytest.mark.asyncio
    async def test_all_rejected_keeps_seed(
        self,
        seed_prompt: Prompt,
        task_description: str,
        synthetic_pool: list[SyntheticExample],
    ) -> None:
        """When all proposals are rejected, the seed prompt stays as best."""
        mock_llm = MagicMock(spec=LLMPort)
        mock_judge = MagicMock(spec=JudgePort)
        mock_proposer = MagicMock(spec=ProposerPort)
        evaluator = PromptEvaluator(mock_llm, mock_judge)
        bootstrap = MagicMock(spec=SyntheticBootstrap)
        bootstrap.sample_minibatch.return_value = synthetic_pool[:3]

        evals = [
            _make_eval([0.5, 0.5, 0.5]),  # initial
        ]
        for _ in range(3):
            evals.append(_make_eval([0.5, 0.5, 0.5]))  # current
            evals.append(_make_eval([0.1, 0.1, 0.1]))  # worse proposal
        evaluator.evaluate = AsyncMock(side_effect=evals)
        mock_proposer.propose.side_effect = [
            Prompt(text="bad v1"),
            Prompt(text="bad v2"),
            Prompt(text="bad v3"),
        ]

        loop = EvolutionLoop(
            evaluator=evaluator,
            proposer=mock_proposer,
            bootstrap=bootstrap,
            max_iterations=3,
            minibatch_size=3,
        )
        state = await loop.run(seed_prompt, synthetic_pool, task_description)

        # Fail with a clear assertion (not an AttributeError) if no best
        # candidate was recorded at all.
        assert state.best_candidate is not None
        assert state.best_candidate.prompt.text == seed_prompt.text
        assert state.best_candidate.best_score == pytest.approx(1.5)  # 0.5*3

    @pytest.mark.asyncio
    async def test_all_accepted_chain(
        self,
        seed_prompt: Prompt,
        task_description: str,
        synthetic_pool: list[SyntheticExample],
    ) -> None:
        """All iterations accept, forming an improvement chain."""
        mock_llm = MagicMock(spec=LLMPort)
        mock_judge = MagicMock(spec=JudgePort)
        mock_proposer = MagicMock(spec=ProposerPort)
        evaluator = PromptEvaluator(mock_llm, mock_judge)
        bootstrap = MagicMock(spec=SyntheticBootstrap)
        bootstrap.sample_minibatch.return_value = synthetic_pool[:2]

        evals = [
            _make_eval([0.2, 0.2]),  # initial
        ]
        for i in range(1, 5):
            score = 0.2 + i * 0.15
            evals.append(_make_eval([score, score]))  # current
            evals.append(_make_eval([score + 0.1, score + 0.1]))  # new (accepted)
        evaluator.evaluate = AsyncMock(side_effect=evals)
        mock_proposer.propose.side_effect = [
            Prompt(text=f"Improved v{i}") for i in range(4)
        ]

        loop = EvolutionLoop(
            evaluator=evaluator,
            proposer=mock_proposer,
            bootstrap=bootstrap,
            max_iterations=4,
            minibatch_size=2,
        )
        state = await loop.run(seed_prompt, synthetic_pool, task_description)

        assert len(state.candidates) == 5  # seed + 4 accepted
        # all() is True for an empty iterable, so require at least one event
        # before checking that every event is an acceptance.
        assert state.history
        assert all(h["event"] == "accepted" for h in state.history)

    @pytest.mark.asyncio
    async def test_error_recovery_continues_loop(
        self,
        seed_prompt: Prompt,
        task_description: str,
        synthetic_pool: list[SyntheticExample],
    ) -> None:
        """When an iteration errors, the loop continues."""
        mock_llm = MagicMock(spec=LLMPort)
        mock_judge = MagicMock(spec=JudgePort)
        mock_proposer = MagicMock(spec=ProposerPort)
        evaluator = PromptEvaluator(mock_llm, mock_judge)
        bootstrap = MagicMock(spec=SyntheticBootstrap)
        bootstrap.sample_minibatch.return_value = synthetic_pool[:2]

        # Eval sequence for 3 iterations:
        # - iter 1: evaluate current → propose → evaluate new (accepted)
        # - iter 2: evaluate current → propose (ERROR, no new eval)
        # - iter 3: evaluate current → propose → evaluate new (accepted)
        evals = [
            _make_eval([0.3, 0.3]),  # initial
            _make_eval([0.5, 0.5]),  # iter 1 current
            _make_eval([0.9, 0.9]),  # iter 1 new (accepted)
            _make_eval([0.5, 0.5]),  # iter 2 current (proposer errors after this)
            _make_eval([0.5, 0.5]),  # iter 3 current
            _make_eval([0.8, 0.8]),  # iter 3 new (accepted)
        ]
        evaluator.evaluate = AsyncMock(side_effect=evals)
        # Proposer raises on iter 2
        mock_proposer.propose.side_effect = [
            Prompt(text="good v1"),
            RuntimeError("LLM timeout"),
            Prompt(text="good v3"),
        ]

        loop = EvolutionLoop(
            evaluator=evaluator,
            proposer=mock_proposer,
            bootstrap=bootstrap,
            max_iterations=3,
            minibatch_size=2,
        )
        state = await loop.run(seed_prompt, synthetic_pool, task_description)

        assert state.iteration == 3
        assert state.history[1]["event"] == "error"
        assert "LLM timeout" in state.history[1]["error"]
        assert state.history[0]["event"] == "accepted"
        assert state.history[2]["event"] == "accepted"

    @pytest.mark.asyncio
    async def test_perfect_score_skips_proposer(
        self,
        seed_prompt: Prompt,
        task_description: str,
        synthetic_pool: list[SyntheticExample],
    ) -> None:
        """When all scores are perfect, no proposition is made."""
        mock_llm = MagicMock(spec=LLMPort)
        mock_judge = MagicMock(spec=JudgePort)
        mock_proposer = MagicMock(spec=ProposerPort)
        evaluator = PromptEvaluator(mock_llm, mock_judge)
        bootstrap = MagicMock(spec=SyntheticBootstrap)
        bootstrap.sample_minibatch.return_value = synthetic_pool[:2]

        # return_value (not side_effect): every evaluation yields the same
        # perfect result no matter how many times the loop evaluates.
        perfect_eval = _make_eval([1.0, 1.0])
        evaluator.evaluate = AsyncMock(return_value=perfect_eval)

        loop = EvolutionLoop(
            evaluator=evaluator,
            proposer=mock_proposer,
            bootstrap=bootstrap,
            max_iterations=5,
            minibatch_size=2,
            perfect_score=1.0,
        )
        state = await loop.run(seed_prompt, synthetic_pool, task_description)

        mock_proposer.propose.assert_not_called()
        # all() is True for an empty iterable, so require at least one event
        # before checking that every event is a perfect-score skip.
        assert state.history
        assert all(h["event"] == "skip_perfect" for h in state.history)

    @pytest.mark.asyncio
    async def test_llm_call_counting(
        self,
        seed_prompt: Prompt,
        task_description: str,
        synthetic_pool: list[SyntheticExample],
    ) -> None:
        """Verify LLM call counting: 2*N per eval (execute + judge) + 1 per propose."""
        mock_llm = MagicMock(spec=LLMPort)
        mock_judge = MagicMock(spec=JudgePort)
        mock_proposer = MagicMock(spec=ProposerPort)
        evaluator = PromptEvaluator(mock_llm, mock_judge)
        bootstrap = MagicMock(spec=SyntheticBootstrap)
        bootstrap.sample_minibatch.return_value = synthetic_pool[:3]

        evals = [_make_eval([0.3, 0.3, 0.3])]  # initial
        for _ in range(2):
            evals.append(_make_eval([0.4, 0.4, 0.4]))
            evals.append(_make_eval([0.6, 0.6, 0.6]))
        evaluator.evaluate = AsyncMock(side_effect=evals)
        mock_proposer.propose.side_effect = [
            Prompt(text="v1"),
            Prompt(text="v2"),
        ]

        loop = EvolutionLoop(
            evaluator=evaluator,
            proposer=mock_proposer,
            bootstrap=bootstrap,
            max_iterations=2,
            minibatch_size=3,
        )
        state = await loop.run(seed_prompt, synthetic_pool, task_description)

        # Initial: 2*3=6, Iter1: 2*3 + 1 + 2*3 = 13, Iter2: same = 13
        # Total: 6 + 13 + 13 = 32
        assert state.total_llm_calls == 32

View File

@@ -1,7 +1,9 @@
"""End-to-end pipeline test with mocked LLM calls."""
from __future__ import annotations
from unittest.mock import MagicMock
from unittest.mock import AsyncMock, MagicMock
import pytest
from prometheus.application.bootstrap import SyntheticBootstrap
from prometheus.application.dto import OptimizationConfig
@@ -23,9 +25,10 @@ def _make_eval(scores: list[float]) -> EvalResult:
class TestFullPipeline:
def test_pipeline_produces_result(self) -> None:
@pytest.mark.asyncio
async def test_pipeline_produces_result(self) -> None:
"""Full pipeline with mocked ports produces an OptimizationResult."""
mock_llm = MagicMock(spec=LLMPort)
mock_llm = AsyncMock(spec=LLMPort)
mock_llm.execute.return_value = "mock response"
mock_judge = MagicMock(spec=JudgePort)
@@ -38,11 +41,11 @@ class TestFullPipeline:
eval_sequence.append(_make_eval([0.6, 0.6, 0.6, 0.6, 0.6])) # new eval (accepted)
mock_judge.judge_batch.return_value = [(0.5, "ok")] * 5
mock_proposer = MagicMock(spec=ProposerPort)
mock_proposer = AsyncMock(spec=ProposerPort)
mock_proposer.propose.return_value = Prompt(text="Improved prompt")
evaluator = PromptEvaluator(mock_llm, mock_judge)
evaluator.evaluate = MagicMock(side_effect=eval_sequence)
evaluator.evaluate = AsyncMock(side_effect=eval_sequence)
mock_gen = MagicMock()
mock_gen.generate_inputs.return_value = [
@@ -65,7 +68,7 @@ class TestFullPipeline:
seed=42,
)
result = use_case.execute(config)
result = await use_case.execute(config)
assert result.initial_prompt == "Answer questions."
assert result.optimized_prompt == "Improved prompt"

View File

@@ -0,0 +1,199 @@
"""Integration test — ground-truth evaluation end-to-end with real similarity metrics."""
from __future__ import annotations
import asyncio
import json
import pytest
from unittest.mock import AsyncMock
from prometheus.application.ground_truth_evaluator import GroundTruthEvaluator
from prometheus.domain.entities import GroundTruthExample, Prompt
from prometheus.domain.ports import LLMPort
from prometheus.infrastructure.dataset_loader import FileDatasetLoader
from prometheus.infrastructure.similarity import (
BleuSimilarity,
CosineSimilarity,
ExactMatchSimilarity,
RougeLSimilarity,
create_similarity_adapter,
)
def _make_dataset(items: list[tuple[str, str]]) -> list[GroundTruthExample]:
    """Turn ``(input, expected_output)`` pairs into ``GroundTruthExample`` objects.

    Each example's ``id`` is its zero-based position in ``items``.
    """
    examples = []
    for index, (question, answer) in enumerate(items):
        examples.append(
            GroundTruthExample(input_text=question, expected_output=answer, id=index)
        )
    return examples
@pytest.fixture
def qa_dataset():
    """Three short question/answer pairs used as the ground-truth dataset."""
    pairs = [
        ("What is the capital of France?", "Paris"),
        ("What is 2+2?", "4"),
        ("What color is the sky?", "blue"),
    ]
    return _make_dataset(pairs)
@pytest.fixture
def prompt():
    """Prompt under evaluation in every test of this module."""
    instruction = "Answer the following question concisely."
    return Prompt(text=instruction)
@pytest.fixture
def mock_executor():
    """Mocked LLM port whose answers contain, but do not equal, the expected outputs."""
    canned_responses = [
        "Paris is the capital of France.",
        "The answer is 4.",
        "The sky is blue.",
    ]
    executor = AsyncMock(spec=LLMPort)
    # One response per dataset item, consumed in call order.
    executor.execute.side_effect = canned_responses
    return executor
class TestGroundTruthIntegrationWithExactMatch:
    """Ground-truth evaluation scored with ``ExactMatchSimilarity``."""

    @pytest.mark.asyncio
    async def test_exact_match_on_qa(self, mock_executor, qa_dataset, prompt):
        """Verbose answers that merely contain the expected text score 0.0."""
        metric = ExactMatchSimilarity()
        evaluator = GroundTruthEvaluator(executor=mock_executor, similarity=metric)

        outcome = await evaluator.evaluate(prompt, qa_dataset)

        # None of the outputs are exact matches with expected outputs
        for score in outcome.scores:
            assert score == 0.0

    @pytest.mark.asyncio
    async def test_exact_match_with_exact_outputs(self, qa_dataset, prompt):
        """Answers identical to the expected outputs score 1.0."""
        verbatim_answers = ["Paris", "4", "blue"]
        exact_executor = AsyncMock(spec=LLMPort)
        exact_executor.execute.side_effect = verbatim_answers
        metric = ExactMatchSimilarity()
        evaluator = GroundTruthEvaluator(executor=exact_executor, similarity=metric)

        outcome = await evaluator.evaluate(prompt, qa_dataset)

        for score in outcome.scores:
            assert score == 1.0
class TestGroundTruthIntegrationWithBleu:
    """Ground-truth evaluation scored with ``BleuSimilarity``."""

    @pytest.mark.asyncio
    async def test_bleu_scores_partial_match(self, mock_executor, qa_dataset, prompt):
        """Partially matching answers score strictly between 0 and 1."""
        metric = BleuSimilarity()
        evaluator = GroundTruthEvaluator(executor=mock_executor, similarity=metric)

        outcome = await evaluator.evaluate(prompt, qa_dataset)

        for score in outcome.scores:
            assert 0.0 < score < 1.0
        assert outcome.mean_score > 0.0

    @pytest.mark.asyncio
    async def test_bleu_perfect_match(self, qa_dataset, prompt):
        """Answers identical to the expected outputs receive a positive score."""
        verbatim_answers = ["Paris", "4", "blue"]
        perfect_executor = AsyncMock(spec=LLMPort)
        perfect_executor.execute.side_effect = verbatim_answers
        metric = BleuSimilarity()
        evaluator = GroundTruthEvaluator(executor=perfect_executor, similarity=metric)

        outcome = await evaluator.evaluate(prompt, qa_dataset)

        for score in outcome.scores:
            assert score > 0.0
class TestGroundTruthIntegrationWithRouge:
    """Ground-truth evaluation scored with ``RougeLSimilarity``."""

    @pytest.mark.asyncio
    async def test_rouge_l_scores(self, mock_executor, qa_dataset, prompt):
        """Partially matching answers all receive a positive score."""
        metric = RougeLSimilarity()
        evaluator = GroundTruthEvaluator(executor=mock_executor, similarity=metric)

        outcome = await evaluator.evaluate(prompt, qa_dataset)

        for score in outcome.scores:
            assert score > 0.0
class TestGroundTruthIntegrationWithCosine:
    """Ground-truth evaluation scored with ``CosineSimilarity``."""

    @pytest.mark.asyncio
    async def test_cosine_scores(self, mock_executor, qa_dataset, prompt):
        """Partially matching answers all receive a positive score."""
        metric = CosineSimilarity()
        evaluator = GroundTruthEvaluator(executor=mock_executor, similarity=metric)

        outcome = await evaluator.evaluate(prompt, qa_dataset)

        for score in outcome.scores:
            assert score > 0.0
class TestDatasetLoaderIntegration:
    """Load a dataset file with ``FileDatasetLoader`` and feed it to the evaluator."""

    @pytest.mark.asyncio
    async def test_load_csv_and_evaluate(self, tmp_path, prompt):
        """A two-row CSV loads as two examples that exact-match the mocked answers."""
        csv_file = tmp_path / "eval.csv"
        csv_file.write_text(
            "input,expected_output\n"
            "What is 2+2?,4\n"
            "What color is grass?,green\n"
        )

        dataset = FileDatasetLoader().load(str(csv_file))
        assert len(dataset) == 2

        executor = AsyncMock(spec=LLMPort)
        executor.execute.side_effect = ["4", "green"]
        metric = ExactMatchSimilarity()
        evaluator = GroundTruthEvaluator(executor=executor, similarity=metric)

        outcome = await evaluator.evaluate(prompt, dataset)

        for score in outcome.scores:
            assert score == 1.0

    @pytest.mark.asyncio
    async def test_load_json_and_evaluate(self, tmp_path, prompt):
        """A two-item JSON file loads as two examples scored with the "bleu" adapter."""
        records = [
            {"input": "What is 2+2?", "expected_output": "4"},
            {"input": "What color is grass?", "expected_output": "green"},
        ]
        json_file = tmp_path / "eval.json"
        json_file.write_text(json.dumps(records))

        dataset = FileDatasetLoader().load(str(json_file))
        assert len(dataset) == 2

        executor = AsyncMock(spec=LLMPort)
        executor.execute.side_effect = ["4", "not green"]
        metric = create_similarity_adapter("bleu")
        evaluator = GroundTruthEvaluator(executor=executor, similarity=metric)

        outcome = await evaluator.evaluate(prompt, dataset)

        # First item should score well, second poorly
        first_score, second_score = outcome.scores[0], outcome.scores[1]
        assert first_score > second_score
class TestMetricComparison:
    """Run every similarity metric over identical outputs and compare mean scores."""

    @pytest.mark.asyncio
    async def test_metrics_give_different_scores(self, qa_dataset, prompt):
        """Exact match yields a mean of 0.0 while the other metrics yield a positive mean."""
        metric_classes = {
            "exact": ExactMatchSimilarity,
            "bleu": BleuSimilarity,
            "rouge_l": RougeLSimilarity,
            "cosine": CosineSimilarity,
        }
        mean_scores = {}
        for label, similarity_cls in metric_classes.items():
            # A fresh mock per metric: side_effect is consumed as calls are made.
            executor = AsyncMock(spec=LLMPort)
            executor.execute.side_effect = [
                "Paris is the capital of France.",
                "The answer is 4.",
                "The sky is blue.",
            ]
            evaluator = GroundTruthEvaluator(
                executor=executor,
                similarity=similarity_cls(),
            )
            outcome = await evaluator.evaluate(prompt, qa_dataset)
            mean_scores[label] = outcome.mean_score

        # Exact match should be 0 (no exact matches)
        assert mean_scores["exact"] == 0.0
        # All other metrics should give partial credit
        for label in ("bleu", "rouge_l", "cosine"):
            assert mean_scores[label] > 0.0