feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage

Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes
2 integration tests that broke when the codebase went async (DSPyLLMAdapter
and full pipeline tests now properly await coroutines).

277 tests pass (260 unit + 17 integration).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
FullStackDev
2026-03-29 19:13:50 +00:00
parent b9745566c8
commit a5bf2ad59c
43 changed files with 5007 additions and 358 deletions

View File

@@ -0,0 +1,316 @@
"""Unit tests for hold-out validation and early stopping."""
from __future__ import annotations
from unittest.mock import AsyncMock, MagicMock
import pytest
from prometheus.application.bootstrap import SyntheticBootstrap
from prometheus.application.evaluator import PromptEvaluator
from prometheus.application.evolution import EvolutionLoop
from prometheus.domain.entities import (
Candidate,
EvalResult,
Prompt,
SyntheticExample,
Trajectory,
)
def _make_eval(mean_score: float, n: int = 5) -> EvalResult:
"""Helper: create an EvalResult with a given mean score."""
scores = [mean_score] * n
return EvalResult(
scores=scores,
feedbacks=["feedback"] * n,
trajectories=[
Trajectory(f"input{i}", f"output{i}", mean_score, "feedback", "prompt")
for i in range(n)
],
)
class TestBootstrapSplit:
"""Tests for SyntheticBootstrap.split_pool."""
def test_split_produces_correct_sizes(self):
pool = [SyntheticExample(input_text=f"ex{i}", id=i) for i in range(20)]
train, val = SyntheticBootstrap.split_pool(pool, 0.3)
assert len(train) + len(val) == 20
assert len(val) == 6 # 20 * 0.3 = 6
assert len(train) == 14
def test_split_zero_fraction_returns_all_train(self):
pool = [SyntheticExample(input_text=f"ex{i}", id=i) for i in range(10)]
train, val = SyntheticBootstrap.split_pool(pool, 0.0)
assert len(train) == 10
assert len(val) == 0
def test_split_single_element(self):
pool = [SyntheticExample(input_text="only", id=0)]
train, val = SyntheticBootstrap.split_pool(pool, 0.3)
assert len(train) == 1
assert len(val) == 0
def test_split_deterministic_with_seed(self):
pool = [SyntheticExample(input_text=f"ex{i}", id=i) for i in range(50)]
train1, val1 = SyntheticBootstrap.split_pool(pool, 0.3, rng=MagicMock(wraps=__import__("random").Random(42)))
train2, val2 = SyntheticBootstrap.split_pool(pool, 0.3, rng=MagicMock(wraps=__import__("random").Random(42)))
assert [ex.id for ex in train1] == [ex.id for ex in train2]
assert [ex.id for ex in val1] == [ex.id for ex in val2]
def test_split_no_overlap(self):
pool = [SyntheticExample(input_text=f"ex{i}", id=i) for i in range(30)]
train, val = SyntheticBootstrap.split_pool(pool, 0.3)
train_ids = {ex.id for ex in train}
val_ids = {ex.id for ex in val}
assert train_ids.isdisjoint(val_ids)
assert train_ids | val_ids == {ex.id for ex in pool}
class TestValidationEvaluation:
"""Tests for hold-out evaluation during evolution."""
@pytest.mark.asyncio
async def test_validation_pool_evaluated_after_each_iteration(
self,
seed_prompt: Prompt,
synthetic_pool: list[SyntheticExample],
task_description: str,
mock_llm_port: AsyncMock,
mock_judge_port: AsyncMock,
mock_proposer_port: AsyncMock,
) -> None:
"""When a validation pool is provided, the best candidate is evaluated on it."""
evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
bootstrap = MagicMock(spec=SyntheticBootstrap)
bootstrap.sample_minibatch.return_value = synthetic_pool[:5]
# Initial eval (train) + validation eval + iteration train eval + new prompt eval + validation eval
train_eval = _make_eval(0.5)
val_eval = _make_eval(0.6)
new_eval = _make_eval(0.7)
val_eval_2 = _make_eval(0.65)
evaluator.evaluate = AsyncMock(
side_effect=[train_eval, val_eval, train_eval, new_eval, val_eval_2]
)
validation_pool = synthetic_pool[-6:]
loop = EvolutionLoop(
evaluator=evaluator,
proposer=mock_proposer_port,
bootstrap=bootstrap,
max_iterations=1,
minibatch_size=5,
)
state = await loop.run(
seed_prompt, synthetic_pool, task_description,
validation_pool=validation_pool,
)
# Should have validation metrics in state
assert state.best_validation_score is not None
# History should contain validation_eval entries
val_events = [h for h in state.history if h["event"] == "validation_eval"]
assert len(val_events) >= 1
@pytest.mark.asyncio
async def test_no_validation_without_pool(
self,
seed_prompt: Prompt,
synthetic_pool: list[SyntheticExample],
task_description: str,
mock_llm_port: AsyncMock,
mock_judge_port: AsyncMock,
mock_proposer_port: AsyncMock,
) -> None:
"""Without a validation pool, no validation is performed."""
evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
bootstrap = MagicMock(spec=SyntheticBootstrap)
bootstrap.sample_minibatch.return_value = synthetic_pool[:5]
train_eval = _make_eval(0.5)
old_eval = _make_eval(0.5)
new_eval = _make_eval(0.7)
evaluator.evaluate = AsyncMock(side_effect=[train_eval, old_eval, new_eval])
loop = EvolutionLoop(
evaluator=evaluator,
proposer=mock_proposer_port,
bootstrap=bootstrap,
max_iterations=1,
minibatch_size=5,
)
state = await loop.run(seed_prompt, synthetic_pool, task_description)
assert state.best_validation_score is None
assert not state.early_stopped
val_events = [h for h in state.history if h["event"] == "validation_eval"]
assert len(val_events) == 0
class TestEarlyStopping:
"""Tests for early stopping when validation score degrades."""
@pytest.mark.asyncio
async def test_early_stop_triggers_on_patience_exceeded(
self,
seed_prompt: Prompt,
synthetic_pool: list[SyntheticExample],
task_description: str,
mock_llm_port: AsyncMock,
mock_judge_port: AsyncMock,
mock_proposer_port: AsyncMock,
) -> None:
"""Early stopping triggers when validation doesn't improve for K iterations."""
evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
bootstrap = MagicMock(spec=SyntheticBootstrap)
bootstrap.sample_minibatch.return_value = synthetic_pool[:5]
patience = 3
# Build eval sequence:
# 1. Initial train eval
# 2. Initial validation eval (0.5)
# Then for each of 3 iterations:
# - train eval (current best)
# - train eval (new prompt - accepted)
# - validation eval (degrading)
evals = [
_make_eval(0.5), # initial train
_make_eval(0.5), # initial validation
]
for i in range(patience):
evals.extend([
_make_eval(0.5 + i * 0.1), # current eval (train)
_make_eval(0.6 + i * 0.1), # new eval (train) - accepted
_make_eval(0.4), # validation eval (degrading)
])
evaluator.evaluate = AsyncMock(side_effect=evals)
validation_pool = synthetic_pool[-5:]
loop = EvolutionLoop(
evaluator=evaluator,
proposer=mock_proposer_port,
bootstrap=bootstrap,
max_iterations=10, # would go further without early stop
minibatch_size=5,
early_stop_patience=patience,
)
state = await loop.run(
seed_prompt, synthetic_pool, task_description,
validation_pool=validation_pool,
)
assert state.early_stopped is True
assert state.iteration == patience
assert state.best_validation_score is not None
# Should have an early_stop event in history
early_stop_events = [h for h in state.history if h["event"] == "early_stop"]
assert len(early_stop_events) == 1
@pytest.mark.asyncio
async def test_early_stop_does_not_trigger_when_improving(
self,
seed_prompt: Prompt,
synthetic_pool: list[SyntheticExample],
task_description: str,
mock_llm_port: AsyncMock,
mock_judge_port: AsyncMock,
mock_proposer_port: AsyncMock,
) -> None:
"""When validation keeps improving, early stopping does not trigger."""
evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
bootstrap = MagicMock(spec=SyntheticBootstrap)
bootstrap.sample_minibatch.return_value = synthetic_pool[:5]
evals = [
_make_eval(0.3), # initial train
_make_eval(0.3), # initial validation
]
# 3 iterations, each with improving validation
for i in range(3):
evals.extend([
_make_eval(0.3 + i * 0.1), # current train eval
_make_eval(0.4 + i * 0.1), # new train eval (accepted)
_make_eval(0.3 + (i + 1) * 0.1), # validation eval (improving)
])
evaluator.evaluate = AsyncMock(side_effect=evals)
validation_pool = synthetic_pool[-5:]
loop = EvolutionLoop(
evaluator=evaluator,
proposer=mock_proposer_port,
bootstrap=bootstrap,
max_iterations=3,
minibatch_size=5,
early_stop_patience=5,
)
state = await loop.run(
seed_prompt, synthetic_pool, task_description,
validation_pool=validation_pool,
)
assert state.early_stopped is False
assert state.iteration == 3
assert state.best_validation_score is not None
@pytest.mark.asyncio
async def test_validation_patience_resets_on_improvement(
self,
seed_prompt: Prompt,
synthetic_pool: list[SyntheticExample],
task_description: str,
mock_llm_port: AsyncMock,
mock_judge_port: AsyncMock,
mock_proposer_port: AsyncMock,
) -> None:
"""Patience counter resets when validation improves after degrading."""
evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
bootstrap = MagicMock(spec=SyntheticBootstrap)
bootstrap.sample_minibatch.return_value = synthetic_pool[:5]
evals = [
_make_eval(0.3), # initial train
_make_eval(0.3), # initial validation
# iter 1: degrade
_make_eval(0.3), # current train
_make_eval(0.5), # new train (accepted)
_make_eval(0.2), # validation degrade (patience=1)
# iter 2: degrade
_make_eval(0.5), # current train
_make_eval(0.6), # new train (accepted)
_make_eval(0.2), # validation degrade (patience=2)
# iter 3: improve! (resets patience)
_make_eval(0.6), # current train
_make_eval(0.7), # new train (accepted)
_make_eval(0.4), # validation improve (patience=0)
# iter 4: degrade again
_make_eval(0.7), # current train
_make_eval(0.8), # new train (accepted)
_make_eval(0.2), # validation degrade (patience=1)
]
evaluator.evaluate = AsyncMock(side_effect=evals)
validation_pool = synthetic_pool[-5:]
loop = EvolutionLoop(
evaluator=evaluator,
proposer=mock_proposer_port,
bootstrap=bootstrap,
max_iterations=4,
minibatch_size=5,
early_stop_patience=3,
)
state = await loop.run(
seed_prompt, synthetic_pool, task_description,
validation_pool=validation_pool,
)
assert state.early_stopped is False
assert state.iteration == 4