feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage
Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes 2 integration tests that broke when the codebase went async (DSPyLLMAdapter and full pipeline tests now properly await coroutines). 277 tests pass (260 unit + 17 integration). Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
316
tests/unit/test_holdout_validation.py
Normal file
316
tests/unit/test_holdout_validation.py
Normal file
@@ -0,0 +1,316 @@
|
||||
"""Unit tests for hold-out validation and early stopping."""
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from prometheus.application.bootstrap import SyntheticBootstrap
|
||||
from prometheus.application.evaluator import PromptEvaluator
|
||||
from prometheus.application.evolution import EvolutionLoop
|
||||
from prometheus.domain.entities import (
|
||||
Candidate,
|
||||
EvalResult,
|
||||
Prompt,
|
||||
SyntheticExample,
|
||||
Trajectory,
|
||||
)
|
||||
|
||||
|
||||
def _make_eval(mean_score: float, n: int = 5) -> EvalResult:
|
||||
"""Helper: create an EvalResult with a given mean score."""
|
||||
scores = [mean_score] * n
|
||||
return EvalResult(
|
||||
scores=scores,
|
||||
feedbacks=["feedback"] * n,
|
||||
trajectories=[
|
||||
Trajectory(f"input{i}", f"output{i}", mean_score, "feedback", "prompt")
|
||||
for i in range(n)
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
class TestBootstrapSplit:
|
||||
"""Tests for SyntheticBootstrap.split_pool."""
|
||||
|
||||
def test_split_produces_correct_sizes(self):
|
||||
pool = [SyntheticExample(input_text=f"ex{i}", id=i) for i in range(20)]
|
||||
train, val = SyntheticBootstrap.split_pool(pool, 0.3)
|
||||
assert len(train) + len(val) == 20
|
||||
assert len(val) == 6 # 20 * 0.3 = 6
|
||||
assert len(train) == 14
|
||||
|
||||
def test_split_zero_fraction_returns_all_train(self):
|
||||
pool = [SyntheticExample(input_text=f"ex{i}", id=i) for i in range(10)]
|
||||
train, val = SyntheticBootstrap.split_pool(pool, 0.0)
|
||||
assert len(train) == 10
|
||||
assert len(val) == 0
|
||||
|
||||
def test_split_single_element(self):
|
||||
pool = [SyntheticExample(input_text="only", id=0)]
|
||||
train, val = SyntheticBootstrap.split_pool(pool, 0.3)
|
||||
assert len(train) == 1
|
||||
assert len(val) == 0
|
||||
|
||||
def test_split_deterministic_with_seed(self):
|
||||
pool = [SyntheticExample(input_text=f"ex{i}", id=i) for i in range(50)]
|
||||
train1, val1 = SyntheticBootstrap.split_pool(pool, 0.3, rng=MagicMock(wraps=__import__("random").Random(42)))
|
||||
train2, val2 = SyntheticBootstrap.split_pool(pool, 0.3, rng=MagicMock(wraps=__import__("random").Random(42)))
|
||||
assert [ex.id for ex in train1] == [ex.id for ex in train2]
|
||||
assert [ex.id for ex in val1] == [ex.id for ex in val2]
|
||||
|
||||
def test_split_no_overlap(self):
|
||||
pool = [SyntheticExample(input_text=f"ex{i}", id=i) for i in range(30)]
|
||||
train, val = SyntheticBootstrap.split_pool(pool, 0.3)
|
||||
train_ids = {ex.id for ex in train}
|
||||
val_ids = {ex.id for ex in val}
|
||||
assert train_ids.isdisjoint(val_ids)
|
||||
assert train_ids | val_ids == {ex.id for ex in pool}
|
||||
|
||||
|
||||
class TestValidationEvaluation:
|
||||
"""Tests for hold-out evaluation during evolution."""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_validation_pool_evaluated_after_each_iteration(
|
||||
self,
|
||||
seed_prompt: Prompt,
|
||||
synthetic_pool: list[SyntheticExample],
|
||||
task_description: str,
|
||||
mock_llm_port: AsyncMock,
|
||||
mock_judge_port: AsyncMock,
|
||||
mock_proposer_port: AsyncMock,
|
||||
) -> None:
|
||||
"""When a validation pool is provided, the best candidate is evaluated on it."""
|
||||
evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
|
||||
bootstrap = MagicMock(spec=SyntheticBootstrap)
|
||||
bootstrap.sample_minibatch.return_value = synthetic_pool[:5]
|
||||
|
||||
# Initial eval (train) + validation eval + iteration train eval + new prompt eval + validation eval
|
||||
train_eval = _make_eval(0.5)
|
||||
val_eval = _make_eval(0.6)
|
||||
new_eval = _make_eval(0.7)
|
||||
val_eval_2 = _make_eval(0.65)
|
||||
|
||||
evaluator.evaluate = AsyncMock(
|
||||
side_effect=[train_eval, val_eval, train_eval, new_eval, val_eval_2]
|
||||
)
|
||||
|
||||
validation_pool = synthetic_pool[-6:]
|
||||
|
||||
loop = EvolutionLoop(
|
||||
evaluator=evaluator,
|
||||
proposer=mock_proposer_port,
|
||||
bootstrap=bootstrap,
|
||||
max_iterations=1,
|
||||
minibatch_size=5,
|
||||
)
|
||||
state = await loop.run(
|
||||
seed_prompt, synthetic_pool, task_description,
|
||||
validation_pool=validation_pool,
|
||||
)
|
||||
|
||||
# Should have validation metrics in state
|
||||
assert state.best_validation_score is not None
|
||||
# History should contain validation_eval entries
|
||||
val_events = [h for h in state.history if h["event"] == "validation_eval"]
|
||||
assert len(val_events) >= 1
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_no_validation_without_pool(
|
||||
self,
|
||||
seed_prompt: Prompt,
|
||||
synthetic_pool: list[SyntheticExample],
|
||||
task_description: str,
|
||||
mock_llm_port: AsyncMock,
|
||||
mock_judge_port: AsyncMock,
|
||||
mock_proposer_port: AsyncMock,
|
||||
) -> None:
|
||||
"""Without a validation pool, no validation is performed."""
|
||||
evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
|
||||
bootstrap = MagicMock(spec=SyntheticBootstrap)
|
||||
bootstrap.sample_minibatch.return_value = synthetic_pool[:5]
|
||||
|
||||
train_eval = _make_eval(0.5)
|
||||
old_eval = _make_eval(0.5)
|
||||
new_eval = _make_eval(0.7)
|
||||
evaluator.evaluate = AsyncMock(side_effect=[train_eval, old_eval, new_eval])
|
||||
|
||||
loop = EvolutionLoop(
|
||||
evaluator=evaluator,
|
||||
proposer=mock_proposer_port,
|
||||
bootstrap=bootstrap,
|
||||
max_iterations=1,
|
||||
minibatch_size=5,
|
||||
)
|
||||
state = await loop.run(seed_prompt, synthetic_pool, task_description)
|
||||
|
||||
assert state.best_validation_score is None
|
||||
assert not state.early_stopped
|
||||
val_events = [h for h in state.history if h["event"] == "validation_eval"]
|
||||
assert len(val_events) == 0
|
||||
|
||||
|
||||
class TestEarlyStopping:
|
||||
"""Tests for early stopping when validation score degrades."""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_early_stop_triggers_on_patience_exceeded(
|
||||
self,
|
||||
seed_prompt: Prompt,
|
||||
synthetic_pool: list[SyntheticExample],
|
||||
task_description: str,
|
||||
mock_llm_port: AsyncMock,
|
||||
mock_judge_port: AsyncMock,
|
||||
mock_proposer_port: AsyncMock,
|
||||
) -> None:
|
||||
"""Early stopping triggers when validation doesn't improve for K iterations."""
|
||||
evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
|
||||
bootstrap = MagicMock(spec=SyntheticBootstrap)
|
||||
bootstrap.sample_minibatch.return_value = synthetic_pool[:5]
|
||||
|
||||
patience = 3
|
||||
# Build eval sequence:
|
||||
# 1. Initial train eval
|
||||
# 2. Initial validation eval (0.5)
|
||||
# Then for each of 3 iterations:
|
||||
# - train eval (current best)
|
||||
# - train eval (new prompt - accepted)
|
||||
# - validation eval (degrading)
|
||||
evals = [
|
||||
_make_eval(0.5), # initial train
|
||||
_make_eval(0.5), # initial validation
|
||||
]
|
||||
for i in range(patience):
|
||||
evals.extend([
|
||||
_make_eval(0.5 + i * 0.1), # current eval (train)
|
||||
_make_eval(0.6 + i * 0.1), # new eval (train) - accepted
|
||||
_make_eval(0.4), # validation eval (degrading)
|
||||
])
|
||||
|
||||
evaluator.evaluate = AsyncMock(side_effect=evals)
|
||||
|
||||
validation_pool = synthetic_pool[-5:]
|
||||
|
||||
loop = EvolutionLoop(
|
||||
evaluator=evaluator,
|
||||
proposer=mock_proposer_port,
|
||||
bootstrap=bootstrap,
|
||||
max_iterations=10, # would go further without early stop
|
||||
minibatch_size=5,
|
||||
early_stop_patience=patience,
|
||||
)
|
||||
state = await loop.run(
|
||||
seed_prompt, synthetic_pool, task_description,
|
||||
validation_pool=validation_pool,
|
||||
)
|
||||
|
||||
assert state.early_stopped is True
|
||||
assert state.iteration == patience
|
||||
assert state.best_validation_score is not None
|
||||
# Should have an early_stop event in history
|
||||
early_stop_events = [h for h in state.history if h["event"] == "early_stop"]
|
||||
assert len(early_stop_events) == 1
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_early_stop_does_not_trigger_when_improving(
|
||||
self,
|
||||
seed_prompt: Prompt,
|
||||
synthetic_pool: list[SyntheticExample],
|
||||
task_description: str,
|
||||
mock_llm_port: AsyncMock,
|
||||
mock_judge_port: AsyncMock,
|
||||
mock_proposer_port: AsyncMock,
|
||||
) -> None:
|
||||
"""When validation keeps improving, early stopping does not trigger."""
|
||||
evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
|
||||
bootstrap = MagicMock(spec=SyntheticBootstrap)
|
||||
bootstrap.sample_minibatch.return_value = synthetic_pool[:5]
|
||||
|
||||
evals = [
|
||||
_make_eval(0.3), # initial train
|
||||
_make_eval(0.3), # initial validation
|
||||
]
|
||||
# 3 iterations, each with improving validation
|
||||
for i in range(3):
|
||||
evals.extend([
|
||||
_make_eval(0.3 + i * 0.1), # current train eval
|
||||
_make_eval(0.4 + i * 0.1), # new train eval (accepted)
|
||||
_make_eval(0.3 + (i + 1) * 0.1), # validation eval (improving)
|
||||
])
|
||||
|
||||
evaluator.evaluate = AsyncMock(side_effect=evals)
|
||||
|
||||
validation_pool = synthetic_pool[-5:]
|
||||
|
||||
loop = EvolutionLoop(
|
||||
evaluator=evaluator,
|
||||
proposer=mock_proposer_port,
|
||||
bootstrap=bootstrap,
|
||||
max_iterations=3,
|
||||
minibatch_size=5,
|
||||
early_stop_patience=5,
|
||||
)
|
||||
state = await loop.run(
|
||||
seed_prompt, synthetic_pool, task_description,
|
||||
validation_pool=validation_pool,
|
||||
)
|
||||
|
||||
assert state.early_stopped is False
|
||||
assert state.iteration == 3
|
||||
assert state.best_validation_score is not None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_validation_patience_resets_on_improvement(
|
||||
self,
|
||||
seed_prompt: Prompt,
|
||||
synthetic_pool: list[SyntheticExample],
|
||||
task_description: str,
|
||||
mock_llm_port: AsyncMock,
|
||||
mock_judge_port: AsyncMock,
|
||||
mock_proposer_port: AsyncMock,
|
||||
) -> None:
|
||||
"""Patience counter resets when validation improves after degrading."""
|
||||
evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
|
||||
bootstrap = MagicMock(spec=SyntheticBootstrap)
|
||||
bootstrap.sample_minibatch.return_value = synthetic_pool[:5]
|
||||
|
||||
evals = [
|
||||
_make_eval(0.3), # initial train
|
||||
_make_eval(0.3), # initial validation
|
||||
# iter 1: degrade
|
||||
_make_eval(0.3), # current train
|
||||
_make_eval(0.5), # new train (accepted)
|
||||
_make_eval(0.2), # validation degrade (patience=1)
|
||||
# iter 2: degrade
|
||||
_make_eval(0.5), # current train
|
||||
_make_eval(0.6), # new train (accepted)
|
||||
_make_eval(0.2), # validation degrade (patience=2)
|
||||
# iter 3: improve! (resets patience)
|
||||
_make_eval(0.6), # current train
|
||||
_make_eval(0.7), # new train (accepted)
|
||||
_make_eval(0.4), # validation improve (patience=0)
|
||||
# iter 4: degrade again
|
||||
_make_eval(0.7), # current train
|
||||
_make_eval(0.8), # new train (accepted)
|
||||
_make_eval(0.2), # validation degrade (patience=1)
|
||||
]
|
||||
|
||||
evaluator.evaluate = AsyncMock(side_effect=evals)
|
||||
validation_pool = synthetic_pool[-5:]
|
||||
|
||||
loop = EvolutionLoop(
|
||||
evaluator=evaluator,
|
||||
proposer=mock_proposer_port,
|
||||
bootstrap=bootstrap,
|
||||
max_iterations=4,
|
||||
minibatch_size=5,
|
||||
early_stop_patience=3,
|
||||
)
|
||||
state = await loop.run(
|
||||
seed_prompt, synthetic_pool, task_description,
|
||||
validation_pool=validation_pool,
|
||||
)
|
||||
|
||||
assert state.early_stopped is False
|
||||
assert state.iteration == 4
|
||||
Reference in New Issue
Block a user