feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage

Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes two integration tests that broke when the codebase went async: the DSPyLLMAdapter and full-pipeline tests now properly await coroutines. 277 tests pass (260 unit + 17 integration).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-29 19:13:50 +00:00
parent b9745566c8
commit a5bf2ad59c
43 changed files with 5007 additions and 358 deletions
--- a/tests/unit/test_holdout_validation.py
+++ b/tests/unit/test_holdout_validation.py
@@ -0,0 +1,316 @@
+"""Unit tests for hold-out validation and early stopping."""
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from prometheus.application.bootstrap import SyntheticBootstrap
+from prometheus.application.evaluator import PromptEvaluator
+from prometheus.application.evolution import EvolutionLoop
+from prometheus.domain.entities import (
+    Candidate,
+    EvalResult,
+    Prompt,
+    SyntheticExample,
+    Trajectory,
+)
+
+
+def _make_eval(mean_score: float, n: int = 5) -> EvalResult:
+    """Helper: create an EvalResult with a given mean score."""
+    scores = [mean_score] * n
+    return EvalResult(
+        scores=scores,
+        feedbacks=["feedback"] * n,
+        trajectories=[
+            Trajectory(f"input{i}", f"output{i}", mean_score, "feedback", "prompt")
+            for i in range(n)
+        ],
+    )
+
+
+class TestBootstrapSplit:
+    """Tests for SyntheticBootstrap.split_pool."""
+
+    def test_split_produces_correct_sizes(self):
+        pool = [SyntheticExample(input_text=f"ex{i}", id=i) for i in range(20)]
+        train, val = SyntheticBootstrap.split_pool(pool, 0.3)
+        assert len(train) + len(val) == 20
+        assert len(val) == 6  # 20 * 0.3 = 6
+        assert len(train) == 14
+
+    def test_split_zero_fraction_returns_all_train(self):
+        pool = [SyntheticExample(input_text=f"ex{i}", id=i) for i in range(10)]
+        train, val = SyntheticBootstrap.split_pool(pool, 0.0)
+        assert len(train) == 10
+        assert len(val) == 0
+
+    def test_split_single_element(self):
+        pool = [SyntheticExample(input_text="only", id=0)]
+        train, val = SyntheticBootstrap.split_pool(pool, 0.3)
+        assert len(train) == 1
+        assert len(val) == 0
+
+    def test_split_deterministic_with_seed(self):
+        pool = [SyntheticExample(input_text=f"ex{i}", id=i) for i in range(50)]
+        train1, val1 = SyntheticBootstrap.split_pool(pool, 0.3, rng=MagicMock(wraps=__import__("random").Random(42)))
+        train2, val2 = SyntheticBootstrap.split_pool(pool, 0.3, rng=MagicMock(wraps=__import__("random").Random(42)))
+        assert [ex.id for ex in train1] == [ex.id for ex in train2]
+        assert [ex.id for ex in val1] == [ex.id for ex in val2]
+
+    def test_split_no_overlap(self):
+        pool = [SyntheticExample(input_text=f"ex{i}", id=i) for i in range(30)]
+        train, val = SyntheticBootstrap.split_pool(pool, 0.3)
+        train_ids = {ex.id for ex in train}
+        val_ids = {ex.id for ex in val}
+        assert train_ids.isdisjoint(val_ids)
+        assert train_ids | val_ids == {ex.id for ex in pool}
+
+
+class TestValidationEvaluation:
+    """Tests for hold-out evaluation during evolution."""
+
+    @pytest.mark.asyncio
+    async def test_validation_pool_evaluated_after_each_iteration(
+        self,
+        seed_prompt: Prompt,
+        synthetic_pool: list[SyntheticExample],
+        task_description: str,
+        mock_llm_port: AsyncMock,
+        mock_judge_port: AsyncMock,
+        mock_proposer_port: AsyncMock,
+    ) -> None:
+        """When a validation pool is provided, the best candidate is evaluated on it."""
+        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
+        bootstrap = MagicMock(spec=SyntheticBootstrap)
+        bootstrap.sample_minibatch.return_value = synthetic_pool[:5]
+
+        # Initial eval (train) + validation eval + iteration train eval + new prompt eval + validation eval
+        train_eval = _make_eval(0.5)
+        val_eval = _make_eval(0.6)
+        new_eval = _make_eval(0.7)
+        val_eval_2 = _make_eval(0.65)
+
+        evaluator.evaluate = AsyncMock(
+            side_effect=[train_eval, val_eval, train_eval, new_eval, val_eval_2]
+        )
+
+        validation_pool = synthetic_pool[-6:]
+
+        loop = EvolutionLoop(
+            evaluator=evaluator,
+            proposer=mock_proposer_port,
+            bootstrap=bootstrap,
+            max_iterations=1,
+            minibatch_size=5,
+        )
+        state = await loop.run(
+            seed_prompt, synthetic_pool, task_description,
+            validation_pool=validation_pool,
+        )
+
+        # Should have validation metrics in state
+        assert state.best_validation_score is not None
+        # History should contain validation_eval entries
+        val_events = [h for h in state.history if h["event"] == "validation_eval"]
+        assert len(val_events) >= 1
+
+    @pytest.mark.asyncio
+    async def test_no_validation_without_pool(
+        self,
+        seed_prompt: Prompt,
+        synthetic_pool: list[SyntheticExample],
+        task_description: str,
+        mock_llm_port: AsyncMock,
+        mock_judge_port: AsyncMock,
+        mock_proposer_port: AsyncMock,
+    ) -> None:
+        """Without a validation pool, no validation is performed."""
+        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
+        bootstrap = MagicMock(spec=SyntheticBootstrap)
+        bootstrap.sample_minibatch.return_value = synthetic_pool[:5]
+
+        train_eval = _make_eval(0.5)
+        old_eval = _make_eval(0.5)
+        new_eval = _make_eval(0.7)
+        evaluator.evaluate = AsyncMock(side_effect=[train_eval, old_eval, new_eval])
+
+        loop = EvolutionLoop(
+            evaluator=evaluator,
+            proposer=mock_proposer_port,
+            bootstrap=bootstrap,
+            max_iterations=1,
+            minibatch_size=5,
+        )
+        state = await loop.run(seed_prompt, synthetic_pool, task_description)
+
+        assert state.best_validation_score is None
+        assert not state.early_stopped
+        val_events = [h for h in state.history if h["event"] == "validation_eval"]
+        assert len(val_events) == 0
+
+
+class TestEarlyStopping:
+    """Tests for early stopping when validation score degrades."""
+
+    @pytest.mark.asyncio
+    async def test_early_stop_triggers_on_patience_exceeded(
+        self,
+        seed_prompt: Prompt,
+        synthetic_pool: list[SyntheticExample],
+        task_description: str,
+        mock_llm_port: AsyncMock,
+        mock_judge_port: AsyncMock,
+        mock_proposer_port: AsyncMock,
+    ) -> None:
+        """Early stopping triggers when validation doesn't improve for K iterations."""
+        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
+        bootstrap = MagicMock(spec=SyntheticBootstrap)
+        bootstrap.sample_minibatch.return_value = synthetic_pool[:5]
+
+        patience = 3
+        # Build eval sequence:
+        # 1. Initial train eval
+        # 2. Initial validation eval (0.5)
+        # Then for each of 3 iterations:
+        #   - train eval (current best)
+        #   - train eval (new prompt - accepted)
+        #   - validation eval (degrading)
+        evals = [
+            _make_eval(0.5),   # initial train
+            _make_eval(0.5),   # initial validation
+        ]
+        for i in range(patience):
+            evals.extend([
+                _make_eval(0.5 + i * 0.1),  # current eval (train)
+                _make_eval(0.6 + i * 0.1),  # new eval (train) - accepted
+                _make_eval(0.4),            # validation eval (degrading)
+            ])
+
+        evaluator.evaluate = AsyncMock(side_effect=evals)
+
+        validation_pool = synthetic_pool[-5:]
+
+        loop = EvolutionLoop(
+            evaluator=evaluator,
+            proposer=mock_proposer_port,
+            bootstrap=bootstrap,
+            max_iterations=10,  # would go further without early stop
+            minibatch_size=5,
+            early_stop_patience=patience,
+        )
+        state = await loop.run(
+            seed_prompt, synthetic_pool, task_description,
+            validation_pool=validation_pool,
+        )
+
+        assert state.early_stopped is True
+        assert state.iteration == patience
+        assert state.best_validation_score is not None
+        # Should have an early_stop event in history
+        early_stop_events = [h for h in state.history if h["event"] == "early_stop"]
+        assert len(early_stop_events) == 1
+
+    @pytest.mark.asyncio
+    async def test_early_stop_does_not_trigger_when_improving(
+        self,
+        seed_prompt: Prompt,
+        synthetic_pool: list[SyntheticExample],
+        task_description: str,
+        mock_llm_port: AsyncMock,
+        mock_judge_port: AsyncMock,
+        mock_proposer_port: AsyncMock,
+    ) -> None:
+        """When validation keeps improving, early stopping does not trigger."""
+        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
+        bootstrap = MagicMock(spec=SyntheticBootstrap)
+        bootstrap.sample_minibatch.return_value = synthetic_pool[:5]
+
+        evals = [
+            _make_eval(0.3),   # initial train
+            _make_eval(0.3),   # initial validation
+        ]
+        # 3 iterations, each with improving validation
+        for i in range(3):
+            evals.extend([
+                _make_eval(0.3 + i * 0.1),  # current train eval
+                _make_eval(0.4 + i * 0.1),  # new train eval (accepted)
+                _make_eval(0.3 + (i + 1) * 0.1),  # validation eval (improving)
+            ])
+
+        evaluator.evaluate = AsyncMock(side_effect=evals)
+
+        validation_pool = synthetic_pool[-5:]
+
+        loop = EvolutionLoop(
+            evaluator=evaluator,
+            proposer=mock_proposer_port,
+            bootstrap=bootstrap,
+            max_iterations=3,
+            minibatch_size=5,
+            early_stop_patience=5,
+        )
+        state = await loop.run(
+            seed_prompt, synthetic_pool, task_description,
+            validation_pool=validation_pool,
+        )
+
+        assert state.early_stopped is False
+        assert state.iteration == 3
+        assert state.best_validation_score is not None
+
+    @pytest.mark.asyncio
+    async def test_validation_patience_resets_on_improvement(
+        self,
+        seed_prompt: Prompt,
+        synthetic_pool: list[SyntheticExample],
+        task_description: str,
+        mock_llm_port: AsyncMock,
+        mock_judge_port: AsyncMock,
+        mock_proposer_port: AsyncMock,
+    ) -> None:
+        """Patience counter resets when validation improves after degrading."""
+        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
+        bootstrap = MagicMock(spec=SyntheticBootstrap)
+        bootstrap.sample_minibatch.return_value = synthetic_pool[:5]
+
+        evals = [
+            _make_eval(0.3),   # initial train
+            _make_eval(0.3),   # initial validation
+            # iter 1: degrade
+            _make_eval(0.3),   # current train
+            _make_eval(0.5),   # new train (accepted)
+            _make_eval(0.2),   # validation degrade (patience=1)
+            # iter 2: degrade
+            _make_eval(0.5),   # current train
+            _make_eval(0.6),   # new train (accepted)
+            _make_eval(0.2),   # validation degrade (patience=2)
+            # iter 3: improve! (resets patience)
+            _make_eval(0.6),   # current train
+            _make_eval(0.7),   # new train (accepted)
+            _make_eval(0.4),   # validation improve (patience=0)
+            # iter 4: degrade again
+            _make_eval(0.7),   # current train
+            _make_eval(0.8),   # new train (accepted)
+            _make_eval(0.2),   # validation degrade (patience=1)
+        ]
+
+        evaluator.evaluate = AsyncMock(side_effect=evals)
+        validation_pool = synthetic_pool[-5:]
+
+        loop = EvolutionLoop(
+            evaluator=evaluator,
+            proposer=mock_proposer_port,
+            bootstrap=bootstrap,
+            max_iterations=4,
+            minibatch_size=5,
+            early_stop_patience=3,
+        )
+        state = await loop.run(
+            seed_prompt, synthetic_pool, task_description,
+            validation_pool=validation_pool,
+        )
+
+        assert state.early_stopped is False
+        assert state.iteration == 4