feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage

Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes 2 integration tests that broke when the codebase went async (DSPyLLMAdapter and full pipeline tests now properly await coroutines). 277 tests pass (260 unit + 17 integration). Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-29 19:13:50 +00:00
parent b9745566c8
commit a5bf2ad59c
43 changed files with 5007 additions and 358 deletions
--- a/tests/unit/test_use_cases.py
+++ b/tests/unit/test_use_cases.py
@@ -0,0 +1,233 @@
+"""Unit tests for OptimizePromptUseCase — direct orchestration tests."""
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from prometheus.application.bootstrap import SyntheticBootstrap
+from prometheus.application.dto import OptimizationConfig, OptimizationResult
+from prometheus.application.evaluator import PromptEvaluator
+from prometheus.application.evolution import EvolutionLoop
+from prometheus.application.use_cases import OptimizePromptUseCase
+from prometheus.domain.entities import (
+    Candidate,
+    EvalResult,
+    OptimizationState,
+    Prompt,
+    SyntheticExample,
+    Trajectory,
+)
+
+
+def _make_eval(scores: list[float]) -> EvalResult:
+    return EvalResult(
+        scores=scores,
+        feedbacks=["feedback"] * len(scores),
+        trajectories=[
+            Trajectory(f"in{i}", f"out{i}", s, "feedback", "prompt")
+            for i, s in enumerate(scores)
+        ],
+    )
+
+
+def _make_state(
+    iterations: int = 3,
+    initial_score: float = 0.3,
+    final_score: float = 0.8,
+    accepted: bool = True,
+) -> OptimizationState:
+    seed = Candidate(prompt=Prompt(text="seed"), best_score=initial_score, generation=0)
+    best = Candidate(
+        prompt=Prompt(text="optimized" if accepted else "seed"),
+        best_score=final_score,
+        generation=iterations if accepted else 0,
+    )
+    history = []
+    for i in range(1, iterations + 1):
+        event = "accepted" if accepted else "rejected"
+        history.append({"iteration": i, "event": event, "old_score": 0.3, "new_score": 0.8})
+
+    return OptimizationState(
+        iteration=iterations,
+        best_candidate=best,
+        candidates=[seed, best] if accepted else [seed],
+        total_llm_calls=iterations * 11 + 10,
+        history=history,
+    )
+
+
+class TestOptimizePromptUseCaseExecute:
+    """Tests for the execute() orchestration method."""
+
+    @pytest.fixture
+    def mock_evaluator(self) -> MagicMock:
+        return MagicMock(spec=PromptEvaluator)
+
+    @pytest.fixture
+    def mock_proposer(self) -> MagicMock:
+        return MagicMock()
+
+    @pytest.fixture
+    def mock_bootstrap(self) -> MagicMock:
+        return MagicMock(spec=SyntheticBootstrap)
+
+    @pytest.fixture
+    def use_case(
+        self,
+        mock_evaluator: MagicMock,
+        mock_proposer: MagicMock,
+        mock_bootstrap: MagicMock,
+    ) -> OptimizePromptUseCase:
+        return OptimizePromptUseCase(
+            evaluator=mock_evaluator,
+            proposer=mock_proposer,
+            bootstrap=mock_bootstrap,
+        )
+
+    @pytest.fixture
+    def config(self) -> OptimizationConfig:
+        return OptimizationConfig(
+            seed_prompt="Answer the question.",
+            task_description="Q&A task",
+            max_iterations=5,
+            n_synthetic_inputs=20,
+            minibatch_size=5,
+            seed=42,
+        )
+
+    @pytest.mark.asyncio
+    async def test_returns_optimization_result(
+        self,
+        use_case: OptimizePromptUseCase,
+        mock_bootstrap: MagicMock,
+        config: OptimizationConfig,
+    ) -> None:
+        mock_bootstrap.run.return_value = [
+            SyntheticExample(input_text=f"q{i}", id=i) for i in range(20)
+        ]
+
+        mock_state = _make_state(iterations=3, initial_score=0.3, final_score=0.9)
+        with patch.object(EvolutionLoop, "run", return_value=mock_state):
+            result = await use_case.execute(config)
+
+        assert isinstance(result, OptimizationResult)
+        assert result.initial_prompt == "Answer the question."
+        assert result.final_score == 0.9
+        assert result.improvement == pytest.approx(0.6)
+
+    @pytest.mark.asyncio
+    async def test_bootstrap_called_with_config_params(
+        self,
+        use_case: OptimizePromptUseCase,
+        mock_bootstrap: MagicMock,
+        config: OptimizationConfig,
+    ) -> None:
+        mock_bootstrap.run.return_value = []
+        mock_state = _make_state()
+        with patch.object(EvolutionLoop, "run", return_value=mock_state):
+            await use_case.execute(config)
+
+        mock_bootstrap.run.assert_called_once_with(
+            task_description="Q&A task",
+            n_examples=20,
+        )
+
+    @pytest.mark.asyncio
+    async def test_evolution_loop_configured_from_config(
+        self,
+        use_case: OptimizePromptUseCase,
+        mock_bootstrap: MagicMock,
+        config: OptimizationConfig,
+    ) -> None:
+        mock_bootstrap.run.return_value = []
+        mock_state = _make_state()
+
+        with patch.object(EvolutionLoop, "run", return_value=mock_state) as mock_run:
+            await use_case.execute(config)
+
+            # Verify the loop was instantiated with correct params
+            mock_run.assert_called_once()
+            call_args = mock_run.call_args
+            seed_prompt = call_args[0][0]
+            assert seed_prompt.text == "Answer the question."
+            synthetic_pool = call_args[0][1]
+            assert len(synthetic_pool) == 0  # bootstrap returned empty
+            assert call_args[0][2] == "Q&A task"
+
+    @pytest.mark.asyncio
+    async def test_total_llm_calls_includes_bootstrap_call(
+        self,
+        use_case: OptimizePromptUseCase,
+        mock_bootstrap: MagicMock,
+        config: OptimizationConfig,
+    ) -> None:
+        mock_bootstrap.run.return_value = []
+        mock_state = _make_state(iterations=3)
+        # total_llm_calls from state + 1 for bootstrap
+        expected = mock_state.total_llm_calls + 1
+
+        with patch.object(EvolutionLoop, "run", return_value=mock_state):
+            result = await use_case.execute(config)
+
+        assert result.total_llm_calls == expected
+
+    @pytest.mark.asyncio
+    async def test_no_candidates_fallback(
+        self,
+        use_case: OptimizePromptUseCase,
+        mock_bootstrap: MagicMock,
+        config: OptimizationConfig,
+    ) -> None:
+        mock_bootstrap.run.return_value = [
+            SyntheticExample(input_text=f"q{i}", id=i) for i in range(20)
+        ]
+        mock_state = OptimizationState(
+            iteration=0,
+            best_candidate=None,
+            candidates=[],
+            total_llm_calls=0,
+        )
+
+        with patch.object(EvolutionLoop, "run", return_value=mock_state):
+            result = await use_case.execute(config)
+
+        assert result.optimized_prompt == "Answer the question."
+        assert result.initial_score == 0.0
+        assert result.final_score == 0.0
+        assert result.improvement == 0.0
+
+    @pytest.mark.asyncio
+    async def test_iterations_used_matches_state(
+        self,
+        use_case: OptimizePromptUseCase,
+        mock_bootstrap: MagicMock,
+        config: OptimizationConfig,
+    ) -> None:
+        mock_bootstrap.run.return_value = []
+        mock_state = _make_state(iterations=7)
+
+        with patch.object(EvolutionLoop, "run", return_value=mock_state):
+            result = await use_case.execute(config)
+
+        assert result.iterations_used == 7
+
+    @pytest.mark.asyncio
+    async def test_history_passed_through(
+        self,
+        use_case: OptimizePromptUseCase,
+        mock_bootstrap: MagicMock,
+        config: OptimizationConfig,
+    ) -> None:
+        mock_bootstrap.run.return_value = []
+        history = [
+            {"iteration": 1, "event": "accepted"},
+            {"iteration": 2, "event": "rejected"},
+        ]
+        mock_state = _make_state()
+        mock_state.history = history
+
+        with patch.object(EvolutionLoop, "run", return_value=mock_state):
+            result = await use_case.execute(config)
+
+        assert result.history == history