feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage

Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes two integration tests that broke when the codebase went async: the DSPyLLMAdapter and full-pipeline tests now properly await coroutines. 277 tests pass (260 unit + 17 integration).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-29 19:13:50 +00:00
parent b9745566c8
commit a5bf2ad59c
43 changed files with 5007 additions and 358 deletions
--- a/tests/integration/test_full_pipeline.py
+++ b/tests/integration/test_full_pipeline.py
@@ -1,7 +1,9 @@
 """End-to-end pipeline test with mocked LLM calls."""
 from __future__ import annotations

-from unittest.mock import MagicMock
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest

 from prometheus.application.bootstrap import SyntheticBootstrap
 from prometheus.application.dto import OptimizationConfig
@@ -23,9 +25,10 @@ def _make_eval(scores: list[float]) -> EvalResult:


 class TestFullPipeline:
-    def test_pipeline_produces_result(self) -> None:
+    @pytest.mark.asyncio
+    async def test_pipeline_produces_result(self) -> None:
        """Full pipeline with mocked ports produces an OptimizationResult."""
-        mock_llm = MagicMock(spec=LLMPort)
+        mock_llm = AsyncMock(spec=LLMPort)
        mock_llm.execute.return_value = "mock response"

        mock_judge = MagicMock(spec=JudgePort)
@@ -38,11 +41,11 @@ class TestFullPipeline:
            eval_sequence.append(_make_eval([0.6, 0.6, 0.6, 0.6, 0.6]))  # new eval (accepted)
        mock_judge.judge_batch.return_value = [(0.5, "ok")] * 5

-        mock_proposer = MagicMock(spec=ProposerPort)
+        mock_proposer = AsyncMock(spec=ProposerPort)
        mock_proposer.propose.return_value = Prompt(text="Improved prompt")

        evaluator = PromptEvaluator(mock_llm, mock_judge)
-        evaluator.evaluate = MagicMock(side_effect=eval_sequence)
+        evaluator.evaluate = AsyncMock(side_effect=eval_sequence)

        mock_gen = MagicMock()
        mock_gen.generate_inputs.return_value = [
@@ -65,7 +68,7 @@ class TestFullPipeline:
            seed=42,
        )

-        result = use_case.execute(config)
+        result = await use_case.execute(config)

        assert result.initial_prompt == "Answer questions."
        assert result.optimized_prompt == "Improved prompt"