feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage

Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes 2 integration tests that broke when the codebase went async (DSPyLLMAdapter and full pipeline tests now properly await coroutines). 277 tests pass (260 unit + 17 integration). Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-29 19:13:50 +00:00
parent b9745566c8
commit a5bf2ad59c
43 changed files with 5007 additions and 358 deletions
--- a/tests/integration/test_ground_truth_eval.py
+++ b/tests/integration/test_ground_truth_eval.py
@@ -0,0 +1,199 @@
+"""Integration test — ground-truth evaluation end-to-end with real similarity metrics."""
+from __future__ import annotations
+
+import asyncio
+import json
+
+import pytest
+from unittest.mock import AsyncMock
+
+from prometheus.application.ground_truth_evaluator import GroundTruthEvaluator
+from prometheus.domain.entities import GroundTruthExample, Prompt
+from prometheus.domain.ports import LLMPort
+from prometheus.infrastructure.dataset_loader import FileDatasetLoader
+from prometheus.infrastructure.similarity import (
+    BleuSimilarity,
+    CosineSimilarity,
+    ExactMatchSimilarity,
+    RougeLSimilarity,
+    create_similarity_adapter,
+)
+
+
+def _make_dataset(items: list[tuple[str, str]]) -> list[GroundTruthExample]:
+    """Turn ``(input_text, expected_output)`` pairs into ground-truth examples.
+
+    Each example's ``id`` is the zero-based position of its pair in ``items``.
+    """
+    dataset: list[GroundTruthExample] = []
+    for position, (question, answer) in enumerate(items):
+        example = GroundTruthExample(input_text=question, expected_output=answer, id=position)
+        dataset.append(example)
+    return dataset
+
+
+@pytest.fixture
+def qa_dataset():
+    """Three short question/answer pairs wrapped as ground-truth examples."""
+    qa_pairs = [
+        ("What is the capital of France?", "Paris"),
+        ("What is 2+2?", "4"),
+        ("What color is the sky?", "blue"),
+    ]
+    return _make_dataset(qa_pairs)
+
+
+@pytest.fixture
+def prompt():
+    """The instruction prompt handed to the evaluator in each test."""
+    instruction = "Answer the following question concisely."
+    return Prompt(text=instruction)
+
+
+@pytest.fixture
+def mock_executor():
+    """Return an ``LLMPort`` mock whose replies embed, but never equal, the expected answers.
+
+    ``execute`` hands back one canned reply per call, in the order listed.
+    """
+    canned_replies = [
+        "Paris is the capital of France.",
+        "The answer is 4.",
+        "The sky is blue.",
+    ]
+    llm = AsyncMock(spec=LLMPort)
+    llm.execute.side_effect = canned_replies
+    return llm
+
+
+class TestGroundTruthIntegrationWithExactMatch:
+    """End-to-end evaluation scored with ``ExactMatchSimilarity``."""
+
+    @pytest.mark.asyncio
+    async def test_exact_match_on_qa(self, mock_executor, qa_dataset, prompt):
+        """Replies that merely contain the expected answer all score 0.0."""
+        evaluator = GroundTruthEvaluator(
+            executor=mock_executor,
+            similarity=ExactMatchSimilarity(),
+        )
+        result = await evaluator.evaluate(prompt, qa_dataset)
+        # all() is vacuously true for an empty sequence, so pin the score count first.
+        assert len(result.scores) == len(qa_dataset)
+        # None of the outputs are exact matches with expected outputs
+        assert all(s == 0.0 for s in result.scores)
+
+    @pytest.mark.asyncio
+    async def test_exact_match_with_exact_outputs(self, qa_dataset, prompt):
+        """Replies identical to the expected answers all score 1.0."""
+        exact_executor = AsyncMock(spec=LLMPort)
+        exact_executor.execute.side_effect = ["Paris", "4", "blue"]
+        evaluator = GroundTruthEvaluator(
+            executor=exact_executor,
+            similarity=ExactMatchSimilarity(),
+        )
+        result = await evaluator.evaluate(prompt, qa_dataset)
+        # Guard against an empty result passing the all() check below.
+        assert len(result.scores) == len(qa_dataset)
+        assert all(s == 1.0 for s in result.scores)
+
+
+class TestGroundTruthIntegrationWithBleu:
+    """End-to-end evaluation scored with ``BleuSimilarity``."""
+
+    @pytest.mark.asyncio
+    async def test_bleu_scores_partial_match(self, mock_executor, qa_dataset, prompt):
+        """Replies that embed the expected answer score strictly between 0 and 1."""
+        evaluator = GroundTruthEvaluator(
+            executor=mock_executor,
+            similarity=BleuSimilarity(),
+        )
+        result = await evaluator.evaluate(prompt, qa_dataset)
+        # all() is vacuously true for an empty sequence, so pin the score count first.
+        assert len(result.scores) == len(qa_dataset)
+        assert all(0.0 < s < 1.0 for s in result.scores)
+        assert result.mean_score > 0.0
+
+    @pytest.mark.asyncio
+    async def test_bleu_perfect_match(self, qa_dataset, prompt):
+        """Replies identical to the expected answers all get a positive score."""
+        perfect_executor = AsyncMock(spec=LLMPort)
+        perfect_executor.execute.side_effect = ["Paris", "4", "blue"]
+        evaluator = GroundTruthEvaluator(
+            executor=perfect_executor,
+            similarity=BleuSimilarity(),
+        )
+        result = await evaluator.evaluate(prompt, qa_dataset)
+        # Guard against an empty result passing the all() check below.
+        assert len(result.scores) == len(qa_dataset)
+        # NOTE(review): only positivity is asserted here; whether BleuSimilarity
+        # returns exactly 1.0 for identical text is not visible from this file.
+        # Confirm against the adapter before tightening this assertion.
+        assert all(s > 0.0 for s in result.scores)
+
+
+class TestGroundTruthIntegrationWithRouge:
+    """End-to-end evaluation scored with ``RougeLSimilarity``."""
+
+    @pytest.mark.asyncio
+    async def test_rouge_l_scores(self, mock_executor, qa_dataset, prompt):
+        """Replies that embed the expected answer all get a positive score."""
+        evaluator = GroundTruthEvaluator(
+            executor=mock_executor,
+            similarity=RougeLSimilarity(),
+        )
+        result = await evaluator.evaluate(prompt, qa_dataset)
+        # all() is vacuously true for an empty sequence, so pin the score count first.
+        assert len(result.scores) == len(qa_dataset)
+        assert all(s > 0.0 for s in result.scores)
+
+
+class TestGroundTruthIntegrationWithCosine:
+    """End-to-end evaluation scored with ``CosineSimilarity``."""
+
+    @pytest.mark.asyncio
+    async def test_cosine_scores(self, mock_executor, qa_dataset, prompt):
+        """Replies that embed the expected answer all get a positive score."""
+        evaluator = GroundTruthEvaluator(
+            executor=mock_executor,
+            similarity=CosineSimilarity(),
+        )
+        result = await evaluator.evaluate(prompt, qa_dataset)
+        # all() is vacuously true for an empty sequence, so pin the score count first.
+        assert len(result.scores) == len(qa_dataset)
+        assert all(s > 0.0 for s in result.scores)
+
+
+class TestDatasetLoaderIntegration:
+    """Load a dataset file with ``FileDatasetLoader`` and evaluate against it."""
+
+    @pytest.mark.asyncio
+    async def test_load_csv_and_evaluate(self, tmp_path, prompt):
+        """A two-row CSV loads as two examples; matching replies all score 1.0."""
+        csv_file = tmp_path / "eval.csv"
+        # Explicit encoding so the fixture file does not depend on the locale default.
+        csv_file.write_text(
+            "input,expected_output\nWhat is 2+2?,4\nWhat color is grass?,green\n",
+            encoding="utf-8",
+        )
+
+        loader = FileDatasetLoader()
+        dataset = loader.load(str(csv_file))
+        assert len(dataset) == 2
+
+        executor = AsyncMock(spec=LLMPort)
+        executor.execute.side_effect = ["4", "green"]
+
+        evaluator = GroundTruthEvaluator(
+            executor=executor,
+            similarity=ExactMatchSimilarity(),
+        )
+        result = await evaluator.evaluate(prompt, dataset)
+        # all() is vacuously true for an empty sequence, so pin the score count first.
+        assert len(result.scores) == len(dataset)
+        assert all(s == 1.0 for s in result.scores)
+
+    @pytest.mark.asyncio
+    async def test_load_json_and_evaluate(self, tmp_path, prompt):
+        """A two-item JSON file loads as two examples; the exact reply outscores the padded one."""
+        json_file = tmp_path / "eval.json"
+        data = [
+            {"input": "What is 2+2?", "expected_output": "4"},
+            {"input": "What color is grass?", "expected_output": "green"},
+        ]
+        # Explicit encoding so the fixture file does not depend on the locale default.
+        json_file.write_text(json.dumps(data), encoding="utf-8")
+
+        loader = FileDatasetLoader()
+        dataset = loader.load(str(json_file))
+        assert len(dataset) == 2
+
+        executor = AsyncMock(spec=LLMPort)
+        executor.execute.side_effect = ["4", "not green"]
+
+        evaluator = GroundTruthEvaluator(
+            executor=executor,
+            similarity=create_similarity_adapter("bleu"),
+        )
+        result = await evaluator.evaluate(prompt, dataset)
+        # Fail with a clear message, not an IndexError, if a score is missing.
+        assert len(result.scores) == len(dataset)
+        # First item should score well, second poorly
+        assert result.scores[0] > result.scores[1]
+
+
+class TestMetricComparison:
+    """Run every similarity metric over the same mocked replies and compare mean scores."""
+
+    @pytest.mark.asyncio
+    async def test_metrics_give_different_scores(self, qa_dataset, prompt):
+        """Exact match yields a zero mean; BLEU, ROUGE-L and cosine yield a positive mean."""
+        replies = (
+            "Paris is the capital of France.",
+            "The answer is 4.",
+            "The sky is blue.",
+        )
+        metric_classes = {
+            "exact": ExactMatchSimilarity,
+            "bleu": BleuSimilarity,
+            "rouge_l": RougeLSimilarity,
+            "cosine": CosineSimilarity,
+        }
+
+        mean_by_metric = {}
+        for label, similarity_cls in metric_classes.items():
+            # Fresh mock per metric so each run replays the replies from the start.
+            llm = AsyncMock(spec=LLMPort)
+            llm.execute.side_effect = list(replies)
+            evaluator = GroundTruthEvaluator(executor=llm, similarity=similarity_cls())
+            outcome = await evaluator.evaluate(prompt, qa_dataset)
+            mean_by_metric[label] = outcome.mean_score
+
+        # Exact match should be 0 (no exact matches)
+        assert mean_by_metric["exact"] == 0.0
+        # All other metrics should give partial credit
+        for label in ("bleu", "rouge_l", "cosine"):
+            assert mean_by_metric[label] > 0.0