feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage

Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes two integration tests that broke when the codebase went async: the DSPyLLMAdapter and full-pipeline tests now properly await coroutines. 277 tests pass (260 unit + 17 integration).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-29 19:13:50 +00:00
parent b9745566c8
commit a5bf2ad59c
43 changed files with 5007 additions and 358 deletions
--- a/tests/unit/test_similarity.py
+++ b/tests/unit/test_similarity.py
@@ -0,0 +1,133 @@
+"""Tests for similarity adapters — exact, BLEU, ROUGE-L, cosine."""
+from __future__ import annotations
+
+import pytest
+
+from prometheus.infrastructure.similarity import (
+    BleuSimilarity,
+    CosineSimilarity,
+    ExactMatchSimilarity,
+    RougeLSimilarity,
+    create_similarity_adapter,
+)
+
+
+class TestExactMatchSimilarity:  # ExactMatchSimilarity.compute: these tests only ever expect 1.0 or 0.0
+    def test_exact_match(self):  # identical strings score 1.0
+        s = ExactMatchSimilarity()
+        assert s.compute("Hello World", "Hello World") == 1.0
+
+    def test_case_insensitive(self):  # strings differing only in letter case still score 1.0
+        s = ExactMatchSimilarity()
+        assert s.compute("hello world", "HELLO WORLD") == 1.0
+
+    def test_whitespace_trimmed(self):  # leading/trailing spaces on the first argument do not break the match
+        s = ExactMatchSimilarity()
+        assert s.compute("  hello  ", "hello") == 1.0
+
+    def test_no_match(self):  # entirely different words score 0.0
+        s = ExactMatchSimilarity()
+        assert s.compute("hello", "world") == 0.0
+
+    def test_partial_no_match(self):  # one string being a prefix of the other earns no partial credit
+        s = ExactMatchSimilarity()
+        assert s.compute("hello world", "hello") == 0.0
+
+
+class TestBleuSimilarity:  # BleuSimilarity.compute(prediction, expected): score bounds and empty-input handling
+    def test_perfect_match(self):  # identical sentences must score exactly 1.0
+        s = BleuSimilarity()
+        assert s.compute("the cat sat on the mat", "the cat sat on the mat") == 1.0
+
+    def test_no_overlap(self):  # inputs sharing no words score 0.0
+        s = BleuSimilarity()
+        assert s.compute("aaa bbb ccc", "ddd eee fff") == 0.0
+
+    def test_partial_overlap(self):  # some shared words: score must fall strictly between 0 and 1
+        s = BleuSimilarity()
+        score = s.compute("the cat sat", "the cat")
+        assert 0.0 < score < 1.0
+
+    def test_empty_prediction(self):  # empty first argument scores 0.0
+        s = BleuSimilarity()
+        assert s.compute("", "hello world") == 0.0
+
+    def test_empty_expected(self):  # empty second argument scores 0.0
+        s = BleuSimilarity()
+        assert s.compute("hello world", "") == 0.0
+
+    def test_both_empty(self):  # two empty strings score 0.0, not 1.0
+        s = BleuSimilarity()
+        assert s.compute("", "") == 0.0
+
+    def test_shorter_prediction_gets_brevity_penalty(self):  # only checks the ordering short < full, not a penalty value
+        s = BleuSimilarity()
+        short = s.compute("cat", "the cat sat on the mat")
+        full = s.compute("the cat sat on the mat", "the cat sat on the mat")
+        assert short < full
+
+
+class TestRougeLSimilarity:  # RougeLSimilarity.compute: score bounds, empty prediction, subsequence credit
+    def test_perfect_match(self):  # identical sentences must score exactly 1.0
+        s = RougeLSimilarity()
+        assert s.compute("the cat sat", "the cat sat") == 1.0
+
+    def test_no_overlap(self):  # inputs sharing no words score 0.0
+        s = RougeLSimilarity()
+        assert s.compute("aaa bbb", "ccc ddd") == 0.0
+
+    def test_partial_overlap(self):  # some shared words: score must fall strictly between 0 and 1
+        s = RougeLSimilarity()
+        score = s.compute("the cat sat on the mat", "the cat on the rug")
+        assert 0.0 < score < 1.0
+
+    def test_empty_prediction(self):  # empty first argument scores 0.0
+        s = RougeLSimilarity()
+        assert s.compute("", "hello") == 0.0
+
+    def test_subsequence(self):  # non-contiguous in-order words must earn some credit; only > 0.0 is asserted
+        s = RougeLSimilarity()
+        # "cat mat" is a subsequence of "the cat sat on the mat"
+        score = s.compute("cat mat", "the cat sat on the mat")
+        assert score > 0.0
+
+
+class TestCosineSimilarity:  # CosineSimilarity.compute: score bounds and empty-prediction handling
+    def test_identical_texts(self):  # compared via pytest.approx rather than == (presumably for float rounding)
+        s = CosineSimilarity()
+        assert s.compute("hello world", "hello world") == pytest.approx(1.0)
+
+    def test_no_overlap(self):  # inputs sharing no words must score exactly 0.0
+        s = CosineSimilarity()
+        assert s.compute("aaa bbb", "ccc ddd") == 0.0
+
+    def test_partial_overlap(self):  # two of three words shared: score strictly between 0 and 1
+        s = CosineSimilarity()
+        score = s.compute("hello world foo", "hello world bar")
+        assert 0.0 < score < 1.0
+
+    def test_empty_prediction(self):  # empty first argument scores 0.0
+        s = CosineSimilarity()
+        assert s.compute("", "hello") == 0.0
+
+
+class TestCreateSimilarityAdapter:  # create_similarity_adapter: metric name -> adapter class
+    def test_create_exact(self):  # "exact" yields an ExactMatchSimilarity instance
+        adapter = create_similarity_adapter("exact")
+        assert isinstance(adapter, ExactMatchSimilarity)
+
+    def test_create_bleu(self):  # "bleu" yields a BleuSimilarity instance
+        adapter = create_similarity_adapter("bleu")
+        assert isinstance(adapter, BleuSimilarity)
+
+    def test_create_rouge_l(self):  # "rouge_l" yields a RougeLSimilarity instance
+        adapter = create_similarity_adapter("rouge_l")
+        assert isinstance(adapter, RougeLSimilarity)
+
+    def test_create_cosine(self):  # "cosine" yields a CosineSimilarity instance
+        adapter = create_similarity_adapter("cosine")
+        assert isinstance(adapter, CosineSimilarity)
+
+    def test_unknown_metric_raises(self):  # unknown name raises ValueError whose message matches "Unknown eval metric"
+        with pytest.raises(ValueError, match="Unknown eval metric"):
+            create_similarity_adapter("nonexistent")