feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage

Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes
2 integration tests that broke when the codebase went async (DSPyLLMAdapter
and full pipeline tests now properly await coroutines).

277 tests pass (260 unit + 17 integration).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
FullStackDev
2026-03-29 19:13:50 +00:00
parent b9745566c8
commit a5bf2ad59c
43 changed files with 5007 additions and 358 deletions

View File

@@ -0,0 +1,133 @@
"""Tests for similarity adapters — exact, BLEU, ROUGE-L, cosine."""
from __future__ import annotations
import pytest
from prometheus.infrastructure.similarity import (
BleuSimilarity,
CosineSimilarity,
ExactMatchSimilarity,
RougeLSimilarity,
create_similarity_adapter,
)
class TestExactMatchSimilarity:
    """Checks for the exact-match similarity adapter."""

    def test_exact_match(self):
        """Two identical strings are expected to score 1.0."""
        matcher = ExactMatchSimilarity()
        score = matcher.compute("Hello World", "Hello World")
        assert score == 1.0

    def test_case_insensitive(self):
        """Strings that differ only in letter case are expected to score 1.0."""
        matcher = ExactMatchSimilarity()
        score = matcher.compute("hello world", "HELLO WORLD")
        assert score == 1.0

    def test_whitespace_trimmed(self):
        """Surrounding whitespace on the prediction must not prevent a match."""
        matcher = ExactMatchSimilarity()
        score = matcher.compute(" hello ", "hello")
        assert score == 1.0

    def test_no_match(self):
        """Completely different strings are expected to score 0.0."""
        matcher = ExactMatchSimilarity()
        score = matcher.compute("hello", "world")
        assert score == 0.0

    def test_partial_no_match(self):
        """A prediction that merely contains the expected text still scores 0.0."""
        matcher = ExactMatchSimilarity()
        score = matcher.compute("hello world", "hello")
        assert score == 0.0
class TestBleuSimilarity:
    """Checks for the BLEU similarity adapter."""

    def test_perfect_match(self):
        """Identical sentences are expected to score 1.0."""
        bleu = BleuSimilarity()
        sentence = "the cat sat on the mat"
        # Compare with a tolerance so the test does not depend on
        # bit-exact floating-point arithmetic inside the adapter.
        assert bleu.compute(sentence, sentence) == pytest.approx(1.0)

    def test_no_overlap(self):
        """Sentences sharing no tokens are expected to score 0.0."""
        bleu = BleuSimilarity()
        assert bleu.compute("aaa bbb ccc", "ddd eee fff") == 0.0

    def test_partial_overlap(self):
        """A partially matching prediction scores strictly between 0 and 1."""
        bleu = BleuSimilarity()
        score = bleu.compute("the cat sat", "the cat")
        assert 0.0 < score < 1.0

    def test_empty_prediction(self):
        """An empty prediction is expected to score 0.0."""
        bleu = BleuSimilarity()
        assert bleu.compute("", "hello world") == 0.0

    def test_empty_expected(self):
        """An empty expected text is expected to score 0.0."""
        bleu = BleuSimilarity()
        assert bleu.compute("hello world", "") == 0.0

    def test_both_empty(self):
        """Two empty strings are expected to score 0.0, not a perfect match."""
        bleu = BleuSimilarity()
        assert bleu.compute("", "") == 0.0

    def test_shorter_prediction_gets_brevity_penalty(self):
        """A one-word prediction must score below the full matching sentence."""
        bleu = BleuSimilarity()
        reference = "the cat sat on the mat"
        short = bleu.compute("cat", reference)
        full = bleu.compute(reference, reference)
        assert short < full
class TestRougeLSimilarity:
    """Checks for the ROUGE-L similarity adapter."""

    def test_perfect_match(self):
        """Identical sentences are expected to score 1.0."""
        rouge = RougeLSimilarity()
        # Compare with a tolerance so the test does not depend on
        # bit-exact floating-point arithmetic inside the adapter.
        assert rouge.compute("the cat sat", "the cat sat") == pytest.approx(1.0)

    def test_no_overlap(self):
        """Sentences sharing no tokens are expected to score 0.0."""
        rouge = RougeLSimilarity()
        assert rouge.compute("aaa bbb", "ccc ddd") == 0.0

    def test_partial_overlap(self):
        """A partially matching prediction scores strictly between 0 and 1."""
        rouge = RougeLSimilarity()
        score = rouge.compute("the cat sat on the mat", "the cat on the rug")
        assert 0.0 < score < 1.0

    def test_empty_prediction(self):
        """An empty prediction is expected to score 0.0."""
        rouge = RougeLSimilarity()
        assert rouge.compute("", "hello") == 0.0

    def test_subsequence(self):
        """A prediction that is a subsequence of the reference scores above 0."""
        rouge = RougeLSimilarity()
        # "cat mat" is a subsequence of "the cat sat on the mat"
        score = rouge.compute("cat mat", "the cat sat on the mat")
        assert score > 0.0
class TestCosineSimilarity:
    """Checks for the cosine similarity adapter."""

    def test_identical_texts(self):
        """Identical texts are expected to score approximately 1.0."""
        cosine = CosineSimilarity()
        score = cosine.compute("hello world", "hello world")
        assert score == pytest.approx(1.0)

    def test_no_overlap(self):
        """Texts sharing no tokens are expected to score 0.0."""
        cosine = CosineSimilarity()
        score = cosine.compute("aaa bbb", "ccc ddd")
        assert score == 0.0

    def test_partial_overlap(self):
        """Texts sharing some tokens score strictly between 0 and 1."""
        cosine = CosineSimilarity()
        partial = cosine.compute("hello world foo", "hello world bar")
        assert 0.0 < partial < 1.0

    def test_empty_prediction(self):
        """An empty prediction is expected to score 0.0."""
        cosine = CosineSimilarity()
        score = cosine.compute("", "hello")
        assert score == 0.0
class TestCreateSimilarityAdapter:
    """Checks that the factory maps metric names to adapter classes."""

    def test_create_exact(self):
        """The name "exact" yields an ExactMatchSimilarity instance."""
        assert isinstance(create_similarity_adapter("exact"), ExactMatchSimilarity)

    def test_create_bleu(self):
        """The name "bleu" yields a BleuSimilarity instance."""
        assert isinstance(create_similarity_adapter("bleu"), BleuSimilarity)

    def test_create_rouge_l(self):
        """The name "rouge_l" yields a RougeLSimilarity instance."""
        assert isinstance(create_similarity_adapter("rouge_l"), RougeLSimilarity)

    def test_create_cosine(self):
        """The name "cosine" yields a CosineSimilarity instance."""
        assert isinstance(create_similarity_adapter("cosine"), CosineSimilarity)

    def test_unknown_metric_raises(self):
        """An unrecognised metric name must raise ValueError with a clear message."""
        expected_error = pytest.raises(ValueError, match="Unknown eval metric")
        with expected_error:
            create_similarity_adapter("nonexistent")