feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage
Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes 2 integration tests that broke when the codebase went async (DSPyLLMAdapter and full pipeline tests now properly await coroutines). 277 tests pass (260 unit + 17 integration). Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
133
tests/unit/test_similarity.py
Normal file
133
tests/unit/test_similarity.py
Normal file
@@ -0,0 +1,133 @@
|
||||
"""Tests for similarity adapters — exact, BLEU, ROUGE-L, cosine."""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from prometheus.infrastructure.similarity import (
|
||||
BleuSimilarity,
|
||||
CosineSimilarity,
|
||||
ExactMatchSimilarity,
|
||||
RougeLSimilarity,
|
||||
create_similarity_adapter,
|
||||
)
|
||||
|
||||
|
||||
class TestExactMatchSimilarity:
    """Exercise ``ExactMatchSimilarity.compute`` on equal and unequal strings."""

    def test_exact_match(self):
        """Two identical strings are expected to score 1.0."""
        metric = ExactMatchSimilarity()
        result = metric.compute("Hello World", "Hello World")
        assert result == 1.0

    def test_case_insensitive(self):
        """Strings that differ only in letter case are expected to score 1.0."""
        metric = ExactMatchSimilarity()
        result = metric.compute("hello world", "HELLO WORLD")
        assert result == 1.0

    def test_whitespace_trimmed(self):
        """Surrounding whitespace on the prediction must not prevent a match."""
        metric = ExactMatchSimilarity()
        result = metric.compute(" hello ", "hello")
        assert result == 1.0

    def test_no_match(self):
        """Two unrelated words are expected to score 0.0."""
        metric = ExactMatchSimilarity()
        result = metric.compute("hello", "world")
        assert result == 0.0

    def test_partial_no_match(self):
        """A prediction that merely starts with the expected text scores 0.0."""
        metric = ExactMatchSimilarity()
        result = metric.compute("hello world", "hello")
        assert result == 0.0
|
||||
|
||||
|
||||
class TestBleuSimilarity:
    """Exercise ``BleuSimilarity.compute`` on matching, disjoint and empty inputs."""

    def test_perfect_match(self):
        """Identical sentences are expected to score 1.0.

        The score is a computed float, so it is compared with
        ``pytest.approx`` (as the cosine tests in this module do) rather
        than exact equality.
        """
        metric = BleuSimilarity()
        score = metric.compute("the cat sat on the mat", "the cat sat on the mat")
        assert score == pytest.approx(1.0)

    def test_no_overlap(self):
        """Sentences sharing no tokens are expected to score 0.0."""
        metric = BleuSimilarity()
        assert metric.compute("aaa bbb ccc", "ddd eee fff") == 0.0

    def test_partial_overlap(self):
        """A prediction sharing some tokens scores strictly between 0 and 1."""
        metric = BleuSimilarity()
        score = metric.compute("the cat sat", "the cat")
        assert 0.0 < score < 1.0

    def test_empty_prediction(self):
        """An empty prediction is expected to score 0.0."""
        metric = BleuSimilarity()
        assert metric.compute("", "hello world") == 0.0

    def test_empty_expected(self):
        """An empty expected string is expected to score 0.0."""
        metric = BleuSimilarity()
        assert metric.compute("hello world", "") == 0.0

    def test_both_empty(self):
        """Two empty strings are expected to score 0.0, not a perfect match."""
        metric = BleuSimilarity()
        assert metric.compute("", "") == 0.0

    def test_shorter_prediction_gets_brevity_penalty(self):
        """A one-word prediction must score below a full-sentence match.

        NOTE(review): per the test name this is presumably driven by BLEU's
        brevity penalty; the test itself only checks the ordering.
        """
        metric = BleuSimilarity()
        short = metric.compute("cat", "the cat sat on the mat")
        full = metric.compute("the cat sat on the mat", "the cat sat on the mat")
        assert short < full
|
||||
|
||||
|
||||
class TestRougeLSimilarity:
    """Exercise ``RougeLSimilarity.compute`` on matching, disjoint and partial inputs."""

    def test_perfect_match(self):
        """Identical sentences are expected to score 1.0.

        The score is a computed float, so it is compared with
        ``pytest.approx`` (as the cosine tests in this module do) rather
        than exact equality.
        """
        metric = RougeLSimilarity()
        score = metric.compute("the cat sat", "the cat sat")
        assert score == pytest.approx(1.0)

    def test_no_overlap(self):
        """Sentences sharing no tokens are expected to score 0.0."""
        metric = RougeLSimilarity()
        assert metric.compute("aaa bbb", "ccc ddd") == 0.0

    def test_partial_overlap(self):
        """Partially overlapping sentences score strictly between 0 and 1."""
        metric = RougeLSimilarity()
        score = metric.compute("the cat sat on the mat", "the cat on the rug")
        assert 0.0 < score < 1.0

    def test_empty_prediction(self):
        """An empty prediction is expected to score 0.0."""
        metric = RougeLSimilarity()
        assert metric.compute("", "hello") == 0.0

    def test_subsequence(self):
        """A non-contiguous subsequence of the expected text scores above 0."""
        metric = RougeLSimilarity()
        # "cat mat" is a subsequence of "the cat sat on the mat"
        score = metric.compute("cat mat", "the cat sat on the mat")
        assert score > 0.0
|
||||
|
||||
|
||||
class TestCosineSimilarity:
    """Exercise ``CosineSimilarity.compute`` on matching, disjoint and empty inputs."""

    def test_identical_texts(self):
        """Identical texts are expected to score approximately 1.0."""
        metric = CosineSimilarity()
        result = metric.compute("hello world", "hello world")
        assert result == pytest.approx(1.0)

    def test_no_overlap(self):
        """Texts sharing no tokens are expected to score 0.0."""
        metric = CosineSimilarity()
        result = metric.compute("aaa bbb", "ccc ddd")
        assert result == 0.0

    def test_partial_overlap(self):
        """Texts sharing some tokens score strictly between 0 and 1."""
        metric = CosineSimilarity()
        result = metric.compute("hello world foo", "hello world bar")
        assert 0.0 < result < 1.0

    def test_empty_prediction(self):
        """An empty prediction is expected to score 0.0."""
        metric = CosineSimilarity()
        result = metric.compute("", "hello")
        assert result == 0.0
|
||||
|
||||
|
||||
class TestCreateSimilarityAdapter:
    """Check that ``create_similarity_adapter`` maps metric names to adapter types."""

    def test_create_exact(self):
        """The name "exact" yields an ``ExactMatchSimilarity`` instance."""
        produced = create_similarity_adapter("exact")
        assert isinstance(produced, ExactMatchSimilarity)

    def test_create_bleu(self):
        """The name "bleu" yields a ``BleuSimilarity`` instance."""
        produced = create_similarity_adapter("bleu")
        assert isinstance(produced, BleuSimilarity)

    def test_create_rouge_l(self):
        """The name "rouge_l" yields a ``RougeLSimilarity`` instance."""
        produced = create_similarity_adapter("rouge_l")
        assert isinstance(produced, RougeLSimilarity)

    def test_create_cosine(self):
        """The name "cosine" yields a ``CosineSimilarity`` instance."""
        produced = create_similarity_adapter("cosine")
        assert isinstance(produced, CosineSimilarity)

    def test_unknown_metric_raises(self):
        """An unrecognised name raises ``ValueError`` mentioning the unknown metric."""
        with pytest.raises(ValueError, match="Unknown eval metric"):
            create_similarity_adapter("nonexistent")
|
||||
Reference in New Issue
Block a user