Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes 2 integration tests that broke when the codebase went async (DSPyLLMAdapter and full pipeline tests now properly await coroutines). 277 tests pass (260 unit + 17 integration). Co-Authored-By: Paperclip <noreply@paperclip.ing>
200 lines
6.7 KiB
Python
200 lines
6.7 KiB
Python
"""Integration test — ground-truth evaluation end-to-end with real similarity metrics."""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import json
|
|
|
|
import pytest
|
|
from unittest.mock import AsyncMock
|
|
|
|
from prometheus.application.ground_truth_evaluator import GroundTruthEvaluator
|
|
from prometheus.domain.entities import GroundTruthExample, Prompt
|
|
from prometheus.domain.ports import LLMPort
|
|
from prometheus.infrastructure.dataset_loader import FileDatasetLoader
|
|
from prometheus.infrastructure.similarity import (
|
|
BleuSimilarity,
|
|
CosineSimilarity,
|
|
ExactMatchSimilarity,
|
|
RougeLSimilarity,
|
|
create_similarity_adapter,
|
|
)
|
|
|
|
|
|
def _make_dataset(items: list[tuple[str, str]]) -> list[GroundTruthExample]:
    """Turn ``(input, expected_output)`` pairs into ground-truth examples.

    Each example's ``id`` is the zero-based position of its pair in *items*.
    """
    examples = []
    for position, pair in enumerate(items):
        question, answer = pair
        examples.append(
            GroundTruthExample(input_text=question, expected_output=answer, id=position)
        )
    return examples
|
|
|
|
|
|
@pytest.fixture
def qa_dataset():
    """Provide three short question/answer pairs as ground-truth examples."""
    question_answer_pairs = [
        ("What is the capital of France?", "Paris"),
        ("What is 2+2?", "4"),
        ("What color is the sky?", "blue"),
    ]
    return _make_dataset(question_answer_pairs)
|
|
|
|
|
|
@pytest.fixture
def prompt():
    """Provide the prompt that is passed to the evaluator in these tests."""
    instruction = "Answer the following question concisely."
    return Prompt(text=instruction)
|
|
|
|
|
|
@pytest.fixture
def mock_executor():
    """Provide a mocked LLM port whose replies only partially match the ground truth.

    The replies are full sentences that contain the expected answers of
    ``qa_dataset`` without being equal to them.
    """
    canned_replies = [
        "Paris is the capital of France.",
        "The answer is 4.",
        "The sky is blue.",
    ]
    llm = AsyncMock(spec=LLMPort)
    # An iterable side_effect hands out one item per call, in order.
    llm.execute.side_effect = canned_replies
    return llm
|
|
|
|
|
|
class TestGroundTruthIntegrationWithExactMatch:
    """Run the evaluator end-to-end with the exact-match similarity metric."""

    @pytest.mark.asyncio
    async def test_exact_match_on_qa(self, mock_executor, qa_dataset, prompt):
        """Sentence-style replies differ from the expected strings, so all scores are 0.0."""
        scorer = ExactMatchSimilarity()
        evaluator = GroundTruthEvaluator(executor=mock_executor, similarity=scorer)

        outcome = await evaluator.evaluate(prompt, qa_dataset)

        # No reply is identical to its expected output.
        assert all(score == 0.0 for score in outcome.scores)

    @pytest.mark.asyncio
    async def test_exact_match_with_exact_outputs(self, qa_dataset, prompt):
        """Replies identical to the expected strings must all score 1.0."""
        verbatim_llm = AsyncMock(spec=LLMPort)
        verbatim_llm.execute.side_effect = ["Paris", "4", "blue"]
        scorer = ExactMatchSimilarity()
        evaluator = GroundTruthEvaluator(executor=verbatim_llm, similarity=scorer)

        outcome = await evaluator.evaluate(prompt, qa_dataset)

        assert all(score == 1.0 for score in outcome.scores)
|
|
|
|
|
|
class TestGroundTruthIntegrationWithBleu:
    """Run the evaluator end-to-end with the BLEU similarity metric."""

    @pytest.mark.asyncio
    async def test_bleu_scores_partial_match(self, mock_executor, qa_dataset, prompt):
        """Partially matching replies must score strictly between 0 and 1."""
        scorer = BleuSimilarity()
        evaluator = GroundTruthEvaluator(executor=mock_executor, similarity=scorer)

        outcome = await evaluator.evaluate(prompt, qa_dataset)

        assert all(0.0 < score < 1.0 for score in outcome.scores)
        assert outcome.mean_score > 0.0

    @pytest.mark.asyncio
    async def test_bleu_perfect_match(self, qa_dataset, prompt):
        """Replies identical to the expected strings must all score above zero."""
        verbatim_llm = AsyncMock(spec=LLMPort)
        verbatim_llm.execute.side_effect = ["Paris", "4", "blue"]
        scorer = BleuSimilarity()
        evaluator = GroundTruthEvaluator(executor=verbatim_llm, similarity=scorer)

        outcome = await evaluator.evaluate(prompt, qa_dataset)

        assert all(score > 0.0 for score in outcome.scores)
|
|
|
|
|
|
class TestGroundTruthIntegrationWithRouge:
    """Run the evaluator end-to-end with the ROUGE-L similarity metric."""

    @pytest.mark.asyncio
    async def test_rouge_l_scores(self, mock_executor, qa_dataset, prompt):
        """Partially matching replies must all receive a positive score."""
        scorer = RougeLSimilarity()
        evaluator = GroundTruthEvaluator(executor=mock_executor, similarity=scorer)

        outcome = await evaluator.evaluate(prompt, qa_dataset)

        assert all(score > 0.0 for score in outcome.scores)
|
|
|
|
|
|
class TestGroundTruthIntegrationWithCosine:
    """Run the evaluator end-to-end with the cosine similarity metric."""

    @pytest.mark.asyncio
    async def test_cosine_scores(self, mock_executor, qa_dataset, prompt):
        """Partially matching replies must all receive a positive score."""
        scorer = CosineSimilarity()
        evaluator = GroundTruthEvaluator(executor=mock_executor, similarity=scorer)

        outcome = await evaluator.evaluate(prompt, qa_dataset)

        assert all(score > 0.0 for score in outcome.scores)
|
|
|
|
|
|
class TestDatasetLoaderIntegration:
    """Load a dataset from a file with FileDatasetLoader, then evaluate against it."""

    @pytest.mark.asyncio
    async def test_load_csv_and_evaluate(self, tmp_path, prompt):
        """Two rows loaded from CSV score 1.0 when the replies match exactly."""
        csv_path = tmp_path / "eval.csv"
        csv_path.write_text("input,expected_output\nWhat is 2+2?,4\nWhat color is grass?,green\n")

        examples = FileDatasetLoader().load(str(csv_path))
        assert len(examples) == 2

        llm = AsyncMock(spec=LLMPort)
        llm.execute.side_effect = ["4", "green"]

        scorer = ExactMatchSimilarity()
        evaluator = GroundTruthEvaluator(executor=llm, similarity=scorer)
        outcome = await evaluator.evaluate(prompt, examples)

        assert all(score == 1.0 for score in outcome.scores)

    @pytest.mark.asyncio
    async def test_load_json_and_evaluate(self, tmp_path, prompt):
        """Two records loaded from JSON rank an exact reply above a noisy one under BLEU."""
        records = [
            {"input": "What is 2+2?", "expected_output": "4"},
            {"input": "What color is grass?", "expected_output": "green"},
        ]
        json_path = tmp_path / "eval.json"
        json_path.write_text(json.dumps(records))

        examples = FileDatasetLoader().load(str(json_path))
        assert len(examples) == 2

        llm = AsyncMock(spec=LLMPort)
        llm.execute.side_effect = ["4", "not green"]

        scorer = create_similarity_adapter("bleu")
        evaluator = GroundTruthEvaluator(executor=llm, similarity=scorer)
        outcome = await evaluator.evaluate(prompt, examples)

        # The verbatim first reply should outscore the padded second reply.
        first_score, second_score = outcome.scores[0], outcome.scores[1]
        assert first_score > second_score
|
|
|
|
|
|
class TestMetricComparison:
    """Compare different metrics on the same outputs to ensure they behave differently."""

    @pytest.mark.asyncio
    async def test_metrics_give_different_scores(self, qa_dataset, prompt):
        """Exact match has a mean score of 0.0; BLEU, ROUGE-L and cosine are positive."""
        replies = (
            "Paris is the capital of France.",
            "The answer is 4.",
            "The sky is blue.",
        )
        metric_classes = {
            "exact": ExactMatchSimilarity,
            "bleu": BleuSimilarity,
            "rouge_l": RougeLSimilarity,
            "cosine": CosineSimilarity,
        }

        mean_scores = {}
        for label, similarity_cls in metric_classes.items():
            # Each metric gets a fresh mock so the replies start from the top again.
            llm = AsyncMock(spec=LLMPort)
            llm.execute.side_effect = list(replies)
            evaluator = GroundTruthEvaluator(executor=llm, similarity=similarity_cls())
            outcome = await evaluator.evaluate(prompt, qa_dataset)
            mean_scores[label] = outcome.mean_score

        # Exact match should be 0 (no exact matches)
        assert mean_scores["exact"] == 0.0
        # All other metrics should give partial credit
        assert mean_scores["bleu"] > 0.0
        assert mean_scores["rouge_l"] > 0.0
        assert mean_scores["cosine"] > 0.0
|