"""Integration test — ground-truth evaluation end-to-end with real similarity metrics.""" from __future__ import annotations import asyncio import json import pytest from unittest.mock import AsyncMock from prometheus.application.ground_truth_evaluator import GroundTruthEvaluator from prometheus.domain.entities import GroundTruthExample, Prompt from prometheus.domain.ports import LLMPort from prometheus.infrastructure.dataset_loader import FileDatasetLoader from prometheus.infrastructure.similarity import ( BleuSimilarity, CosineSimilarity, ExactMatchSimilarity, RougeLSimilarity, create_similarity_adapter, ) def _make_dataset(items: list[tuple[str, str]]) -> list[GroundTruthExample]: return [ GroundTruthExample(input_text=inp, expected_output=exp, id=i) for i, (inp, exp) in enumerate(items) ] @pytest.fixture def qa_dataset(): return _make_dataset([ ("What is the capital of France?", "Paris"), ("What is 2+2?", "4"), ("What color is the sky?", "blue"), ]) @pytest.fixture def prompt(): return Prompt(text="Answer the following question concisely.") @pytest.fixture def mock_executor(): """Returns responses that partially match the ground truth.""" port = AsyncMock(spec=LLMPort) port.execute.side_effect = [ "Paris is the capital of France.", "The answer is 4.", "The sky is blue.", ] return port class TestGroundTruthIntegrationWithExactMatch: @pytest.mark.asyncio async def test_exact_match_on_qa(self, mock_executor, qa_dataset, prompt): evaluator = GroundTruthEvaluator( executor=mock_executor, similarity=ExactMatchSimilarity(), ) result = await evaluator.evaluate(prompt, qa_dataset) # None of the outputs are exact matches with expected outputs assert all(s == 0.0 for s in result.scores) @pytest.mark.asyncio async def test_exact_match_with_exact_outputs(self, qa_dataset, prompt): exact_executor = AsyncMock(spec=LLMPort) exact_executor.execute.side_effect = ["Paris", "4", "blue"] evaluator = GroundTruthEvaluator( executor=exact_executor, similarity=ExactMatchSimilarity(), ) result = await evaluator.evaluate(prompt, qa_dataset) assert all(s == 1.0 for s in result.scores) class TestGroundTruthIntegrationWithBleu: @pytest.mark.asyncio async def test_bleu_scores_partial_match(self, mock_executor, qa_dataset, prompt): evaluator = GroundTruthEvaluator( executor=mock_executor, similarity=BleuSimilarity(), ) result = await evaluator.evaluate(prompt, qa_dataset) assert all(0.0 < s < 1.0 for s in result.scores) assert result.mean_score > 0.0 @pytest.mark.asyncio async def test_bleu_perfect_match(self, qa_dataset, prompt): perfect_executor = AsyncMock(spec=LLMPort) perfect_executor.execute.side_effect = ["Paris", "4", "blue"] evaluator = GroundTruthEvaluator( executor=perfect_executor, similarity=BleuSimilarity(), ) result = await evaluator.evaluate(prompt, qa_dataset) assert all(s > 0.0 for s in result.scores) class TestGroundTruthIntegrationWithRouge: @pytest.mark.asyncio async def test_rouge_l_scores(self, mock_executor, qa_dataset, prompt): evaluator = GroundTruthEvaluator( executor=mock_executor, similarity=RougeLSimilarity(), ) result = await evaluator.evaluate(prompt, qa_dataset) assert all(s > 0.0 for s in result.scores) class TestGroundTruthIntegrationWithCosine: @pytest.mark.asyncio async def test_cosine_scores(self, mock_executor, qa_dataset, prompt): evaluator = GroundTruthEvaluator( executor=mock_executor, similarity=CosineSimilarity(), ) result = await evaluator.evaluate(prompt, qa_dataset) assert all(s > 0.0 for s in result.scores) class TestDatasetLoaderIntegration: @pytest.mark.asyncio async def test_load_csv_and_evaluate(self, tmp_path, prompt): csv_file = tmp_path / "eval.csv" csv_file.write_text("input,expected_output\nWhat is 2+2?,4\nWhat color is grass?,green\n") loader = FileDatasetLoader() dataset = loader.load(str(csv_file)) assert len(dataset) == 2 executor = AsyncMock(spec=LLMPort) executor.execute.side_effect = ["4", "green"] evaluator = GroundTruthEvaluator( executor=executor, similarity=ExactMatchSimilarity(), ) result = await evaluator.evaluate(prompt, dataset) assert all(s == 1.0 for s in result.scores) @pytest.mark.asyncio async def test_load_json_and_evaluate(self, tmp_path, prompt): json_file = tmp_path / "eval.json" data = [ {"input": "What is 2+2?", "expected_output": "4"}, {"input": "What color is grass?", "expected_output": "green"}, ] json_file.write_text(json.dumps(data)) loader = FileDatasetLoader() dataset = loader.load(str(json_file)) assert len(dataset) == 2 executor = AsyncMock(spec=LLMPort) executor.execute.side_effect = ["4", "not green"] evaluator = GroundTruthEvaluator( executor=executor, similarity=create_similarity_adapter("bleu"), ) result = await evaluator.evaluate(prompt, dataset) # First item should score well, second poorly assert result.scores[0] > result.scores[1] class TestMetricComparison: """Compare different metrics on the same outputs to ensure they behave differently.""" @pytest.mark.asyncio async def test_metrics_give_different_scores(self, qa_dataset, prompt): results = {} for metric_name, metric_cls in [ ("exact", ExactMatchSimilarity), ("bleu", BleuSimilarity), ("rouge_l", RougeLSimilarity), ("cosine", CosineSimilarity), ]: executor = AsyncMock(spec=LLMPort) executor.execute.side_effect = [ "Paris is the capital of France.", "The answer is 4.", "The sky is blue.", ] evaluator = GroundTruthEvaluator( executor=executor, similarity=metric_cls(), ) result = await evaluator.evaluate(prompt, qa_dataset) results[metric_name] = result.mean_score # Exact match should be 0 (no exact matches) assert results["exact"] == 0.0 # All other metrics should give partial credit assert results["bleu"] > 0.0 assert results["rouge_l"] > 0.0 assert results["cosine"] > 0.0