"""Unit tests for GroundTruthEvaluator: prompt execution plus similarity scoring."""

from __future__ import annotations

from unittest.mock import AsyncMock

import pytest

from prometheus.application.ground_truth_evaluator import GroundTruthEvaluator
from prometheus.domain.entities import EvalResult, GroundTruthExample, Prompt
from prometheus.domain.ports import LLMPort, SimilarityPort

# Similarity value returned by the default ``mock_similarity`` fixture.
_DEFAULT_SIMILARITY = 0.85


def _similarity_returning(score: float) -> AsyncMock:
    """Build a SimilarityPort mock whose ``compute`` always yields *score*."""
    stub = AsyncMock(spec=SimilarityPort)
    stub.compute.return_value = score
    return stub


def _build_evaluator(executor, similarity, **options) -> GroundTruthEvaluator:
    """Construct the evaluator under test, forwarding any extra keyword options."""
    return GroundTruthEvaluator(executor=executor, similarity=similarity, **options)


@pytest.fixture
def mock_executor() -> AsyncMock:
    """LLMPort mock whose ``execute`` returns a fixed answer string."""
    llm = AsyncMock(spec=LLMPort)
    llm.execute.return_value = "Paris is the capital of France."
    return llm


@pytest.fixture
def mock_similarity() -> AsyncMock:
    """SimilarityPort mock whose ``compute`` returns 0.85."""
    return _similarity_returning(_DEFAULT_SIMILARITY)


@pytest.fixture
def gt_dataset() -> list[GroundTruthExample]:
    """Three question/answer examples with ids 0, 1 and 2."""
    question_answer_pairs = (
        ("What is the capital of France?", "Paris"),
        ("What is 2+2?", "4"),
        ("What color is the sky?", "blue"),
    )
    return [
        GroundTruthExample(input_text=question, expected_output=answer, id=index)
        for index, (question, answer) in enumerate(question_answer_pairs)
    ]


@pytest.fixture
def prompt() -> Prompt:
    """Prompt entity passed to every ``evaluate`` call in this module."""
    return Prompt(text="Answer the following question accurately.")


@pytest.mark.asyncio
class TestGroundTruthEvaluator:
    """Behavioural tests for ``GroundTruthEvaluator.evaluate`` using mocked ports."""

    async def test_evaluate_happy_path(self, mock_executor, mock_similarity, gt_dataset, prompt):
        """Three examples produce three scores, feedbacks and trajectories."""
        evaluator = _build_evaluator(mock_executor, mock_similarity, max_concurrency=2)

        result = await evaluator.evaluate(prompt, gt_dataset)

        assert isinstance(result, EvalResult)
        assert len(result.scores) == 3
        assert len(result.feedbacks) == 3
        assert len(result.trajectories) == 3
        # Every score is the value handed back by the similarity mock.
        for score in result.scores:
            assert score == _DEFAULT_SIMILARITY
        assert result.mean_score == pytest.approx(0.85)
        assert result.total_score == pytest.approx(2.55)

    async def test_executor_called_for_each_input(self, mock_executor, mock_similarity, gt_dataset, prompt):
        """The executor mock is awaited once per dataset example."""
        evaluator = _build_evaluator(mock_executor, mock_similarity)

        await evaluator.evaluate(prompt, gt_dataset)

        assert mock_executor.execute.call_count == 3

    async def test_similarity_called_for_each_output(self, mock_executor, mock_similarity, gt_dataset, prompt):
        """The similarity mock is awaited once per dataset example."""
        evaluator = _build_evaluator(mock_executor, mock_similarity)

        await evaluator.evaluate(prompt, gt_dataset)

        assert mock_similarity.compute.call_count == 3

    async def test_execution_error_produces_zero_score(self, mock_similarity, gt_dataset, prompt):
        """An executor that raises still yields one float score per example."""
        broken_executor = AsyncMock(spec=LLMPort)
        broken_executor.execute.side_effect = RuntimeError("API timeout")
        evaluator = _build_evaluator(broken_executor, mock_similarity)

        result = await evaluator.evaluate(prompt, gt_dataset)

        assert len(result.scores) == 3
        # The similarity adapter is called with the error sentinel.
        # NOTE(review): despite the test name, only the score *type* is checked
        # here; the similarity mock returns 0.85 -- confirm the intended value.
        for score in result.scores:
            assert isinstance(score, float)
        for trajectory in result.trajectories:
            assert "[execution error:" in trajectory.output_text

    async def test_empty_dataset(self, mock_executor, mock_similarity, prompt):
        """An empty dataset gives no scores and zero mean/total."""
        evaluator = _build_evaluator(mock_executor, mock_similarity)

        result = await evaluator.evaluate(prompt, [])

        assert result.scores == []
        assert result.mean_score == 0.0
        assert result.total_score == 0.0

    async def test_trajectory_contains_prompt_used(self, mock_executor, mock_similarity, gt_dataset, prompt):
        """Each trajectory records the text of the prompt that was evaluated."""
        evaluator = _build_evaluator(mock_executor, mock_similarity)

        result = await evaluator.evaluate(prompt, gt_dataset)

        assert all(
            trajectory.prompt_used == prompt.text
            for trajectory in result.trajectories
        )

    async def test_scores_clamped_to_unit_range(self, mock_executor, gt_dataset, prompt):
        """Scores stay within [0, 1] even when similarity reports more than 1."""
        # Similarity returns a value > 1.0 (should be clamped).
        over_similarity = _similarity_returning(1.5)
        evaluator = _build_evaluator(mock_executor, over_similarity)

        result = await evaluator.evaluate(prompt, gt_dataset)

        for score in result.scores:
            assert 0.0 <= score <= 1.0

    async def test_feedback_for_exact_match(self, mock_executor, gt_dataset, prompt):
        """A similarity of 1.0 yields feedback mentioning "Exact match"."""
        exact_similarity = _similarity_returning(1.0)
        evaluator = _build_evaluator(mock_executor, exact_similarity)

        result = await evaluator.evaluate(prompt, gt_dataset)

        for feedback in result.feedbacks:
            assert "Exact match" in feedback

    async def test_feedback_for_poor_match(self, mock_executor, gt_dataset, prompt):
        """A similarity of 0.1 yields feedback mentioning "Poor match"."""
        poor_similarity = _similarity_returning(0.1)
        evaluator = _build_evaluator(mock_executor, poor_similarity)

        result = await evaluator.evaluate(prompt, gt_dataset)

        for feedback in result.feedbacks:
            assert "Poor match" in feedback