"""Unit tests for PromptEvaluator.evaluate().""" from __future__ import annotations from unittest.mock import MagicMock import pytest from prometheus.application.evaluator import PromptEvaluator from prometheus.domain.entities import EvalResult, Prompt, SyntheticExample, Trajectory from prometheus.domain.ports import JudgePort, LLMPort class TestPromptEvaluatorEvaluate: """Tests for the evaluate() pipeline: execute → judge → trajectories.""" @pytest.fixture def executor(self) -> MagicMock: return MagicMock(spec=LLMPort) @pytest.fixture def judge(self) -> MagicMock: return MagicMock(spec=JudgePort) @pytest.fixture def evaluator(self, executor: MagicMock, judge: MagicMock) -> PromptEvaluator: return PromptEvaluator(executor=executor, judge=judge) def test_happy_path_builds_correct_trajectories( self, evaluator: PromptEvaluator, executor: MagicMock, judge: MagicMock, ) -> None: prompt = Prompt(text="Answer the question.") examples = [ SyntheticExample(input_text="What is 2+2?", id=0), SyntheticExample(input_text="Capital of France?", id=1), ] executor.execute.side_effect = ["4", "Paris"] judge.judge_batch.return_value = [ (0.9, "Correct."), (0.8, "Mostly correct."), ] result = evaluator.evaluate(prompt, examples, "math and geography") assert isinstance(result, EvalResult) assert result.scores == [0.9, 0.8] assert result.feedbacks == ["Correct.", "Mostly correct."] assert len(result.trajectories) == 2 assert result.trajectories[0].input_text == "What is 2+2?" assert result.trajectories[0].output_text == "4" assert result.trajectories[0].score == 0.9 assert result.trajectories[0].feedback == "Correct." assert result.trajectories[0].prompt_used == "Answer the question." assert result.trajectories[1].prompt_used == "Answer the question." def test_empty_minibatch_returns_empty_result( self, evaluator: PromptEvaluator, executor: MagicMock, judge: MagicMock, ) -> None: prompt = Prompt(text="test") result = evaluator.evaluate(prompt, [], "task") assert result.scores == [] assert result.feedbacks == [] assert result.trajectories == [] executor.execute.assert_not_called() # judge_batch is called with empty pairs list judge.judge_batch.assert_called_once_with("task", []) def test_executor_called_with_correct_prompt( self, evaluator: PromptEvaluator, executor: MagicMock, judge: MagicMock, ) -> None: prompt = Prompt(text="Summarize this.") examples = [SyntheticExample(input_text="Long text here", id=0)] executor.execute.return_value = "Summary." judge.judge_batch.return_value = [(0.7, "Good summary.")] evaluator.evaluate(prompt, examples, "summarization") executor.execute.assert_called_once_with(prompt, "Long text here") def test_trajectories_prompt_used_matches_input_prompt( self, evaluator: PromptEvaluator, executor: MagicMock, judge: MagicMock, ) -> None: prompt = Prompt(text="Translate to French.") examples = [SyntheticExample(input_text="Hello", id=0)] executor.execute.return_value = "Bonjour" judge.judge_batch.return_value = [(1.0, "Perfect.")] result = evaluator.evaluate(prompt, examples, "translation") assert result.trajectories[0].prompt_used == "Translate to French." def test_scores_feedbacks_trajectories_lists_sized_correctly( self, evaluator: PromptEvaluator, executor: MagicMock, judge: MagicMock, ) -> None: prompt = Prompt(text="test prompt") examples = [SyntheticExample(input_text=f"q{i}", id=i) for i in range(4)] executor.execute.side_effect = [f"a{i}" for i in range(4)] judge.judge_batch.return_value = [ (0.1 * i, f"fb{i}") for i in range(4) ] result = evaluator.evaluate(prompt, examples, "task") assert len(result.scores) == 4 assert len(result.feedbacks) == 4 assert len(result.trajectories) == 4