Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes 2 integration tests that broke when the codebase went async (DSPyLLMAdapter and full pipeline tests now properly await coroutines). 277 tests pass (260 unit + 17 integration). Co-Authored-By: Paperclip <noreply@paperclip.ing>
134 lines
5.2 KiB
Python
134 lines
5.2 KiB
Python
"""Tests for GroundTruthEvaluator — execution + similarity comparison."""
|
|
from __future__ import annotations
|
|
|
|
from unittest.mock import AsyncMock
|
|
|
|
import pytest
|
|
|
|
from prometheus.application.ground_truth_evaluator import GroundTruthEvaluator
|
|
from prometheus.domain.entities import EvalResult, GroundTruthExample, Prompt
|
|
from prometheus.domain.ports import LLMPort, SimilarityPort
|
|
|
|
|
|
@pytest.fixture
def mock_executor() -> AsyncMock:
    """Return an ``AsyncMock`` spec'd on ``LLMPort`` with a canned ``execute`` result."""
    executor = AsyncMock(spec=LLMPort)
    # Every call to ``execute`` resolves to the same fixed answer.
    executor.execute.return_value = "Paris is the capital of France."
    return executor
|
|
|
|
|
|
@pytest.fixture
def mock_similarity() -> AsyncMock:
    """Return an ``AsyncMock`` spec'd on ``SimilarityPort`` whose ``compute`` yields 0.85."""
    scorer = AsyncMock(spec=SimilarityPort)
    # A fixed similarity keeps the aggregate assertions in the tests deterministic.
    scorer.compute.return_value = 0.85
    return scorer
|
|
|
|
|
|
@pytest.fixture
def gt_dataset() -> list[GroundTruthExample]:
    """Return three ground-truth examples with sequential ids starting at 0."""
    qa_pairs = (
        ("What is the capital of France?", "Paris"),
        ("What is 2+2?", "4"),
        ("What color is the sky?", "blue"),
    )
    return [
        GroundTruthExample(input_text=question, expected_output=answer, id=index)
        for index, (question, answer) in enumerate(qa_pairs)
    ]
|
|
|
|
|
|
@pytest.fixture
def prompt() -> Prompt:
    """Return the single ``Prompt`` instance shared by the evaluator tests."""
    instruction = "Answer the following question accurately."
    return Prompt(text=instruction)
|
|
|
|
|
|
@pytest.mark.asyncio
class TestGroundTruthEvaluator:
    """Async tests that drive ``GroundTruthEvaluator.evaluate`` with mocked ports."""

    @staticmethod
    def _make_evaluator(executor, similarity, **options):
        """Build the evaluator under test from the given ports and extra keyword options."""
        return GroundTruthEvaluator(executor=executor, similarity=similarity, **options)

    async def test_evaluate_happy_path(self, mock_executor, mock_similarity, gt_dataset, prompt):
        """Each example yields a score, a feedback and a trajectory; aggregates add up."""
        evaluator = self._make_evaluator(mock_executor, mock_similarity, max_concurrency=2)
        outcome = await evaluator.evaluate(prompt, gt_dataset)

        assert isinstance(outcome, EvalResult)
        # One entry per dataset example in each per-example collection.
        for per_example in (outcome.scores, outcome.feedbacks, outcome.trajectories):
            assert len(per_example) == 3
        for score in outcome.scores:
            assert score == 0.85
        assert outcome.mean_score == pytest.approx(0.85)
        assert outcome.total_score == pytest.approx(2.55)

    async def test_executor_called_for_each_input(self, mock_executor, mock_similarity, gt_dataset, prompt):
        """``execute`` is invoked once per dataset example."""
        evaluator = self._make_evaluator(mock_executor, mock_similarity)
        await evaluator.evaluate(prompt, gt_dataset)

        execute_calls = mock_executor.execute.call_count
        assert execute_calls == 3

    async def test_similarity_called_for_each_output(self, mock_executor, mock_similarity, gt_dataset, prompt):
        """``compute`` is invoked once per dataset example."""
        evaluator = self._make_evaluator(mock_executor, mock_similarity)
        await evaluator.evaluate(prompt, gt_dataset)

        compute_calls = mock_similarity.compute.call_count
        assert compute_calls == 3

    async def test_execution_error_produces_zero_score(self, mock_similarity, gt_dataset, prompt):
        """A raising executor still yields one float score and an error-tagged trajectory per example."""
        broken_executor = AsyncMock(spec=LLMPort)
        broken_executor.execute.side_effect = RuntimeError("API timeout")

        evaluator = self._make_evaluator(broken_executor, mock_similarity)
        outcome = await evaluator.evaluate(prompt, gt_dataset)

        assert len(outcome.scores) == 3
        # The similarity adapter is called with the error sentinel.
        # NOTE(review): despite the test name, no zero score is asserted here;
        # only that each score is a float -- confirm the intended contract.
        for score in outcome.scores:
            assert isinstance(score, float)
        for trajectory in outcome.trajectories:
            assert "[execution error:" in trajectory.output_text

    async def test_empty_dataset(self, mock_executor, mock_similarity, prompt):
        """An empty dataset produces no scores and zero-valued aggregates."""
        evaluator = self._make_evaluator(mock_executor, mock_similarity)
        outcome = await evaluator.evaluate(prompt, [])

        assert outcome.scores == []
        assert outcome.mean_score == 0.0
        assert outcome.total_score == 0.0

    async def test_trajectory_contains_prompt_used(self, mock_executor, mock_similarity, gt_dataset, prompt):
        """Every trajectory records the text of the prompt that was evaluated."""
        evaluator = self._make_evaluator(mock_executor, mock_similarity)
        outcome = await evaluator.evaluate(prompt, gt_dataset)

        assert all(trajectory.prompt_used == prompt.text for trajectory in outcome.trajectories)

    async def test_scores_clamped_to_unit_range(self, mock_executor, gt_dataset, prompt):
        """A similarity above 1.0 must not leak out of the [0.0, 1.0] range."""
        # Similarity returns a value > 1.0 (should be clamped).
        inflated_similarity = AsyncMock(spec=SimilarityPort)
        inflated_similarity.compute.return_value = 1.5

        evaluator = self._make_evaluator(mock_executor, inflated_similarity)
        outcome = await evaluator.evaluate(prompt, gt_dataset)

        for score in outcome.scores:
            assert 0.0 <= score <= 1.0

    async def test_feedback_for_exact_match(self, mock_executor, gt_dataset, prompt):
        """A similarity of 1.0 is reported as "Exact match" in every feedback."""
        perfect_similarity = AsyncMock(spec=SimilarityPort)
        perfect_similarity.compute.return_value = 1.0

        evaluator = self._make_evaluator(mock_executor, perfect_similarity)
        outcome = await evaluator.evaluate(prompt, gt_dataset)

        for feedback in outcome.feedbacks:
            assert "Exact match" in feedback

    async def test_feedback_for_poor_match(self, mock_executor, gt_dataset, prompt):
        """A similarity of 0.1 is reported as "Poor match" in every feedback."""
        weak_similarity = AsyncMock(spec=SimilarityPort)
        weak_similarity.compute.return_value = 0.1

        evaluator = self._make_evaluator(mock_executor, weak_similarity)
        outcome = await evaluator.evaluate(prompt, gt_dataset)

        for feedback in outcome.feedbacks:
            assert "Poor match" in feedback
|