feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage
Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes 2 integration tests that broke when the codebase went async (DSPyLLMAdapter and full pipeline tests now properly await coroutines). 277 tests pass (260 unit + 17 integration). Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
133
tests/unit/test_ground_truth_evaluator.py
Normal file
133
tests/unit/test_ground_truth_evaluator.py
Normal file
@@ -0,0 +1,133 @@
|
||||
"""Tests for GroundTruthEvaluator — execution + similarity comparison."""
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import AsyncMock
|
||||
|
||||
import pytest
|
||||
|
||||
from prometheus.application.ground_truth_evaluator import GroundTruthEvaluator
|
||||
from prometheus.domain.entities import EvalResult, GroundTruthExample, Prompt
|
||||
from prometheus.domain.ports import LLMPort, SimilarityPort
|
||||
|
||||
|
||||
@pytest.fixture
def mock_executor() -> AsyncMock:
    """Provide an ``LLMPort``-spec'd async mock whose ``execute`` returns a canned answer."""
    executor = AsyncMock(spec=LLMPort)
    executor.execute.return_value = "Paris is the capital of France."
    return executor
|
||||
|
||||
|
||||
@pytest.fixture
def mock_similarity() -> AsyncMock:
    """Provide a ``SimilarityPort``-spec'd async mock whose ``compute`` returns 0.85."""
    similarity = AsyncMock(spec=SimilarityPort)
    similarity.compute.return_value = 0.85
    return similarity
|
||||
|
||||
|
||||
@pytest.fixture
def gt_dataset() -> list[GroundTruthExample]:
    """Provide three question/answer ground-truth examples with ids 0, 1 and 2."""
    qa_pairs = (
        ("What is the capital of France?", "Paris"),
        ("What is 2+2?", "4"),
        ("What color is the sky?", "blue"),
    )
    # Ids follow the position of each pair in the table above.
    return [
        GroundTruthExample(input_text=question, expected_output=answer, id=index)
        for index, (question, answer) in enumerate(qa_pairs)
    ]
|
||||
|
||||
|
||||
@pytest.fixture
def prompt() -> Prompt:
    """Provide the prompt under evaluation, shared by every test in this module."""
    instruction = "Answer the following question accurately."
    return Prompt(text=instruction)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
class TestGroundTruthEvaluator:
    """Tests for ``GroundTruthEvaluator.evaluate`` driven by mocked ports.

    The executor and similarity ports are ``AsyncMock`` objects, so these
    tests exercise only the evaluator's own orchestration and the shape of
    the ``EvalResult`` it returns.
    """

    @staticmethod
    def _build(executor, similarity, **options) -> GroundTruthEvaluator:
        """Construct the evaluator under test from the given ports.

        Extra keyword arguments (e.g. ``max_concurrency``) are forwarded
        unchanged to the ``GroundTruthEvaluator`` constructor.
        """
        return GroundTruthEvaluator(executor=executor, similarity=similarity, **options)

    @staticmethod
    def _fixed_similarity(value: float) -> AsyncMock:
        """Return a ``SimilarityPort`` mock whose ``compute`` always yields *value*."""
        port = AsyncMock(spec=SimilarityPort)
        port.compute.return_value = value
        return port

    async def test_evaluate_happy_path(self, mock_executor, mock_similarity, gt_dataset, prompt):
        """One score, feedback and trajectory per example, plus the aggregates."""
        evaluator = self._build(mock_executor, mock_similarity, max_concurrency=2)
        result = await evaluator.evaluate(prompt, gt_dataset)

        assert isinstance(result, EvalResult)
        assert len(result.scores) == 3
        assert len(result.feedbacks) == 3
        assert len(result.trajectories) == 3
        # Compare floats with a tolerance, like the aggregate checks below.
        assert all(s == pytest.approx(0.85) for s in result.scores)
        assert result.mean_score == pytest.approx(0.85)
        assert result.total_score == pytest.approx(2.55)

    async def test_executor_called_for_each_input(self, mock_executor, mock_similarity, gt_dataset, prompt):
        """The executor is invoked once per dataset example."""
        evaluator = self._build(mock_executor, mock_similarity)
        await evaluator.evaluate(prompt, gt_dataset)
        assert mock_executor.execute.call_count == 3

    async def test_similarity_called_for_each_output(self, mock_executor, mock_similarity, gt_dataset, prompt):
        """The similarity port is invoked once per dataset example."""
        evaluator = self._build(mock_executor, mock_similarity)
        await evaluator.evaluate(prompt, gt_dataset)
        assert mock_similarity.compute.call_count == 3

    async def test_execution_error_produces_zero_score(self, mock_similarity, gt_dataset, prompt):
        """Executor failures do not abort evaluation.

        Every example still yields a float score, and each trajectory's
        output carries the ``[execution error:`` marker.

        NOTE(review): despite the test name, no zero score is asserted here;
        the similarity mock returns 0.85 for any input. Confirm the intended
        contract before tightening this test.
        """
        failing_executor = AsyncMock(spec=LLMPort)
        failing_executor.execute.side_effect = RuntimeError("API timeout")

        evaluator = self._build(failing_executor, mock_similarity)
        result = await evaluator.evaluate(prompt, gt_dataset)

        assert len(result.scores) == 3
        # The similarity adapter is called with the error sentinel
        assert all(isinstance(s, float) for s in result.scores)
        assert all("[execution error:" in t.output_text for t in result.trajectories)

    async def test_empty_dataset(self, mock_executor, mock_similarity, prompt):
        """An empty dataset yields no scores and zero aggregates."""
        evaluator = self._build(mock_executor, mock_similarity)
        result = await evaluator.evaluate(prompt, [])
        assert result.scores == []
        assert result.mean_score == 0.0
        assert result.total_score == 0.0

    async def test_trajectory_contains_prompt_used(self, mock_executor, mock_similarity, gt_dataset, prompt):
        """Each trajectory records the text of the prompt that was evaluated."""
        evaluator = self._build(mock_executor, mock_similarity)
        result = await evaluator.evaluate(prompt, gt_dataset)
        for t in result.trajectories:
            assert t.prompt_used == prompt.text

    async def test_scores_clamped_to_unit_range(self, mock_executor, gt_dataset, prompt):
        """Scores stay within [0.0, 1.0] even when similarity reports more."""
        # Similarity returns a value > 1.0 (should be clamped)
        over_similarity = self._fixed_similarity(1.5)

        evaluator = self._build(mock_executor, over_similarity)
        result = await evaluator.evaluate(prompt, gt_dataset)
        assert all(0.0 <= s <= 1.0 for s in result.scores)

    async def test_feedback_for_exact_match(self, mock_executor, gt_dataset, prompt):
        """A similarity of 1.0 produces feedback containing "Exact match"."""
        exact_similarity = self._fixed_similarity(1.0)

        evaluator = self._build(mock_executor, exact_similarity)
        result = await evaluator.evaluate(prompt, gt_dataset)
        assert all("Exact match" in fb for fb in result.feedbacks)

    async def test_feedback_for_poor_match(self, mock_executor, gt_dataset, prompt):
        """A similarity of 0.1 produces feedback containing "Poor match"."""
        poor_similarity = self._fixed_similarity(0.1)

        evaluator = self._build(mock_executor, poor_similarity)
        result = await evaluator.evaluate(prompt, gt_dataset)
        assert all("Poor match" in fb for fb in result.feedbacks)
|
||||
Reference in New Issue
Block a user