feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage

Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes
2 integration tests that broke when the codebase went async (DSPyLLMAdapter
and full pipeline tests now properly await coroutines).

277 tests pass (260 unit + 17 integration).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
FullStackDev
2026-03-29 19:13:50 +00:00
parent b9745566c8
commit a5bf2ad59c
43 changed files with 5007 additions and 358 deletions

View File

@@ -0,0 +1,133 @@
"""Tests for GroundTruthEvaluator — execution + similarity comparison."""
from __future__ import annotations
from unittest.mock import AsyncMock
import pytest
from prometheus.application.ground_truth_evaluator import GroundTruthEvaluator
from prometheus.domain.entities import EvalResult, GroundTruthExample, Prompt
from prometheus.domain.ports import LLMPort, SimilarityPort
@pytest.fixture
def mock_executor() -> AsyncMock:
    """Return an ``LLMPort``-spec'd mock whose ``execute`` yields a canned answer."""
    executor = AsyncMock(spec=LLMPort)
    executor.execute.return_value = "Paris is the capital of France."
    return executor
@pytest.fixture
def mock_similarity() -> AsyncMock:
    """Return a ``SimilarityPort``-spec'd mock whose ``compute`` always yields 0.85."""
    scorer = AsyncMock(spec=SimilarityPort)
    scorer.compute.return_value = 0.85
    return scorer
@pytest.fixture
def gt_dataset() -> list[GroundTruthExample]:
    """Return three question/answer ground-truth examples with ids 0, 1 and 2."""
    qa_pairs = (
        ("What is the capital of France?", "Paris"),
        ("What is 2+2?", "4"),
        ("What color is the sky?", "blue"),
    )
    # Ids follow the position of each pair in the table above.
    return [
        GroundTruthExample(input_text=question, expected_output=answer, id=index)
        for index, (question, answer) in enumerate(qa_pairs)
    ]
@pytest.fixture
def prompt() -> Prompt:
    """Return the single instruction prompt shared by every test in this module."""
    instruction = "Answer the following question accurately."
    return Prompt(text=instruction)
@pytest.mark.asyncio
class TestGroundTruthEvaluator:
    """Exercise ``GroundTruthEvaluator.evaluate`` against mocked executor/similarity ports."""

    @staticmethod
    def _build_evaluator(executor, similarity, **options) -> GroundTruthEvaluator:
        """Construct an evaluator wired to the given ports, forwarding extra keyword options."""
        return GroundTruthEvaluator(executor=executor, similarity=similarity, **options)

    @staticmethod
    def _fixed_similarity(score: float) -> AsyncMock:
        """Return a ``SimilarityPort``-spec'd mock whose ``compute`` always yields ``score``."""
        port = AsyncMock(spec=SimilarityPort)
        port.compute.return_value = score
        return port

    async def test_evaluate_happy_path(self, mock_executor, mock_similarity, gt_dataset, prompt):
        """Three examples produce three scores, feedbacks and trajectories, all scored 0.85."""
        evaluator = self._build_evaluator(mock_executor, mock_similarity, max_concurrency=2)

        outcome = await evaluator.evaluate(prompt, gt_dataset)

        assert isinstance(outcome, EvalResult)
        # One entry per dataset example in every per-example collection.
        for per_example in (outcome.scores, outcome.feedbacks, outcome.trajectories):
            assert len(per_example) == 3
        assert all(score == 0.85 for score in outcome.scores)
        assert outcome.mean_score == pytest.approx(0.85)
        assert outcome.total_score == pytest.approx(2.55)

    async def test_executor_called_for_each_input(self, mock_executor, mock_similarity, gt_dataset, prompt):
        """The executor's ``execute`` is invoked once per dataset example."""
        evaluator = self._build_evaluator(mock_executor, mock_similarity)

        await evaluator.evaluate(prompt, gt_dataset)

        assert mock_executor.execute.call_count == 3

    async def test_similarity_called_for_each_output(self, mock_executor, mock_similarity, gt_dataset, prompt):
        """The similarity port's ``compute`` is invoked once per dataset example."""
        evaluator = self._build_evaluator(mock_executor, mock_similarity)

        await evaluator.evaluate(prompt, gt_dataset)

        assert mock_similarity.compute.call_count == 3

    async def test_execution_error_produces_zero_score(self, mock_similarity, gt_dataset, prompt):
        """A raising executor still yields one float score and an error-marked trajectory per example.

        NOTE(review): despite the test name, no assertion here checks that the
        scores are zero -- only that they are floats. Confirm the intended
        contract against the evaluator before tightening.
        """
        broken_executor = AsyncMock(spec=LLMPort)
        broken_executor.execute.side_effect = RuntimeError("API timeout")
        evaluator = self._build_evaluator(broken_executor, mock_similarity)

        outcome = await evaluator.evaluate(prompt, gt_dataset)

        assert len(outcome.scores) == 3
        # The similarity adapter is called with the error sentinel
        assert all(isinstance(score, float) for score in outcome.scores)
        assert all("[execution error:" in step.output_text for step in outcome.trajectories)

    async def test_empty_dataset(self, mock_executor, mock_similarity, prompt):
        """An empty dataset gives no scores and zero mean/total."""
        evaluator = self._build_evaluator(mock_executor, mock_similarity)

        outcome = await evaluator.evaluate(prompt, [])

        assert outcome.scores == []
        assert outcome.mean_score == 0.0
        assert outcome.total_score == 0.0

    async def test_trajectory_contains_prompt_used(self, mock_executor, mock_similarity, gt_dataset, prompt):
        """Every trajectory records the text of the prompt that was evaluated."""
        evaluator = self._build_evaluator(mock_executor, mock_similarity)

        outcome = await evaluator.evaluate(prompt, gt_dataset)

        assert all(step.prompt_used == prompt.text for step in outcome.trajectories)

    async def test_scores_clamped_to_unit_range(self, mock_executor, gt_dataset, prompt):
        """A raw similarity above 1.0 must not leak out of the [0.0, 1.0] range."""
        # The similarity port reports 1.5, which is outside the unit range and should be clamped.
        evaluator = self._build_evaluator(mock_executor, self._fixed_similarity(1.5))

        outcome = await evaluator.evaluate(prompt, gt_dataset)

        assert all(0.0 <= score <= 1.0 for score in outcome.scores)

    async def test_feedback_for_exact_match(self, mock_executor, gt_dataset, prompt):
        """A similarity of 1.0 produces feedback mentioning "Exact match" for every example."""
        evaluator = self._build_evaluator(mock_executor, self._fixed_similarity(1.0))

        outcome = await evaluator.evaluate(prompt, gt_dataset)

        assert all("Exact match" in message for message in outcome.feedbacks)

    async def test_feedback_for_poor_match(self, mock_executor, gt_dataset, prompt):
        """A similarity of 0.1 produces feedback mentioning "Poor match" for every example."""
        evaluator = self._build_evaluator(mock_executor, self._fixed_similarity(0.1))

        outcome = await evaluator.evaluate(prompt, gt_dataset)

        assert all("Poor match" in message for message in outcome.feedbacks)