feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage

Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes
2 integration tests that broke when the codebase went async (DSPyLLMAdapter
and full pipeline tests now properly await coroutines).

277 tests pass (260 unit + 17 integration).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
FullStackDev
2026-03-29 19:13:50 +00:00
parent b9745566c8
commit a5bf2ad59c
43 changed files with 5007 additions and 358 deletions

View File

@@ -0,0 +1,233 @@
"""Unit tests for OptimizePromptUseCase — direct orchestration tests."""
from __future__ import annotations
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from prometheus.application.bootstrap import SyntheticBootstrap
from prometheus.application.dto import OptimizationConfig, OptimizationResult
from prometheus.application.evaluator import PromptEvaluator
from prometheus.application.evolution import EvolutionLoop
from prometheus.application.use_cases import OptimizePromptUseCase
from prometheus.domain.entities import (
Candidate,
EvalResult,
OptimizationState,
Prompt,
SyntheticExample,
Trajectory,
)
def _make_eval(scores: list[float]) -> EvalResult:
    """Build an ``EvalResult`` holding one placeholder trajectory per score.

    Every score is paired with the literal feedback ``"feedback"``.
    Trajectory *i* is constructed positionally from ``"in{i}"``,
    ``"out{i}"``, the score itself, ``"feedback"`` and ``"prompt"``.
    """
    placeholder_feedbacks = ["feedback"] * len(scores)

    trajectories = []
    for index, score in enumerate(scores):
        trajectories.append(
            Trajectory(f"in{index}", f"out{index}", score, "feedback", "prompt")
        )

    return EvalResult(
        scores=scores,
        feedbacks=placeholder_feedbacks,
        trajectories=trajectories,
    )
def _make_state(
    iterations: int = 3,
    initial_score: float = 0.3,
    final_score: float = 0.8,
    accepted: bool = True,
) -> OptimizationState:
    """Build a canned ``OptimizationState`` for use as a patched loop result.

    Args:
        iterations: Value stored as ``iteration``; also the number of history
            entries generated (numbered from 1).
        initial_score: ``best_score`` of the seed candidate and ``old_score``
            of every history entry.
        final_score: ``best_score`` of the best candidate and ``new_score``
            of every history entry.
        accepted: When true, the best candidate has the text ``"optimized"``,
            generation ``iterations``, and is listed after the seed in
            ``candidates``; history events are ``"accepted"``. When false,
            the best candidate reuses the text ``"seed"`` with generation 0,
            only the seed is listed, and events are ``"rejected"``.

    Returns:
        The assembled state, with ``total_llm_calls`` set to
        ``iterations * 11 + 10``.
    """
    seed = Candidate(prompt=Prompt(text="seed"), best_score=initial_score, generation=0)
    best = Candidate(
        prompt=Prompt(text="optimized" if accepted else "seed"),
        best_score=final_score,
        generation=iterations if accepted else 0,
    )

    # The event label is the same for every iteration, so compute it once.
    event = "accepted" if accepted else "rejected"
    # Use the caller-supplied scores so the history stays consistent with the
    # candidates above (the defaults reproduce the former 0.3 -> 0.8 values).
    history = [
        {
            "iteration": i,
            "event": event,
            "old_score": initial_score,
            "new_score": final_score,
        }
        for i in range(1, iterations + 1)
    ]

    return OptimizationState(
        iteration=iterations,
        best_candidate=best,
        candidates=[seed, best] if accepted else [seed],
        total_llm_calls=iterations * 11 + 10,
        history=history,
    )
class TestOptimizePromptUseCaseExecute:
    """Tests for the execute() orchestration method.

    The evaluator, proposer and bootstrap are mocks and ``EvolutionLoop.run``
    is patched to return a canned ``OptimizationState``, so each test checks
    only how ``execute()`` forwards its inputs and maps the state onto the
    returned ``OptimizationResult``.
    """

    @pytest.fixture
    def mock_evaluator(self) -> MagicMock:
        """Mock constrained to the ``PromptEvaluator`` interface."""
        return MagicMock(spec=PromptEvaluator)

    @pytest.fixture
    def mock_proposer(self) -> MagicMock:
        """Unconstrained mock standing in for the proposer."""
        return MagicMock()

    @pytest.fixture
    def mock_bootstrap(self) -> MagicMock:
        """Mock constrained to the ``SyntheticBootstrap`` interface."""
        return MagicMock(spec=SyntheticBootstrap)

    @pytest.fixture
    def use_case(
        self,
        mock_evaluator: MagicMock,
        mock_proposer: MagicMock,
        mock_bootstrap: MagicMock,
    ) -> OptimizePromptUseCase:
        """Use case under test, wired with the three mocked collaborators."""
        collaborators = {
            "evaluator": mock_evaluator,
            "proposer": mock_proposer,
            "bootstrap": mock_bootstrap,
        }
        return OptimizePromptUseCase(**collaborators)

    @pytest.fixture
    def config(self) -> OptimizationConfig:
        """Configuration shared by every test in this class."""
        return OptimizationConfig(
            seed_prompt="Answer the question.",
            task_description="Q&A task",
            max_iterations=5,
            n_synthetic_inputs=20,
            minibatch_size=5,
            seed=42,
        )

    @staticmethod
    async def _execute_with_patched_loop(
        use_case: OptimizePromptUseCase,
        config: OptimizationConfig,
        state: OptimizationState,
    ) -> tuple[OptimizationResult, MagicMock | AsyncMock]:
        """Run ``execute(config)`` while ``EvolutionLoop.run`` returns *state*.

        Returns the result of ``execute()`` together with the mock that
        replaced ``EvolutionLoop.run`` so callers can inspect its calls.
        """
        with patch.object(EvolutionLoop, "run", return_value=state) as patched_run:
            outcome = await use_case.execute(config)
        return outcome, patched_run

    @pytest.mark.asyncio
    async def test_returns_optimization_result(
        self,
        use_case: OptimizePromptUseCase,
        mock_bootstrap: MagicMock,
        config: OptimizationConfig,
    ) -> None:
        """The result carries the seed prompt text, final score and improvement."""
        mock_bootstrap.run.return_value = [
            SyntheticExample(input_text=f"q{idx}", id=idx) for idx in range(20)
        ]
        canned_state = _make_state(iterations=3, initial_score=0.3, final_score=0.9)

        outcome, _ = await self._execute_with_patched_loop(use_case, config, canned_state)

        assert isinstance(outcome, OptimizationResult)
        assert outcome.initial_prompt == "Answer the question."
        assert outcome.final_score == 0.9
        assert outcome.improvement == pytest.approx(0.6)

    @pytest.mark.asyncio
    async def test_bootstrap_called_with_config_params(
        self,
        use_case: OptimizePromptUseCase,
        mock_bootstrap: MagicMock,
        config: OptimizationConfig,
    ) -> None:
        """Bootstrap is run once with the task description and example count."""
        mock_bootstrap.run.return_value = []
        canned_state = _make_state()

        await self._execute_with_patched_loop(use_case, config, canned_state)

        mock_bootstrap.run.assert_called_once_with(
            task_description="Q&A task",
            n_examples=20,
        )

    @pytest.mark.asyncio
    async def test_evolution_loop_configured_from_config(
        self,
        use_case: OptimizePromptUseCase,
        mock_bootstrap: MagicMock,
        config: OptimizationConfig,
    ) -> None:
        """``EvolutionLoop.run`` receives the seed prompt, pool and task text."""
        mock_bootstrap.run.return_value = []
        canned_state = _make_state()

        _, mock_run = await self._execute_with_patched_loop(
            use_case, config, canned_state
        )

        # run() must have been invoked exactly once; inspect its positional
        # arguments in order: seed prompt, synthetic pool, task description.
        mock_run.assert_called_once()
        positional = mock_run.call_args[0]
        assert positional[0].text == "Answer the question."
        assert len(positional[1]) == 0  # bootstrap returned empty
        assert positional[2] == "Q&A task"

    @pytest.mark.asyncio
    async def test_total_llm_calls_includes_bootstrap_call(
        self,
        use_case: OptimizePromptUseCase,
        mock_bootstrap: MagicMock,
        config: OptimizationConfig,
    ) -> None:
        """The reported call count is the state's count plus one."""
        mock_bootstrap.run.return_value = []
        canned_state = _make_state(iterations=3)
        # total_llm_calls from state + 1 for bootstrap; captured before
        # execute() runs so the expectation is fixed up front.
        expected_calls = canned_state.total_llm_calls + 1

        outcome, _ = await self._execute_with_patched_loop(use_case, config, canned_state)

        assert outcome.total_llm_calls == expected_calls

    @pytest.mark.asyncio
    async def test_no_candidates_fallback(
        self,
        use_case: OptimizePromptUseCase,
        mock_bootstrap: MagicMock,
        config: OptimizationConfig,
    ) -> None:
        """With no candidates, the seed prompt is returned and scores are 0.0."""
        mock_bootstrap.run.return_value = [
            SyntheticExample(input_text=f"q{idx}", id=idx) for idx in range(20)
        ]
        empty_state = OptimizationState(
            iteration=0,
            best_candidate=None,
            candidates=[],
            total_llm_calls=0,
        )

        outcome, _ = await self._execute_with_patched_loop(use_case, config, empty_state)

        assert outcome.optimized_prompt == "Answer the question."
        assert outcome.initial_score == 0.0
        assert outcome.final_score == 0.0
        assert outcome.improvement == 0.0

    @pytest.mark.asyncio
    async def test_iterations_used_matches_state(
        self,
        use_case: OptimizePromptUseCase,
        mock_bootstrap: MagicMock,
        config: OptimizationConfig,
    ) -> None:
        """``iterations_used`` mirrors the state's iteration counter."""
        mock_bootstrap.run.return_value = []
        canned_state = _make_state(iterations=7)

        outcome, _ = await self._execute_with_patched_loop(use_case, config, canned_state)

        assert outcome.iterations_used == 7

    @pytest.mark.asyncio
    async def test_history_passed_through(
        self,
        use_case: OptimizePromptUseCase,
        mock_bootstrap: MagicMock,
        config: OptimizationConfig,
    ) -> None:
        """The state's history is exposed on the result."""
        mock_bootstrap.run.return_value = []
        expected_history = [
            {"iteration": 1, "event": "accepted"},
            {"iteration": 2, "event": "rejected"},
        ]
        canned_state = _make_state()
        # Replace the generated history with a known, distinguishable one.
        canned_state.history = expected_history

        outcome, _ = await self._execute_with_patched_loop(use_case, config, canned_state)

        assert outcome.history == expected_history