feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage
Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes two integration tests that broke when the codebase went async (the DSPyLLMAdapter and full-pipeline tests now properly await coroutines). 277 tests pass (260 unit + 17 integration).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
233
tests/unit/test_use_cases.py
Normal file
233
tests/unit/test_use_cases.py
Normal file
@@ -0,0 +1,233 @@
|
||||
"""Unit tests for OptimizePromptUseCase — direct orchestration tests."""
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from prometheus.application.bootstrap import SyntheticBootstrap
|
||||
from prometheus.application.dto import OptimizationConfig, OptimizationResult
|
||||
from prometheus.application.evaluator import PromptEvaluator
|
||||
from prometheus.application.evolution import EvolutionLoop
|
||||
from prometheus.application.use_cases import OptimizePromptUseCase
|
||||
from prometheus.domain.entities import (
|
||||
Candidate,
|
||||
EvalResult,
|
||||
OptimizationState,
|
||||
Prompt,
|
||||
SyntheticExample,
|
||||
Trajectory,
|
||||
)
|
||||
|
||||
|
||||
def _make_eval(scores: list[float]) -> EvalResult:
    """Build an ``EvalResult`` with one placeholder entry per score.

    Args:
        scores: Per-example scores to embed in the result.

    Returns:
        An ``EvalResult`` whose ``scores`` is the given list, whose
        ``feedbacks`` holds the literal ``"feedback"`` once per score, and
        whose ``trajectories`` holds one ``Trajectory`` per score built
        positionally from ``"in<i>"``, ``"out<i>"``, the score, ``"feedback"``
        and ``"prompt"``.
    """
    placeholder_feedbacks = ["feedback"] * len(scores)

    placeholder_trajectories = []
    for index, score in enumerate(scores):
        placeholder_trajectories.append(
            Trajectory(f"in{index}", f"out{index}", score, "feedback", "prompt")
        )

    return EvalResult(
        scores=scores,
        feedbacks=placeholder_feedbacks,
        trajectories=placeholder_trajectories,
    )
|
||||
|
||||
|
||||
def _make_state(
    iterations: int = 3,
    initial_score: float = 0.3,
    final_score: float = 0.8,
    accepted: bool = True,
) -> OptimizationState:
    """Fabricate an ``OptimizationState`` as if an evolution run had finished.

    Args:
        iterations: Number of iterations the fake run performed; also the
            number of history entries generated.
        initial_score: ``best_score`` of the seed candidate and the
            ``old_score`` recorded in every history entry.
        final_score: ``best_score`` of the best candidate and the
            ``new_score`` recorded in every history entry.
        accepted: When true, every history event is ``"accepted"`` and the
            best candidate is a distinct ``"optimized"`` prompt from the last
            generation. When false, every event is ``"rejected"``, the best
            candidate carries the ``"seed"`` text at generation 0, and only
            the seed is listed in ``candidates``.

    Returns:
        The populated ``OptimizationState``.
    """
    seed = Candidate(prompt=Prompt(text="seed"), best_score=initial_score, generation=0)
    best = Candidate(
        prompt=Prompt(text="optimized" if accepted else "seed"),
        best_score=final_score,
        generation=iterations if accepted else 0,
    )

    # The event label does not vary per iteration, so compute it once.
    event = "accepted" if accepted else "rejected"

    # Scores come from the parameters (not fixed literals) so the history
    # stays consistent with the candidates when callers override the scores.
    history = [
        {
            "iteration": i,
            "event": event,
            "old_score": initial_score,
            "new_score": final_score,
        }
        for i in range(1, iterations + 1)
    ]

    return OptimizationState(
        iteration=iterations,
        best_candidate=best,
        candidates=[seed, best] if accepted else [seed],
        # Deterministic function of ``iterations``; the rationale for the
        # constants 11 and 10 is not visible in this file.
        total_llm_calls=iterations * 11 + 10,
        history=history,
    )
|
||||
|
||||
|
||||
class TestOptimizePromptUseCaseExecute:
    """Tests for the execute() orchestration method.

    Every collaborator of the use case is a mock, and ``EvolutionLoop.run`` is
    patched to hand back a prepared ``OptimizationState``.
    """

    # ------------------------------------------------------------------ #
    # Fixtures
    # ------------------------------------------------------------------ #

    @pytest.fixture
    def mock_evaluator(self) -> MagicMock:
        """Return a mock restricted to the ``PromptEvaluator`` interface."""
        return MagicMock(spec=PromptEvaluator)

    @pytest.fixture
    def mock_proposer(self) -> MagicMock:
        """Return an unconstrained mock standing in for the proposer."""
        return MagicMock()

    @pytest.fixture
    def mock_bootstrap(self) -> MagicMock:
        """Return a mock restricted to the ``SyntheticBootstrap`` interface."""
        return MagicMock(spec=SyntheticBootstrap)

    @pytest.fixture
    def use_case(
        self,
        mock_evaluator: MagicMock,
        mock_proposer: MagicMock,
        mock_bootstrap: MagicMock,
    ) -> OptimizePromptUseCase:
        """Return the use case under test, wired to the three mocks."""
        return OptimizePromptUseCase(
            evaluator=mock_evaluator,
            proposer=mock_proposer,
            bootstrap=mock_bootstrap,
        )

    @pytest.fixture
    def config(self) -> OptimizationConfig:
        """Return the fixed optimization config shared by all tests."""
        return OptimizationConfig(
            seed_prompt="Answer the question.",
            task_description="Q&A task",
            max_iterations=5,
            n_synthetic_inputs=20,
            minibatch_size=5,
            seed=42,
        )

    # ------------------------------------------------------------------ #
    # Helpers
    # ------------------------------------------------------------------ #

    @staticmethod
    async def _execute_with_state(
        use_case: OptimizePromptUseCase,
        config: OptimizationConfig,
        state: OptimizationState,
    ) -> OptimizationResult:
        """Run ``execute`` while ``EvolutionLoop.run`` is patched to return *state*."""
        with patch.object(EvolutionLoop, "run", return_value=state):
            return await use_case.execute(config)

    # ------------------------------------------------------------------ #
    # Tests
    # ------------------------------------------------------------------ #

    @pytest.mark.asyncio
    async def test_returns_optimization_result(
        self,
        use_case: OptimizePromptUseCase,
        mock_bootstrap: MagicMock,
        config: OptimizationConfig,
    ) -> None:
        """The result carries the seed prompt, final score and improvement."""
        mock_bootstrap.run.return_value = [
            SyntheticExample(input_text=f"q{i}", id=i) for i in range(20)
        ]
        finished_state = _make_state(iterations=3, initial_score=0.3, final_score=0.9)

        outcome = await self._execute_with_state(use_case, config, finished_state)

        assert isinstance(outcome, OptimizationResult)
        assert outcome.initial_prompt == "Answer the question."
        assert outcome.final_score == 0.9
        assert outcome.improvement == pytest.approx(0.6)

    @pytest.mark.asyncio
    async def test_bootstrap_called_with_config_params(
        self,
        use_case: OptimizePromptUseCase,
        mock_bootstrap: MagicMock,
        config: OptimizationConfig,
    ) -> None:
        """Bootstrap receives the task description and example count from config."""
        mock_bootstrap.run.return_value = []
        finished_state = _make_state()

        await self._execute_with_state(use_case, config, finished_state)

        mock_bootstrap.run.assert_called_once_with(
            task_description="Q&A task",
            n_examples=20,
        )

    @pytest.mark.asyncio
    async def test_evolution_loop_configured_from_config(
        self,
        use_case: OptimizePromptUseCase,
        mock_bootstrap: MagicMock,
        config: OptimizationConfig,
    ) -> None:
        """``EvolutionLoop.run`` is called once with prompt, pool and task."""
        mock_bootstrap.run.return_value = []
        finished_state = _make_state()

        with patch.object(EvolutionLoop, "run", return_value=finished_state) as run_mock:
            await use_case.execute(config)

        # Verify run() was invoked exactly once with the expected positional
        # arguments. The patched attribute is a plain mock on the class, so
        # the loop instance itself is not part of the recorded arguments.
        run_mock.assert_called_once()
        positional = run_mock.call_args[0]

        seed_prompt = positional[0]
        assert seed_prompt.text == "Answer the question."

        synthetic_pool = positional[1]
        assert len(synthetic_pool) == 0  # bootstrap returned empty

        assert positional[2] == "Q&A task"

    @pytest.mark.asyncio
    async def test_total_llm_calls_includes_bootstrap_call(
        self,
        use_case: OptimizePromptUseCase,
        mock_bootstrap: MagicMock,
        config: OptimizationConfig,
    ) -> None:
        """The reported call count is the state's count plus one."""
        mock_bootstrap.run.return_value = []
        finished_state = _make_state(iterations=3)
        # total_llm_calls from state + 1 for bootstrap; captured before
        # execute() runs so the expectation reflects the prepared state.
        expected_calls = finished_state.total_llm_calls + 1

        outcome = await self._execute_with_state(use_case, config, finished_state)

        assert outcome.total_llm_calls == expected_calls

    @pytest.mark.asyncio
    async def test_no_candidates_fallback(
        self,
        use_case: OptimizePromptUseCase,
        mock_bootstrap: MagicMock,
        config: OptimizationConfig,
    ) -> None:
        """With no candidates, the seed prompt and zero scores are reported."""
        mock_bootstrap.run.return_value = [
            SyntheticExample(input_text=f"q{i}", id=i) for i in range(20)
        ]
        empty_state = OptimizationState(
            iteration=0,
            best_candidate=None,
            candidates=[],
            total_llm_calls=0,
        )

        outcome = await self._execute_with_state(use_case, config, empty_state)

        assert outcome.optimized_prompt == "Answer the question."
        assert outcome.initial_score == 0.0
        assert outcome.final_score == 0.0
        assert outcome.improvement == 0.0

    @pytest.mark.asyncio
    async def test_iterations_used_matches_state(
        self,
        use_case: OptimizePromptUseCase,
        mock_bootstrap: MagicMock,
        config: OptimizationConfig,
    ) -> None:
        """``iterations_used`` mirrors the state's iteration counter."""
        mock_bootstrap.run.return_value = []
        finished_state = _make_state(iterations=7)

        outcome = await self._execute_with_state(use_case, config, finished_state)

        assert outcome.iterations_used == 7

    @pytest.mark.asyncio
    async def test_history_passed_through(
        self,
        use_case: OptimizePromptUseCase,
        mock_bootstrap: MagicMock,
        config: OptimizationConfig,
    ) -> None:
        """The state's history compares equal to the result's history."""
        mock_bootstrap.run.return_value = []
        expected_history = [
            {"iteration": 1, "event": "accepted"},
            {"iteration": 2, "event": "rejected"},
        ]
        finished_state = _make_state()
        finished_state.history = expected_history

        outcome = await self._execute_with_state(use_case, config, finished_state)

        assert outcome.history == expected_history
|
||||
Reference in New Issue
Block a user