Files
Prompt-optimizer/tests/unit/test_evolution.py
FullStackDev c92ca4a2b8 feat: async/parallel execution with configurable concurrency
Parallelize LLM calls across minibatches to reduce wall-clock time.
All domain ports (LLMPort, JudgePort, ProposerPort) are now async.
Adapter implementations wrap synchronous DSPy calls with asyncio.to_thread.
Judge calls run in parallel within a batch using asyncio.gather + semaphore.
Evaluator parallelizes minibatch execution with configurable concurrency.
Evolution loop and use case are fully async. Proposer stays sequential.
Added --max-concurrency CLI flag and max_concurrency YAML config field.
Added async_retry_with_backoff for async error handling.
All 139 unit tests pass.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-29 13:15:34 +00:00

153 lines
5.4 KiB
Python

"""Unit tests for the evolution loop — with full mocking."""
from __future__ import annotations
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from prometheus.application.bootstrap import SyntheticBootstrap
from prometheus.application.evaluator import PromptEvaluator
from prometheus.application.evolution import EvolutionLoop
from prometheus.domain.entities import EvalResult, Prompt, SyntheticExample, Trajectory
class TestEvolutionLoop:
    """Tests for ``EvolutionLoop.run`` with evaluator, proposer and bootstrap mocked.

    In every test ``PromptEvaluator.evaluate`` is replaced by an ``AsyncMock``
    that returns canned ``EvalResult`` objects, and the bootstrap's
    ``sample_minibatch`` always returns the first five synthetic examples, so
    no LLM or judge call is made.
    """

    # Size of the mocked minibatch; also passed to the loop as ``minibatch_size``.
    _MINIBATCH_SIZE = 5

    @staticmethod
    def _make_eval(
        scores: list[float],
        feedback: str,
        *,
        with_trajectories: bool = True,
    ) -> EvalResult:
        """Build an ``EvalResult`` with one identical feedback string per score.

        Args:
            scores: Per-example scores; copied so each result owns its list.
            feedback: Feedback text repeated for every example.
            with_trajectories: When true, attach one ``Trajectory`` per score;
                when false, leave ``trajectories`` empty.

        Returns:
            A freshly constructed ``EvalResult``.
        """
        trajectories = []
        if with_trajectories:
            trajectories = [
                Trajectory(f"input{i}", f"output{i}", score, feedback, "prompt")
                for i, score in enumerate(scores)
            ]
        return EvalResult(
            scores=list(scores),
            feedbacks=[feedback] * len(scores),
            trajectories=trajectories,
        )

    @classmethod
    def _make_loop(
        cls,
        evaluator: PromptEvaluator,
        proposer: AsyncMock,
        synthetic_pool: list[SyntheticExample],
        *,
        max_iterations: int,
    ) -> EvolutionLoop:
        """Create an ``EvolutionLoop`` wired to a mocked ``SyntheticBootstrap``.

        The bootstrap mock always hands back the first ``_MINIBATCH_SIZE``
        examples of ``synthetic_pool`` from ``sample_minibatch``.
        """
        bootstrap = MagicMock(spec=SyntheticBootstrap)
        bootstrap.sample_minibatch.return_value = synthetic_pool[: cls._MINIBATCH_SIZE]
        return EvolutionLoop(
            evaluator=evaluator,
            proposer=proposer,
            bootstrap=bootstrap,
            max_iterations=max_iterations,
            minibatch_size=cls._MINIBATCH_SIZE,
        )

    @pytest.mark.asyncio
    async def test_accepts_improvement(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: AsyncMock,
        mock_judge_port: AsyncMock,
        mock_proposer_port: AsyncMock,
    ) -> None:
        """When the new prompt improves the score, the best candidate is updated."""
        low_scores = [0.3, 0.4, 0.3, 0.5, 0.2]
        # Two separate objects with identical content, as the loop receives
        # them from two distinct evaluate() calls.
        initial_eval = self._make_eval(low_scores, "bad")
        old_eval = self._make_eval(low_scores, "bad")
        new_eval = self._make_eval(
            [0.8, 0.9, 0.7, 0.8, 0.9], "good", with_trajectories=False
        )

        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
        # side_effect yields one result per evaluate() call, in this order.
        evaluator.evaluate = AsyncMock(side_effect=[initial_eval, old_eval, new_eval])
        loop = self._make_loop(
            evaluator, mock_proposer_port, synthetic_pool, max_iterations=1
        )

        with patch.object(loop, "_log"):
            state = await loop.run(seed_prompt, synthetic_pool, task_description)

        assert state.best_candidate is not None
        assert state.best_candidate.best_score > 0
        # A positive score is also true of the seed, so it cannot prove that
        # the improvement was accepted; check that the seed was replaced.
        # NOTE(review): assumes the mock_proposer_port fixture proposes text
        # different from the seed (test_rejects_regression relies on the same
        # assumption) - confirm in conftest.
        assert state.best_candidate.prompt.text != seed_prompt.text

    @pytest.mark.asyncio
    async def test_rejects_regression(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: AsyncMock,
        mock_judge_port: AsyncMock,
        mock_proposer_port: AsyncMock,
    ) -> None:
        """When the new prompt degrades the score, the best candidate stays unchanged."""
        high_scores = [0.7, 0.8, 0.7, 0.8, 0.9]
        initial_eval = self._make_eval(high_scores, "ok")
        old_eval = self._make_eval(high_scores, "ok")
        new_eval = self._make_eval(
            [0.2, 0.1, 0.3, 0.2, 0.1], "bad", with_trajectories=False
        )

        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
        # side_effect yields one result per evaluate() call, in this order.
        evaluator.evaluate = AsyncMock(side_effect=[initial_eval, old_eval, new_eval])
        loop = self._make_loop(
            evaluator, mock_proposer_port, synthetic_pool, max_iterations=1
        )

        with patch.object(loop, "_log"):
            state = await loop.run(seed_prompt, synthetic_pool, task_description)

        assert state.best_candidate is not None
        assert state.best_candidate.prompt.text == seed_prompt.text

    @pytest.mark.asyncio
    async def test_skips_perfect_scores(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: AsyncMock,
        mock_judge_port: AsyncMock,
        mock_proposer_port: AsyncMock,
    ) -> None:
        """When all scores are perfect, no proposition is made."""
        perfect_eval = self._make_eval([1.0] * self._MINIBATCH_SIZE, "perfect")

        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
        # return_value (not side_effect): every evaluate() call sees perfect scores.
        evaluator.evaluate = AsyncMock(return_value=perfect_eval)
        loop = self._make_loop(
            evaluator, mock_proposer_port, synthetic_pool, max_iterations=3
        )

        with patch.object(loop, "_log"):
            await loop.run(seed_prompt, synthetic_pool, task_description)

        mock_proposer_port.propose.assert_not_called()