Parallelize LLM calls across minibatches to reduce wall-clock time. All domain ports (LLMPort, JudgePort, ProposerPort) are now async. Adapter implementations wrap synchronous DSPy calls with asyncio.to_thread. Judge calls run in parallel within a batch using asyncio.gather + semaphore. Evaluator parallelizes minibatch execution with configurable concurrency. Evolution loop and use case are fully async. Proposer stays sequential. Added --max-concurrency CLI flag and max_concurrency YAML config field. Added async_retry_with_backoff for async error handling. All 139 unit tests pass. Co-Authored-By: Paperclip <noreply@paperclip.ing>
153 lines
5.4 KiB
Python
153 lines
5.4 KiB
Python
"""Unit tests for the evolution loop — with full mocking."""
|
|
from __future__ import annotations
|
|
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from prometheus.application.bootstrap import SyntheticBootstrap
|
|
from prometheus.application.evaluator import PromptEvaluator
|
|
from prometheus.application.evolution import EvolutionLoop
|
|
from prometheus.domain.entities import EvalResult, Prompt, SyntheticExample, Trajectory
|
|
|
|
|
|
class TestEvolutionLoop:
|
|
@pytest.mark.asyncio
|
|
async def test_accepts_improvement(
|
|
self,
|
|
seed_prompt: Prompt,
|
|
synthetic_pool: list[SyntheticExample],
|
|
task_description: str,
|
|
mock_llm_port: AsyncMock,
|
|
mock_judge_port: AsyncMock,
|
|
mock_proposer_port: AsyncMock,
|
|
) -> None:
|
|
"""When the new prompt improves the score, the best candidate is updated."""
|
|
evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
|
|
bootstrap = MagicMock(spec=SyntheticBootstrap)
|
|
bootstrap.sample_minibatch.return_value = synthetic_pool[:5]
|
|
|
|
initial_eval = EvalResult(
|
|
scores=[0.3, 0.4, 0.3, 0.5, 0.2],
|
|
feedbacks=["bad"] * 5,
|
|
trajectories=[
|
|
Trajectory(f"input{i}", f"output{i}", s, "bad", "prompt")
|
|
for i, s in enumerate([0.3, 0.4, 0.3, 0.5, 0.2])
|
|
],
|
|
)
|
|
old_eval = EvalResult(
|
|
scores=[0.3, 0.4, 0.3, 0.5, 0.2],
|
|
feedbacks=["bad"] * 5,
|
|
trajectories=[
|
|
Trajectory(f"input{i}", f"output{i}", s, "bad", "prompt")
|
|
for i, s in enumerate([0.3, 0.4, 0.3, 0.5, 0.2])
|
|
],
|
|
)
|
|
new_eval = EvalResult(
|
|
scores=[0.8, 0.9, 0.7, 0.8, 0.9],
|
|
feedbacks=["good"] * 5,
|
|
trajectories=[],
|
|
)
|
|
evaluator.evaluate = AsyncMock(side_effect=[initial_eval, old_eval, new_eval])
|
|
|
|
loop = EvolutionLoop(
|
|
evaluator=evaluator,
|
|
proposer=mock_proposer_port,
|
|
bootstrap=bootstrap,
|
|
max_iterations=1,
|
|
minibatch_size=5,
|
|
)
|
|
with patch.object(loop, "_log"):
|
|
state = await loop.run(seed_prompt, synthetic_pool, task_description)
|
|
|
|
assert state.best_candidate is not None
|
|
assert state.best_candidate.best_score > 0
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_rejects_regression(
|
|
self,
|
|
seed_prompt: Prompt,
|
|
synthetic_pool: list[SyntheticExample],
|
|
task_description: str,
|
|
mock_llm_port: AsyncMock,
|
|
mock_judge_port: AsyncMock,
|
|
mock_proposer_port: AsyncMock,
|
|
) -> None:
|
|
"""When the new prompt degrades the score, the best candidate stays unchanged."""
|
|
evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
|
|
bootstrap = MagicMock(spec=SyntheticBootstrap)
|
|
bootstrap.sample_minibatch.return_value = synthetic_pool[:5]
|
|
|
|
initial_eval = EvalResult(
|
|
scores=[0.7, 0.8, 0.7, 0.8, 0.9],
|
|
feedbacks=["ok"] * 5,
|
|
trajectories=[
|
|
Trajectory(f"input{i}", f"output{i}", s, "ok", "prompt")
|
|
for i, s in enumerate([0.7, 0.8, 0.7, 0.8, 0.9])
|
|
],
|
|
)
|
|
old_eval = EvalResult(
|
|
scores=[0.7, 0.8, 0.7, 0.8, 0.9],
|
|
feedbacks=["ok"] * 5,
|
|
trajectories=[
|
|
Trajectory(f"input{i}", f"output{i}", s, "ok", "prompt")
|
|
for i, s in enumerate([0.7, 0.8, 0.7, 0.8, 0.9])
|
|
],
|
|
)
|
|
new_eval = EvalResult(
|
|
scores=[0.2, 0.1, 0.3, 0.2, 0.1],
|
|
feedbacks=["bad"] * 5,
|
|
trajectories=[],
|
|
)
|
|
evaluator.evaluate = AsyncMock(side_effect=[initial_eval, old_eval, new_eval])
|
|
|
|
loop = EvolutionLoop(
|
|
evaluator=evaluator,
|
|
proposer=mock_proposer_port,
|
|
bootstrap=bootstrap,
|
|
max_iterations=1,
|
|
minibatch_size=5,
|
|
)
|
|
with patch.object(loop, "_log"):
|
|
state = await loop.run(seed_prompt, synthetic_pool, task_description)
|
|
|
|
assert state.best_candidate is not None
|
|
assert state.best_candidate.prompt.text == seed_prompt.text
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_skips_perfect_scores(
|
|
self,
|
|
seed_prompt: Prompt,
|
|
synthetic_pool: list[SyntheticExample],
|
|
task_description: str,
|
|
mock_llm_port: AsyncMock,
|
|
mock_judge_port: AsyncMock,
|
|
mock_proposer_port: AsyncMock,
|
|
) -> None:
|
|
"""When all scores are perfect, no proposition is made."""
|
|
evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
|
|
bootstrap = MagicMock(spec=SyntheticBootstrap)
|
|
bootstrap.sample_minibatch.return_value = synthetic_pool[:5]
|
|
|
|
perfect_eval = EvalResult(
|
|
scores=[1.0, 1.0, 1.0, 1.0, 1.0],
|
|
feedbacks=["perfect"] * 5,
|
|
trajectories=[
|
|
Trajectory(f"input{i}", f"output{i}", 1.0, "perfect", "prompt")
|
|
for i in range(5)
|
|
],
|
|
)
|
|
evaluator.evaluate = AsyncMock(return_value=perfect_eval)
|
|
|
|
loop = EvolutionLoop(
|
|
evaluator=evaluator,
|
|
proposer=mock_proposer_port,
|
|
bootstrap=bootstrap,
|
|
max_iterations=3,
|
|
minibatch_size=5,
|
|
)
|
|
with patch.object(loop, "_log"):
|
|
await loop.run(seed_prompt, synthetic_pool, task_description)
|
|
|
|
mock_proposer_port.propose.assert_not_called()
|