Parallelize LLM calls across minibatches to reduce wall-clock time. All domain ports (LLMPort, JudgePort, ProposerPort) are now async. Adapter implementations wrap synchronous DSPy calls with asyncio.to_thread. Judge calls run in parallel within a batch using asyncio.gather + semaphore. Evaluator parallelizes minibatch execution with configurable concurrency. Evolution loop and use case are fully async. Proposer stays sequential. Added --max-concurrency CLI flag and max_concurrency YAML config field. Added async_retry_with_backoff for async error handling. All 139 unit tests pass. Co-Authored-By: Paperclip <noreply@paperclip.ing>
127 lines
4.5 KiB
Python
127 lines
4.5 KiB
Python
"""Unit tests for PromptEvaluator.evaluate()."""
|
|
from __future__ import annotations
|
|
|
|
from unittest.mock import AsyncMock, MagicMock
|
|
|
|
import pytest
|
|
|
|
from prometheus.application.evaluator import PromptEvaluator
|
|
from prometheus.domain.entities import EvalResult, Prompt, SyntheticExample, Trajectory
|
|
from prometheus.domain.ports import JudgePort, LLMPort
|
|
|
|
|
|
class TestPromptEvaluatorEvaluate:
    """Exercise PromptEvaluator.evaluate() with mocked executor and judge ports.

    Every test follows the same flow: execute each example, judge the batch,
    then inspect the scores, feedbacks and trajectories on the result.
    """

    @pytest.fixture
    def executor(self) -> AsyncMock:
        """Provide an async mock constrained to the ``LLMPort`` spec."""
        llm_double = AsyncMock(spec=LLMPort)
        return llm_double

    @pytest.fixture
    def judge(self) -> AsyncMock:
        """Provide an async mock constrained to the ``JudgePort`` spec."""
        judge_double = AsyncMock(spec=JudgePort)
        return judge_double

    @pytest.fixture
    def evaluator(self, executor: AsyncMock, judge: AsyncMock) -> PromptEvaluator:
        """Build the evaluator under test from the two mocked ports."""
        return PromptEvaluator(executor=executor, judge=judge)

    @pytest.mark.asyncio
    async def test_happy_path_builds_correct_trajectories(
        self,
        evaluator: PromptEvaluator,
        executor: AsyncMock,
        judge: AsyncMock,
    ) -> None:
        """Two examples yield two trajectories carrying output, score and feedback."""
        # Arrange: one canned output per example, one (score, feedback) per pair.
        executor.execute.side_effect = ["4", "Paris"]
        judge.judge_batch.return_value = [(0.9, "Correct."), (0.8, "Mostly correct.")]
        system_prompt = Prompt(text="Answer the question.")
        batch = [
            SyntheticExample(input_text="What is 2+2?", id=0),
            SyntheticExample(input_text="Capital of France?", id=1),
        ]

        outcome = await evaluator.evaluate(system_prompt, batch, "math and geography")

        assert isinstance(outcome, EvalResult)
        assert outcome.scores == [0.9, 0.8]
        assert outcome.feedbacks == ["Correct.", "Mostly correct."]
        assert len(outcome.trajectories) == 2

        # Length was asserted above, so unpacking into two names is safe here.
        first, second = outcome.trajectories
        assert first.input_text == "What is 2+2?"
        assert first.output_text == "4"
        assert first.score == 0.9
        assert first.feedback == "Correct."
        assert first.prompt_used == "Answer the question."
        assert second.prompt_used == "Answer the question."

    @pytest.mark.asyncio
    async def test_empty_minibatch_returns_empty_result(
        self,
        evaluator: PromptEvaluator,
        executor: AsyncMock,
        judge: AsyncMock,
    ) -> None:
        """An empty minibatch produces empty result lists and no executor calls."""
        outcome = await evaluator.evaluate(Prompt(text="test"), [], "task")

        assert outcome.scores == []
        assert outcome.feedbacks == []
        assert outcome.trajectories == []
        executor.execute.assert_not_called()
        # The judge is still invoked once, with the task and an empty pairs list.
        judge.judge_batch.assert_called_once_with("task", [])

    @pytest.mark.asyncio
    async def test_executor_called_with_correct_prompt(
        self,
        evaluator: PromptEvaluator,
        executor: AsyncMock,
        judge: AsyncMock,
    ) -> None:
        """The executor receives the prompt object and the example's input text."""
        executor.execute.return_value = "Summary."
        judge.judge_batch.return_value = [(0.7, "Good summary.")]
        summarize_prompt = Prompt(text="Summarize this.")
        batch = [SyntheticExample(input_text="Long text here", id=0)]

        await evaluator.evaluate(summarize_prompt, batch, "summarization")

        executor.execute.assert_called_once_with(summarize_prompt, "Long text here")

    @pytest.mark.asyncio
    async def test_trajectories_prompt_used_matches_input_prompt(
        self,
        evaluator: PromptEvaluator,
        executor: AsyncMock,
        judge: AsyncMock,
    ) -> None:
        """The trajectory records the text of the prompt that was evaluated."""
        executor.execute.return_value = "Bonjour"
        judge.judge_batch.return_value = [(1.0, "Perfect.")]
        translate_prompt = Prompt(text="Translate to French.")
        batch = [SyntheticExample(input_text="Hello", id=0)]

        outcome = await evaluator.evaluate(translate_prompt, batch, "translation")

        recorded = outcome.trajectories[0]
        assert recorded.prompt_used == "Translate to French."

    @pytest.mark.asyncio
    async def test_scores_feedbacks_trajectories_lists_sized_correctly(
        self,
        evaluator: PromptEvaluator,
        executor: AsyncMock,
        judge: AsyncMock,
    ) -> None:
        """Scores, feedbacks and trajectories each hold one entry per example."""
        batch_size = 4
        executor.execute.side_effect = [f"a{i}" for i in range(batch_size)]
        judge.judge_batch.return_value = [
            (0.1 * i, f"fb{i}") for i in range(batch_size)
        ]
        batch = [
            SyntheticExample(input_text=f"q{i}", id=i) for i in range(batch_size)
        ]

        outcome = await evaluator.evaluate(Prompt(text="test prompt"), batch, "task")

        # Same check for all three parallel lists, in the original order.
        for collected in (outcome.scores, outcome.feedbacks, outcome.trajectories):
            assert len(collected) == 4
|