feat: async/parallel execution with configurable concurrency

Parallelize LLM calls across minibatches to reduce wall-clock time. All domain ports (LLMPort, JudgePort, ProposerPort) are now async. Adapter implementations wrap synchronous DSPy calls with asyncio.to_thread. Judge calls run in parallel within a batch using asyncio.gather and a semaphore. The evaluator parallelizes minibatch execution with configurable concurrency. The evolution loop and use case are fully async. The proposer stays sequential. Add a --max-concurrency CLI flag and a max_concurrency YAML config field. Add async_retry_with_backoff for async error handling. All 139 unit tests pass.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-29 13:15:34 +00:00
parent e2d111ce5b
commit c92ca4a2b8
16 changed files with 297 additions and 159 deletions
--- a/tests/unit/test_evaluator.py
+++ b/tests/unit/test_evaluator.py
@@ -1,7 +1,7 @@
 """Unit tests for PromptEvaluator.evaluate()."""
 from __future__ import annotations

-from unittest.mock import MagicMock
+from unittest.mock import AsyncMock, MagicMock

 import pytest

@@ -14,22 +14,23 @@ class TestPromptEvaluatorEvaluate:
    """Tests for the evaluate() pipeline: execute → judge → trajectories."""

    @pytest.fixture
-    def executor(self) -> MagicMock:
-        return MagicMock(spec=LLMPort)
+    def executor(self) -> AsyncMock:
+        return AsyncMock(spec=LLMPort)

    @pytest.fixture
-    def judge(self) -> MagicMock:
-        return MagicMock(spec=JudgePort)
+    def judge(self) -> AsyncMock:
+        return AsyncMock(spec=JudgePort)

    @pytest.fixture
-    def evaluator(self, executor: MagicMock, judge: MagicMock) -> PromptEvaluator:
+    def evaluator(self, executor: AsyncMock, judge: AsyncMock) -> PromptEvaluator:
        return PromptEvaluator(executor=executor, judge=judge)

-    def test_happy_path_builds_correct_trajectories(
+    @pytest.mark.asyncio
+    async def test_happy_path_builds_correct_trajectories(
        self,
        evaluator: PromptEvaluator,
-        executor: MagicMock,
-        judge: MagicMock,
+        executor: AsyncMock,
+        judge: AsyncMock,
    ) -> None:
        prompt = Prompt(text="Answer the question.")
        examples = [
@@ -42,7 +43,7 @@ class TestPromptEvaluatorEvaluate:
            (0.8, "Mostly correct."),
        ]

-        result = evaluator.evaluate(prompt, examples, "math and geography")
+        result = await evaluator.evaluate(prompt, examples, "math and geography")

        assert isinstance(result, EvalResult)
        assert result.scores == [0.9, 0.8]
@@ -55,14 +56,15 @@ class TestPromptEvaluatorEvaluate:
        assert result.trajectories[0].prompt_used == "Answer the question."
        assert result.trajectories[1].prompt_used == "Answer the question."

-    def test_empty_minibatch_returns_empty_result(
+    @pytest.mark.asyncio
+    async def test_empty_minibatch_returns_empty_result(
        self,
        evaluator: PromptEvaluator,
-        executor: MagicMock,
-        judge: MagicMock,
+        executor: AsyncMock,
+        judge: AsyncMock,
    ) -> None:
        prompt = Prompt(text="test")
-        result = evaluator.evaluate(prompt, [], "task")
+        result = await evaluator.evaluate(prompt, [], "task")

        assert result.scores == []
        assert result.feedbacks == []
@@ -71,41 +73,44 @@ class TestPromptEvaluatorEvaluate:
        # judge_batch is called with empty pairs list
        judge.judge_batch.assert_called_once_with("task", [])

-    def test_executor_called_with_correct_prompt(
+    @pytest.mark.asyncio
+    async def test_executor_called_with_correct_prompt(
        self,
        evaluator: PromptEvaluator,
-        executor: MagicMock,
-        judge: MagicMock,
+        executor: AsyncMock,
+        judge: AsyncMock,
    ) -> None:
        prompt = Prompt(text="Summarize this.")
        examples = [SyntheticExample(input_text="Long text here", id=0)]
        executor.execute.return_value = "Summary."
        judge.judge_batch.return_value = [(0.7, "Good summary.")]

-        evaluator.evaluate(prompt, examples, "summarization")
+        await evaluator.evaluate(prompt, examples, "summarization")

        executor.execute.assert_called_once_with(prompt, "Long text here")

-    def test_trajectories_prompt_used_matches_input_prompt(
+    @pytest.mark.asyncio
+    async def test_trajectories_prompt_used_matches_input_prompt(
        self,
        evaluator: PromptEvaluator,
-        executor: MagicMock,
-        judge: MagicMock,
+        executor: AsyncMock,
+        judge: AsyncMock,
    ) -> None:
        prompt = Prompt(text="Translate to French.")
        examples = [SyntheticExample(input_text="Hello", id=0)]
        executor.execute.return_value = "Bonjour"
        judge.judge_batch.return_value = [(1.0, "Perfect.")]

-        result = evaluator.evaluate(prompt, examples, "translation")
+        result = await evaluator.evaluate(prompt, examples, "translation")

        assert result.trajectories[0].prompt_used == "Translate to French."

-    def test_scores_feedbacks_trajectories_lists_sized_correctly(
+    @pytest.mark.asyncio
+    async def test_scores_feedbacks_trajectories_lists_sized_correctly(
        self,
        evaluator: PromptEvaluator,
-        executor: MagicMock,
-        judge: MagicMock,
+        executor: AsyncMock,
+        judge: AsyncMock,
    ) -> None:
        prompt = Prompt(text="test prompt")
        examples = [SyntheticExample(input_text=f"q{i}", id=i) for i in range(4)]
@@ -114,7 +119,7 @@ class TestPromptEvaluatorEvaluate:
            (0.1 * i, f"fb{i}") for i in range(4)
        ]

-        result = evaluator.evaluate(prompt, examples, "task")
+        result = await evaluator.evaluate(prompt, examples, "task")

        assert len(result.scores) == 4
        assert len(result.feedbacks) == 4