feat: async/parallel execution with configurable concurrency

Parallelize LLM calls across minibatches to reduce wall-clock time. All domain ports (LLMPort, JudgePort, ProposerPort) are now async. Adapter implementations wrap synchronous DSPy calls with asyncio.to_thread. Judge calls within a batch run concurrently via asyncio.gather, bounded by an asyncio.Semaphore. The evaluator parallelizes minibatch execution with configurable concurrency. The evolution loop and use case are fully async; the proposer stays sequential. Added the --max-concurrency CLI flag and the max_concurrency YAML config field. Added async_retry_with_backoff for async error handling. All 139 unit tests pass. Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-29 13:15:34 +00:00
parent e2d111ce5b
commit c92ca4a2b8
16 changed files with 297 additions and 159 deletions
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,7 +1,7 @@
 """Shared test fixtures."""
 from __future__ import annotations

-from unittest.mock import MagicMock
+from unittest.mock import AsyncMock, MagicMock

 import pytest

@@ -66,17 +66,17 @@ def mock_eval_result() -> EvalResult:


@pytest.fixture
-def mock_llm_port() -> MagicMock:
+def mock_llm_port() -> AsyncMock:
    """Mock LLMPort that returns canned responses."""
-    port = MagicMock()
+    port = AsyncMock()
    port.execute.return_value = "This is a mock response."
    return port


@pytest.fixture
-def mock_judge_port() -> MagicMock:
+def mock_judge_port() -> AsyncMock:
    """Mock JudgePort that returns moderate scores."""
-    port = MagicMock()
+    port = AsyncMock()
    port.judge_batch.return_value = [
        (0.5, "Moderate quality, needs improvement."),
    ] * 5
@@ -84,9 +84,9 @@ def mock_judge_port() -> MagicMock:


@pytest.fixture
-def mock_proposer_port() -> MagicMock:
+def mock_proposer_port() -> AsyncMock:
    """Mock ProposerPort that returns a slightly modified prompt."""
-    port = MagicMock()
+    port = AsyncMock()
    port.propose.return_value = Prompt(
        text="You are a very helpful assistant. Answer the question precisely."
    )
--- a/tests/unit/test_adapter_config.py
+++ b/tests/unit/test_adapter_config.py
@@ -57,23 +57,25 @@ def synth_lm() -> dspy.LM:
 class TestDSPyLLMAdapterOwnLM:
    """Bug #2 fix: DSPyLLMAdapter must use the LM it receives, not the global one."""

-    def test_uses_provided_lm_not_global(self) -> None:
+    @pytest.mark.asyncio
+    async def test_uses_provided_lm_not_global(self) -> None:
        local_lm = dspy.utils.DummyLM([{"output": "local response"}])
        global_lm = dspy.utils.DummyLM([{"output": "global response"}])
        dspy.configure(lm=global_lm)

        adapter = DSPyLLMAdapter(lm=local_lm)
-        result = adapter.execute(Prompt(text="test"), "input")
+        result = await adapter.execute(Prompt(text="test"), "input")

        assert result == "local response"

-    def test_does_not_affect_global_lm(self) -> None:
+    @pytest.mark.asyncio
+    async def test_does_not_affect_global_lm(self) -> None:
        local_lm = dspy.utils.DummyLM([{"output": "local response"}])
        global_lm = dspy.utils.DummyLM([{"output": "global response"}])
        dspy.configure(lm=global_lm)

        adapter = DSPyLLMAdapter(lm=local_lm)
-        adapter.execute(Prompt(text="test"), "input")
+        await adapter.execute(Prompt(text="test"), "input")

        # Global LM should still be the same
        assert dspy.settings.lm is global_lm
@@ -82,9 +84,10 @@ class TestDSPyLLMAdapterOwnLM:
 class TestDSPyJudgeAdapterOwnLM:
    """DSPyJudgeAdapter must use its own LM instance."""

-    def test_uses_provided_lm(self, judge_lm: dspy.LM) -> None:
+    @pytest.mark.asyncio
+    async def test_uses_provided_lm(self, judge_lm: dspy.LM) -> None:
        adapter = DSPyJudgeAdapter(lm=judge_lm)
-        results = adapter.judge_batch(
+        results = await adapter.judge_batch(
            task_description="Test task",
            pairs=[("input 1", "output 1")],
        )
@@ -93,7 +96,8 @@ class TestDSPyJudgeAdapterOwnLM:
        assert score == 0.8
        assert feedback == "Good response."

-    def test_does_not_use_global_lm(self) -> None:
+    @pytest.mark.asyncio
+    async def test_does_not_use_global_lm(self) -> None:
        judge_lm = dspy.utils.DummyLM(
            [{"reasoning": "ok", "score": "0.9", "feedback": "Judge-specific response"}]
        )
@@ -101,14 +105,15 @@ class TestDSPyJudgeAdapterOwnLM:
        dspy.configure(lm=global_lm)

        adapter = DSPyJudgeAdapter(lm=judge_lm)
-        results = adapter.judge_batch("task", [("in", "out")])
+        results = await adapter.judge_batch("task", [("in", "out")])
        assert results[0][0] == 0.9


 class TestDSPyProposerAdapterOwnLM:
    """DSPyProposerAdapter must use its own LM instance."""

-    def test_uses_provided_lm(self, proposer_lm: dspy.LM) -> None:
+    @pytest.mark.asyncio
+    async def test_uses_provided_lm(self, proposer_lm: dspy.LM) -> None:
        adapter = DSPyProposerAdapter(lm=proposer_lm)
        trajectories = [
            Trajectory(
@@ -119,14 +124,15 @@ class TestDSPyProposerAdapterOwnLM:
                prompt_used="old prompt",
            )
        ]
-        result = adapter.propose(
+        result = await adapter.propose(
            current_prompt=Prompt(text="old prompt"),
            trajectories=trajectories,
            task_description="Test task",
        )
        assert "Improved prompt" in result.text

-    def test_does_not_use_global_lm(self) -> None:
+    @pytest.mark.asyncio
+    async def test_does_not_use_global_lm(self) -> None:
        proposer_lm = dspy.utils.DummyLM(
            [{"reasoning": "ok", "new_instruction": "proposer-specific"}]
        )
@@ -136,7 +142,7 @@ class TestDSPyProposerAdapterOwnLM:
        dspy.configure(lm=global_lm)

        adapter = DSPyProposerAdapter(lm=proposer_lm)
-        result = adapter.propose(
+        result = await adapter.propose(
            current_prompt=Prompt(text="test"),
            trajectories=[],
            task_description="task",
--- a/tests/unit/test_error_handling.py
+++ b/tests/unit/test_error_handling.py
@@ -1,7 +1,7 @@
 """Unit tests for error handling: retry, circuit breaker, per-call isolation."""
 from __future__ import annotations

-from unittest.mock import MagicMock, patch
+from unittest.mock import AsyncMock, MagicMock, patch

 import pytest

@@ -96,7 +96,8 @@ def _make_eval_result(scores, feedbacks=None):


 class TestCircuitBreaker:
-    def test_trips_on_consecutive_failures(self):
+    @pytest.mark.asyncio
+    async def test_trips_on_consecutive_failures(self):
        """Loop stops when consecutive failures reach the threshold."""
        initial_eval = _make_eval_result([0.3, 0.4])
        evaluator = MagicMock()
@@ -109,8 +110,9 @@ class TestCircuitBreaker:
                return initial_eval  # seed eval succeeds
            raise RuntimeError("LLM down")

-        evaluator.evaluate.side_effect = _evaluate
+        evaluator.evaluate = AsyncMock(side_effect=_evaluate)
        proposer = MagicMock()
+        proposer.propose = AsyncMock()
        bootstrap = MagicMock(spec=SyntheticBootstrap)

        loop = EvolutionLoop(
@@ -123,7 +125,7 @@ class TestCircuitBreaker:
            error_strategy="skip",
        )
        with patch.object(loop, "_log"):
-            state = loop.run(
+            state = await loop.run(
                Prompt("test"),
                [SyntheticExample("in", id=0), SyntheticExample("in2", id=1)],
                "task",
@@ -135,7 +137,8 @@ class TestCircuitBreaker:
        assert len(cb_events) == 1
        assert state.iteration < 10  # stopped early

-    def test_abort_raises_on_first_error(self):
+    @pytest.mark.asyncio
+    async def test_abort_raises_on_first_error(self):
        """With error_strategy=abort, the first error raises immediately."""
        initial_eval = _make_eval_result([0.3, 0.4])
        evaluator = MagicMock()
@@ -148,8 +151,9 @@ class TestCircuitBreaker:
                return initial_eval
            raise RuntimeError("LLM down")

-        evaluator.evaluate.side_effect = _evaluate
+        evaluator.evaluate = AsyncMock(side_effect=_evaluate)
        proposer = MagicMock()
+        proposer.propose = AsyncMock()
        bootstrap = MagicMock(spec=SyntheticBootstrap)

        loop = EvolutionLoop(
@@ -163,13 +167,14 @@ class TestCircuitBreaker:
        )
        with patch.object(loop, "_log"):
            with pytest.raises(RuntimeError, match="LLM down"):
-                loop.run(
+                await loop.run(
                    Prompt("test"),
                    [SyntheticExample("in", id=0), SyntheticExample("in2", id=1)],
                    "task",
                )

-    def test_resets_on_success(self):
+    @pytest.mark.asyncio
+    async def test_resets_on_success(self):
        """Consecutive failure counter resets after a successful iteration."""
        initial_eval = _make_eval_result([0.3, 0.4])
        good_eval = _make_eval_result([0.8, 0.9])
@@ -194,9 +199,9 @@ class TestCircuitBreaker:
                return initial_eval  # current eval
            return good_eval  # new eval

-        evaluator.evaluate.side_effect = _evaluate
+        evaluator.evaluate = AsyncMock(side_effect=_evaluate)
        proposer = MagicMock()
-        proposer.propose.return_value = Prompt("better prompt")
+        proposer.propose = AsyncMock(return_value=Prompt("better prompt"))
        bootstrap = MagicMock(spec=SyntheticBootstrap)
        bootstrap.sample_minibatch.return_value = [
            SyntheticExample(f"in{i}", id=i) for i in range(2)
@@ -212,7 +217,7 @@ class TestCircuitBreaker:
            error_strategy="skip",
        )
        with patch.object(loop, "_log"):
-            state = loop.run(
+            state = await loop.run(
                Prompt("test"),
                [SyntheticExample("in", id=0), SyntheticExample("in2", id=1)],
                "task",
@@ -230,23 +235,24 @@ class TestCircuitBreaker:


 class TestPerCallIsolation:
-    def test_evaluator_isolates_execution_failure(self):
+    @pytest.mark.asyncio
+    async def test_evaluator_isolates_execution_failure(self):
        """A failing execution produces a sentinel output, not a crash."""
        executor = MagicMock()
-        executor.execute.side_effect = [
+        executor.execute = AsyncMock(side_effect=[
            "good output",
            RuntimeError("API error"),
            "another good output",
-        ]
+        ])
        judge = MagicMock()
-        judge.judge_batch.return_value = [
+        judge.judge_batch = AsyncMock(return_value=[
            (0.8, "good"),
            (0.0, "[judge error]"),
            (0.7, "ok"),
-        ]
+        ])

        evaluator = PromptEvaluator(executor, judge)
-        result = evaluator.evaluate(
+        result = await evaluator.evaluate(
            Prompt("test"),
            [
                SyntheticExample("in0", id=0),
@@ -261,7 +267,8 @@ class TestPerCallIsolation:
        assert "execution error" in result.trajectories[1].output_text
        assert result.scores[0] == 0.8  # other items unaffected

-    def test_judge_adapter_isolates_single_failure(self):
+    @pytest.mark.asyncio
+    async def test_judge_adapter_isolates_single_failure(self):
        """DSPyJudgeAdapter returns sentinel for a failed item, not crash."""
        from prometheus.infrastructure.judge_adapter import DSPyJudgeAdapter

@@ -269,6 +276,7 @@ class TestPerCallIsolation:
        adapter._lm = MagicMock()
        adapter._max_retries = 1
        adapter._retry_delay_base = 0
+        adapter._semaphore = __import__("asyncio").Semaphore(5)

        # Mock _judge to fail on first call, succeed on second
        call_count = 0
@@ -289,9 +297,10 @@ class TestPerCallIsolation:

        with patch("prometheus.infrastructure.judge_adapter.dspy.context"):
            with patch(
-                "prometheus.infrastructure.retry.time.sleep"
+                "prometheus.infrastructure.retry.asyncio.sleep",
+                new=AsyncMock(),
            ):
-                results = adapter.judge_batch(
+                results = await adapter.judge_batch(
                    "task", [("input1", "output1"), ("input2", "output2")]
                )

--- a/tests/unit/test_evaluator.py
+++ b/tests/unit/test_evaluator.py
@@ -1,7 +1,7 @@
 """Unit tests for PromptEvaluator.evaluate()."""
 from __future__ import annotations

-from unittest.mock import MagicMock
+from unittest.mock import AsyncMock, MagicMock

 import pytest

@@ -14,22 +14,23 @@ class TestPromptEvaluatorEvaluate:
    """Tests for the evaluate() pipeline: execute → judge → trajectories."""

    @pytest.fixture
-    def executor(self) -> MagicMock:
-        return MagicMock(spec=LLMPort)
+    def executor(self) -> AsyncMock:
+        return AsyncMock(spec=LLMPort)

    @pytest.fixture
-    def judge(self) -> MagicMock:
-        return MagicMock(spec=JudgePort)
+    def judge(self) -> AsyncMock:
+        return AsyncMock(spec=JudgePort)

    @pytest.fixture
-    def evaluator(self, executor: MagicMock, judge: MagicMock) -> PromptEvaluator:
+    def evaluator(self, executor: AsyncMock, judge: AsyncMock) -> PromptEvaluator:
        return PromptEvaluator(executor=executor, judge=judge)

-    def test_happy_path_builds_correct_trajectories(
+    @pytest.mark.asyncio
+    async def test_happy_path_builds_correct_trajectories(
        self,
        evaluator: PromptEvaluator,
-        executor: MagicMock,
-        judge: MagicMock,
+        executor: AsyncMock,
+        judge: AsyncMock,
    ) -> None:
        prompt = Prompt(text="Answer the question.")
        examples = [
@@ -42,7 +43,7 @@ class TestPromptEvaluatorEvaluate:
            (0.8, "Mostly correct."),
        ]

-        result = evaluator.evaluate(prompt, examples, "math and geography")
+        result = await evaluator.evaluate(prompt, examples, "math and geography")

        assert isinstance(result, EvalResult)
        assert result.scores == [0.9, 0.8]
@@ -55,14 +56,15 @@ class TestPromptEvaluatorEvaluate:
        assert result.trajectories[0].prompt_used == "Answer the question."
        assert result.trajectories[1].prompt_used == "Answer the question."

-    def test_empty_minibatch_returns_empty_result(
+    @pytest.mark.asyncio
+    async def test_empty_minibatch_returns_empty_result(
        self,
        evaluator: PromptEvaluator,
-        executor: MagicMock,
-        judge: MagicMock,
+        executor: AsyncMock,
+        judge: AsyncMock,
    ) -> None:
        prompt = Prompt(text="test")
-        result = evaluator.evaluate(prompt, [], "task")
+        result = await evaluator.evaluate(prompt, [], "task")

        assert result.scores == []
        assert result.feedbacks == []
@@ -71,41 +73,44 @@ class TestPromptEvaluatorEvaluate:
        # judge_batch is called with empty pairs list
        judge.judge_batch.assert_called_once_with("task", [])

-    def test_executor_called_with_correct_prompt(
+    @pytest.mark.asyncio
+    async def test_executor_called_with_correct_prompt(
        self,
        evaluator: PromptEvaluator,
-        executor: MagicMock,
-        judge: MagicMock,
+        executor: AsyncMock,
+        judge: AsyncMock,
    ) -> None:
        prompt = Prompt(text="Summarize this.")
        examples = [SyntheticExample(input_text="Long text here", id=0)]
        executor.execute.return_value = "Summary."
        judge.judge_batch.return_value = [(0.7, "Good summary.")]

-        evaluator.evaluate(prompt, examples, "summarization")
+        await evaluator.evaluate(prompt, examples, "summarization")

        executor.execute.assert_called_once_with(prompt, "Long text here")

-    def test_trajectories_prompt_used_matches_input_prompt(
+    @pytest.mark.asyncio
+    async def test_trajectories_prompt_used_matches_input_prompt(
        self,
        evaluator: PromptEvaluator,
-        executor: MagicMock,
-        judge: MagicMock,
+        executor: AsyncMock,
+        judge: AsyncMock,
    ) -> None:
        prompt = Prompt(text="Translate to French.")
        examples = [SyntheticExample(input_text="Hello", id=0)]
        executor.execute.return_value = "Bonjour"
        judge.judge_batch.return_value = [(1.0, "Perfect.")]

-        result = evaluator.evaluate(prompt, examples, "translation")
+        result = await evaluator.evaluate(prompt, examples, "translation")

        assert result.trajectories[0].prompt_used == "Translate to French."

-    def test_scores_feedbacks_trajectories_lists_sized_correctly(
+    @pytest.mark.asyncio
+    async def test_scores_feedbacks_trajectories_lists_sized_correctly(
        self,
        evaluator: PromptEvaluator,
-        executor: MagicMock,
-        judge: MagicMock,
+        executor: AsyncMock,
+        judge: AsyncMock,
    ) -> None:
        prompt = Prompt(text="test prompt")
        examples = [SyntheticExample(input_text=f"q{i}", id=i) for i in range(4)]
@@ -114,7 +119,7 @@ class TestPromptEvaluatorEvaluate:
            (0.1 * i, f"fb{i}") for i in range(4)
        ]

-        result = evaluator.evaluate(prompt, examples, "task")
+        result = await evaluator.evaluate(prompt, examples, "task")

        assert len(result.scores) == 4
        assert len(result.feedbacks) == 4
--- a/tests/unit/test_evolution.py
+++ b/tests/unit/test_evolution.py
@@ -1,7 +1,9 @@
 """Unit tests for the evolution loop — with full mocking."""
 from __future__ import annotations

-from unittest.mock import MagicMock, patch
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest

 from prometheus.application.bootstrap import SyntheticBootstrap
 from prometheus.application.evaluator import PromptEvaluator
@@ -10,14 +12,15 @@ from prometheus.domain.entities import EvalResult, Prompt, SyntheticExample, Tra


 class TestEvolutionLoop:
-    def test_accepts_improvement(
+    @pytest.mark.asyncio
+    async def test_accepts_improvement(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
-        mock_llm_port: MagicMock,
-        mock_judge_port: MagicMock,
-        mock_proposer_port: MagicMock,
+        mock_llm_port: AsyncMock,
+        mock_judge_port: AsyncMock,
+        mock_proposer_port: AsyncMock,
    ) -> None:
        """When the new prompt improves the score, the best candidate is updated."""
        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
@@ -45,7 +48,7 @@ class TestEvolutionLoop:
            feedbacks=["good"] * 5,
            trajectories=[],
        )
-        evaluator.evaluate = MagicMock(side_effect=[initial_eval, old_eval, new_eval])
+        evaluator.evaluate = AsyncMock(side_effect=[initial_eval, old_eval, new_eval])

        loop = EvolutionLoop(
            evaluator=evaluator,
@@ -55,19 +58,20 @@ class TestEvolutionLoop:
            minibatch_size=5,
        )
        with patch.object(loop, "_log"):
-            state = loop.run(seed_prompt, synthetic_pool, task_description)
+            state = await loop.run(seed_prompt, synthetic_pool, task_description)

        assert state.best_candidate is not None
        assert state.best_candidate.best_score > 0

-    def test_rejects_regression(
+    @pytest.mark.asyncio
+    async def test_rejects_regression(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
-        mock_llm_port: MagicMock,
-        mock_judge_port: MagicMock,
-        mock_proposer_port: MagicMock,
+        mock_llm_port: AsyncMock,
+        mock_judge_port: AsyncMock,
+        mock_proposer_port: AsyncMock,
    ) -> None:
        """When the new prompt degrades the score, the best candidate stays unchanged."""
        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
@@ -95,7 +99,7 @@ class TestEvolutionLoop:
            feedbacks=["bad"] * 5,
            trajectories=[],
        )
-        evaluator.evaluate = MagicMock(side_effect=[initial_eval, old_eval, new_eval])
+        evaluator.evaluate = AsyncMock(side_effect=[initial_eval, old_eval, new_eval])

        loop = EvolutionLoop(
            evaluator=evaluator,
@@ -105,19 +109,20 @@ class TestEvolutionLoop:
            minibatch_size=5,
        )
        with patch.object(loop, "_log"):
-            state = loop.run(seed_prompt, synthetic_pool, task_description)
+            state = await loop.run(seed_prompt, synthetic_pool, task_description)

        assert state.best_candidate is not None
        assert state.best_candidate.prompt.text == seed_prompt.text

-    def test_skips_perfect_scores(
+    @pytest.mark.asyncio
+    async def test_skips_perfect_scores(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
-        mock_llm_port: MagicMock,
-        mock_judge_port: MagicMock,
-        mock_proposer_port: MagicMock,
+        mock_llm_port: AsyncMock,
+        mock_judge_port: AsyncMock,
+        mock_proposer_port: AsyncMock,
    ) -> None:
        """When all scores are perfect, no proposition is made."""
        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
@@ -132,7 +137,7 @@ class TestEvolutionLoop:
                for i in range(5)
            ],
        )
-        evaluator.evaluate = MagicMock(return_value=perfect_eval)
+        evaluator.evaluate = AsyncMock(return_value=perfect_eval)

        loop = EvolutionLoop(
            evaluator=evaluator,
@@ -142,6 +147,6 @@ class TestEvolutionLoop:
            minibatch_size=5,
        )
        with patch.object(loop, "_log"):
-            loop.run(seed_prompt, synthetic_pool, task_description)
+            await loop.run(seed_prompt, synthetic_pool, task_description)

        mock_proposer_port.propose.assert_not_called()