feat: async/parallel execution with configurable concurrency

Parallelize LLM calls across minibatches to reduce wall-clock time. All domain ports (LLMPort, JudgePort, ProposerPort) are now async. Adapter implementations wrap synchronous DSPy calls with asyncio.to_thread. Judge calls run in parallel within a batch using asyncio.gather + semaphore. Evaluator parallelizes minibatch execution with configurable concurrency. Evolution loop and use case are fully async. Proposer stays sequential. Added --max-concurrency CLI flag and max_concurrency YAML config field. Added async_retry_with_backoff for async error handling. All 139 unit tests pass. Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-29 13:15:34 +00:00
parent e2d111ce5b
commit c92ca4a2b8
16 changed files with 297 additions and 159 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,6 +16,7 @@ dependencies = [
 [project.optional-dependencies]
 dev = [
    "pytest>=8.3",
    "pytest-asyncio>=0.24",
    "pytest-cov>=6.0",
    "ruff>=0.9",
    "mypy>=1.14",
@@ -37,6 +38,9 @@ target-version = "py312"
 python_version = "3.12"
 strict = true
 [tool.pytest.ini_options]
 asyncio_mode = "auto"
 [[tool.mypy.overrides]]
 module = ["dspy", "dspy.*"]
 ignore_missing_imports = true
--- a/src/prometheus/application/dto.py
+++ b/src/prometheus/application/dto.py
@@ -38,6 +38,9 @@ class OptimizationConfig:
    # --- Reproducibility ---
    seed: int = 42
    # --- Concurrency ---
    max_concurrency: int = 5
    # --- Error handling ---
    max_retries: int = 3
    retry_delay_base: float = 1.0
--- a/src/prometheus/application/evaluator.py
+++ b/src/prometheus/application/evaluator.py
@@ -6,6 +6,7 @@ Combines candidate prompt execution + LLM-as-Judge evaluation.
 """
 from __future__ import annotations
 import asyncio
 import logging
 from prometheus.domain.entities import (
@@ -26,15 +27,22 @@ class PromptEvaluator:
    Replaces GEPA's EvaluatorFn. Instead of comparing to ground truth,
    uses an LLM-as-Judge.
    Execution and judge calls run in parallel (bounded by *max_concurrency*).
    Per-call isolation: a failure on one minibatch item produces a
    zero-score trajectory instead of crashing the whole batch.
    """
-    def __init__(self, executor: LLMPort, judge: JudgePort):
+    def __init__(
        self,
        executor: LLMPort,
        judge: JudgePort,
        max_concurrency: int = 5,
    ):
        self._executor = executor
        self._judge = judge
        self._semaphore = asyncio.Semaphore(max_concurrency)
-    def evaluate(
+    async def evaluate(
        self,
        prompt: Prompt,
        minibatch: list[SyntheticExample],
@@ -43,27 +51,20 @@ class PromptEvaluator:
        """Evaluate the prompt on the minibatch.
        Steps:
-        1. Execute the prompt on each input in the minibatch
+        1. Execute the prompt on each input in the minibatch (parallel)
        2. Judge each (input, output) pair
        3. Build trajectories with feedback
        """
-        # Step 1: Execution (per-item isolation)
+        # Step 1: Parallel execution (per-item isolation)
-        outputs: list[str] = []
+        output_coros = [
-        for example in minibatch:
+            self._execute_single(prompt, example)
-            try:
+            for example in minibatch
-                raw_output = self._executor.execute(prompt, example.input_text)
+        ]
-                outputs.append(raw_output)
+        outputs = await asyncio.gather(*output_coros)
            except Exception as exc:
                logger.warning(
                    "Execution failed for input '%s…': %s",
                    example.input_text[:40],
                    exc,
                )
                outputs.append(f"[execution error: {exc}]")
-        # Step 2: Judgement (judge_adapter handles its own per-call isolation)
+        # Step 2: Judgement (judge_adapter handles its own per-call isolation + parallelism)
        pairs = [(ex.input_text, out) for ex, out in zip(minibatch, outputs)]
-        judge_results = self._judge.judge_batch(task_description, pairs)
+        judge_results = await self._judge.judge_batch(task_description, pairs)
        # Step 3: Build trajectories
        scores: list[float] = []
@@ -88,3 +89,17 @@ class PromptEvaluator:
            feedbacks=feedbacks,
            trajectories=trajectories,
        )
    async def _execute_single(
        self, prompt: Prompt, example: SyntheticExample
    ) -> str:
        async with self._semaphore:
            try:
                return await self._executor.execute(prompt, example.input_text)
            except Exception as exc:
                logger.warning(
                    "Execution failed for input '%s…': %s",
                    example.input_text[:40],
                    exc,
                )
                return f"[execution error: {exc}]"
--- a/src/prometheus/application/evolution.py
+++ b/src/prometheus/application/evolution.py
@@ -62,7 +62,7 @@ class EvolutionLoop:
        self._circuit_breaker_threshold = circuit_breaker_threshold
        self._error_strategy = error_strategy
-    def run(
+    async def run(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
@@ -76,7 +76,7 @@ class EvolutionLoop:
        initial_batch = self._bootstrap.sample_minibatch(
            synthetic_pool, self._minibatch_size
        )
-        initial_eval = self._evaluator.evaluate(
+        initial_eval = await self._evaluator.evaluate(
            seed_prompt, initial_batch, task_description
        )
        state.total_llm_calls += 2 * self._minibatch_size  # N executions + N judge calls
@@ -95,7 +95,7 @@ class EvolutionLoop:
            state.iteration = i
            try:
-                self._run_iteration(
+                await self._run_iteration(
                    i, state, best_candidate, synthetic_pool, task_description
                )
                # Update best_candidate from state after successful iteration
@@ -144,7 +144,7 @@ class EvolutionLoop:
        return state
-    def _run_iteration(
+    async def _run_iteration(
        self,
        i: int,
        state: OptimizationState,
@@ -159,7 +159,7 @@ class EvolutionLoop:
        )
        # 2. Evaluate the current candidate
-        current_eval = self._evaluator.evaluate(
+        current_eval = await self._evaluator.evaluate(
            best_candidate.prompt, batch, task_description
        )
        state.total_llm_calls += 2 * self._minibatch_size
@@ -176,8 +176,8 @@ class EvolutionLoop:
            )
            return
-        # 4. Propose a new prompt (reflective mutation)
+        # 4. Propose a new prompt (reflective mutation) — sequential
-        new_prompt = self._proposer.propose(
+        new_prompt = await self._proposer.propose(
            best_candidate.prompt,
            current_eval.trajectories,
            task_description,
@@ -185,7 +185,7 @@ class EvolutionLoop:
        state.total_llm_calls += 1  # 1 proposition call
        # 5. Evaluate the new prompt on the same minibatch
-        new_eval = self._evaluator.evaluate(
+        new_eval = await self._evaluator.evaluate(
            new_prompt, batch, task_description
        )
        state.total_llm_calls += 2 * self._minibatch_size
--- a/src/prometheus/application/use_cases.py
+++ b/src/prometheus/application/use_cases.py
@@ -30,7 +30,7 @@ class OptimizePromptUseCase:
        self._proposer = proposer
        self._bootstrap = bootstrap
-    def execute(self, config: OptimizationConfig) -> OptimizationResult:
+    async def execute(self, config: OptimizationConfig) -> OptimizationResult:
        """Full pipeline:
        1. Bootstrap → generate synthetic inputs
        2. Evolution → optimization loop
@@ -55,7 +55,7 @@ class OptimizePromptUseCase:
            error_strategy=config.error_strategy,
        )
        seed_prompt = Prompt(text=config.seed_prompt)
-        state = loop.run(seed_prompt, synthetic_pool, config.task_description)
+        state = await loop.run(seed_prompt, synthetic_pool, config.task_description)
        # Phase 2: Result
        initial_score = (
--- a/src/prometheus/cli/app.py
+++ b/src/prometheus/cli/app.py
@@ -5,6 +5,7 @@ Typer interface with -i (input) and -o (output) options.
 """
 from __future__ import annotations
 import asyncio
 import logging
 import os
 from dataclasses import asdict
@@ -66,12 +67,30 @@ def optimize(
        "--error-strategy",
        help="How to handle errors: skip | retry | abort.",
    ),
    max_concurrency: int = typer.Option(
        5,
        "--max-concurrency",
        help="Max parallel LLM calls for minibatch execution and judging.",
    ),
 ) -> None:
    """Optimize a prompt without any reference data.
    Usage:
        prometheus optimize -i config.yaml -o result.yaml
    """
    asyncio.run(
        _async_optimize(input, output, verbose, max_retries, error_strategy, max_concurrency)
    )
 async def _async_optimize(
    input: str,
    output: str,
    verbose: bool,
    max_retries: int,
    error_strategy: str,
    max_concurrency: int,
 ) -> None:
    # Configure verbose logging
    if verbose:
        logging.basicConfig(level=logging.INFO, format="[PROMETHEUS] %(message)s")
@@ -129,6 +148,7 @@ def optimize(
        retry_delay_base=raw_config.get("retry_delay_base", 1.0),
        circuit_breaker_threshold=raw_config.get("circuit_breaker_threshold", 5),
        error_strategy=raw_config.get("error_strategy", error_strategy),
        max_concurrency=raw_config.get("max_concurrency", max_concurrency),
        output_path=output,
        verbose=verbose,
    )
@@ -164,6 +184,7 @@ def optimize(
        lm=judge_lm,
        max_retries=config.max_retries,
        retry_delay_base=config.retry_delay_base,
        max_concurrency=config.max_concurrency,
    )
    proposer_adapter = DSPyProposerAdapter(
        lm=proposer_lm,
@@ -171,7 +192,11 @@ def optimize(
        retry_delay_base=config.retry_delay_base,
    )
    bootstrap = SyntheticBootstrap(generator=synth_adapter, seed=config.seed)
-    evaluator = PromptEvaluator(executor=llm_adapter, judge=judge_adapter)
+    evaluator = PromptEvaluator(
        executor=llm_adapter,
        judge=judge_adapter,
        max_concurrency=config.max_concurrency,
    )
    use_case = OptimizePromptUseCase(
        evaluator=evaluator,
        proposer=proposer_adapter,
@@ -180,7 +205,7 @@ def optimize(
    # 4. Execute
    with console.status("[bold green]Evolving prompt..."):
-        result = use_case.execute(config)
+        result = await use_case.execute(config)
    # 5. Display results
    _display_result(result)
--- a/src/prometheus/domain/ports.py
+++ b/src/prometheus/domain/ports.py
@@ -18,7 +18,7 @@ class LLMPort(ABC):
    """
    @abstractmethod
-    def execute(self, prompt: Prompt, input_text: str) -> str:
+    async def execute(self, prompt: Prompt, input_text: str) -> str:
        """Execute the prompt on the input, return the raw response."""
        ...
@@ -31,7 +31,7 @@ class JudgePort(ABC):
    """
    @abstractmethod
-    def judge_batch(
+    async def judge_batch(
        self,
        task_description: str,
        pairs: list[tuple[str, str]],
@@ -50,7 +50,7 @@ class ProposerPort(ABC):
    """
    @abstractmethod
-    def propose(
+    async def propose(
        self,
        current_prompt: Prompt,
        trajectories: list[Trajectory],
--- a/src/prometheus/infrastructure/judge_adapter.py
+++ b/src/prometheus/infrastructure/judge_adapter.py
@@ -5,13 +5,15 @@ Implements the JudgePort via the DSPy OutputJudge module.
 """
 from __future__ import annotations
 import asyncio
 import logging
 from typing import Self
 import dspy
 from prometheus.domain.ports import JudgePort
 from prometheus.infrastructure.dspy_modules import OutputJudge
-from prometheus.infrastructure.retry import retry_with_backoff
+from prometheus.infrastructure.retry import async_retry_with_backoff
 logger = logging.getLogger(__name__)
@@ -21,6 +23,8 @@ class DSPyJudgeAdapter(JudgePort):
    Per-call isolation: a failure on one item returns a zero-score sentinel
    instead of crashing the whole batch.
    Judge calls run in parallel (bounded by *max_concurrency*).
    """
    def __init__(
@@ -28,40 +32,60 @@ class DSPyJudgeAdapter(JudgePort):
        lm: dspy.LM,
        max_retries: int = 3,
        retry_delay_base: float = 1.0,
        max_concurrency: int = 5,
    ) -> None:
        self._lm = lm
        self._judge = OutputJudge()
        self._max_retries = max_retries
        self._retry_delay_base = retry_delay_base
        self._semaphore = asyncio.Semaphore(max_concurrency)
-    def judge_batch(
+    async def judge_batch(
        self,
        task_description: str,
        pairs: list[tuple[str, str]],
    ) -> list[tuple[float, str]]:
-        results: list[tuple[float, str]] = []
+        tasks = [
-        with dspy.context(lm=self._lm):
+            self._judge_single_safe(task_description, input_text, output_text)
-            for input_text, output_text in pairs:
+            for input_text, output_text in pairs
-                results.append(self._judge_single(task_description, input_text, output_text))
+        ]
-        return results
+        return list(await asyncio.gather(*tasks))
-    def _judge_single(
+    async def _judge_single_safe(
        self,
        task_description: str,
        input_text: str,
        output_text: str,
    ) -> tuple[float, str]:
        async with self._semaphore:
            try:
-            pred = retry_with_backoff(
+                return await self._judge_single(task_description, input_text, output_text)
                lambda: self._judge(
                    task_description=task_description,
                    input_text=input_text,
                    output_text=output_text,
                ),
                max_retries=self._max_retries,
                retry_delay_base=self._retry_delay_base,
            )
            return (pred.score, pred.feedback)
            except Exception as exc:
                logger.warning("Judge call failed for input '%s…': %s", input_text[:40], exc)
                return (0.0, f"[judge error: {exc}]")
    async def _judge_single(
        self,
        task_description: str,
        input_text: str,
        output_text: str,
    ) -> tuple[float, str]:
        async def _call() -> tuple[float, str]:
            pred = await asyncio.to_thread(
                self._sync_judge, task_description, input_text, output_text,
            )
            return (pred.score, pred.feedback)
        return await async_retry_with_backoff(
            _call,
            max_retries=self._max_retries,
            retry_delay_base=self._retry_delay_base,
        )
    def _sync_judge(self, task_description: str, input_text: str, output_text: str):
        with dspy.context(lm=self._lm):
            return self._judge(
                task_description=task_description,
                input_text=input_text,
                output_text=output_text,
            )
--- a/src/prometheus/infrastructure/llm_adapter.py
+++ b/src/prometheus/infrastructure/llm_adapter.py
@@ -5,11 +5,13 @@ Implements the LLMPort via DSPy.
 """
 from __future__ import annotations
 import asyncio
 import dspy
 from prometheus.domain.entities import Prompt
 from prometheus.domain.ports import LLMPort
-from prometheus.infrastructure.retry import retry_with_backoff
+from prometheus.infrastructure.retry import async_retry_with_backoff
 class DSPyLLMAdapter(LLMPort):
@@ -33,17 +35,21 @@ class DSPyLLMAdapter(LLMPort):
        self._max_retries = max_retries
        self._retry_delay_base = retry_delay_base
-    def execute(self, prompt: Prompt, input_text: str) -> str:
+    async def execute(self, prompt: Prompt, input_text: str) -> str:
-        def _call() -> str:
+        async def _call() -> str:
            # DSPy is synchronous — run in a thread to avoid blocking the event loop.
            return await asyncio.to_thread(self._sync_execute, prompt, input_text)
        return await async_retry_with_backoff(
            _call,
            max_retries=self._max_retries,
            retry_delay_base=self._retry_delay_base,
        )
    def _sync_execute(self, prompt: Prompt, input_text: str) -> str:
        with dspy.context(lm=self._lm):
            result = self._predictor(
                instruction=prompt.text,
                input_text=input_text,
            )
        return str(result.output)
        return retry_with_backoff(
            _call,
            max_retries=self._max_retries,
            retry_delay_base=self._retry_delay_base,
        )
--- a/src/prometheus/infrastructure/proposer_adapter.py
+++ b/src/prometheus/infrastructure/proposer_adapter.py
@@ -6,12 +6,14 @@ Converts trajectories into readable format for the LLM proposer.
 """
 from __future__ import annotations
 import asyncio
 import dspy
 from prometheus.domain.entities import Prompt, Trajectory
 from prometheus.domain.ports import ProposerPort
 from prometheus.infrastructure.dspy_modules import InstructionProposer
-from prometheus.infrastructure.retry import retry_with_backoff
+from prometheus.infrastructure.retry import async_retry_with_backoff
 class DSPyProposerAdapter(ProposerPort):
@@ -28,7 +30,7 @@ class DSPyProposerAdapter(ProposerPort):
        self._max_retries = max_retries
        self._retry_delay_base = retry_delay_base
-    def propose(
+    async def propose(
        self,
        current_prompt: Prompt,
        trajectories: list[Trajectory],
@@ -36,7 +38,18 @@ class DSPyProposerAdapter(ProposerPort):
    ) -> Prompt:
        failure_examples = self._format_failures(trajectories)
-        def _call() -> Prompt:
+        async def _call() -> Prompt:
            return await asyncio.to_thread(
                self._sync_propose, current_prompt, task_description, failure_examples,
            )
        return await async_retry_with_backoff(
            _call,
            max_retries=self._max_retries,
            retry_delay_base=self._retry_delay_base,
        )
    def _sync_propose(self, current_prompt: Prompt, task_description: str, failure_examples: str) -> Prompt:
        with dspy.context(lm=self._lm):
            pred = self._proposer(
                current_instruction=current_prompt.text,
@@ -45,12 +58,6 @@ class DSPyProposerAdapter(ProposerPort):
            )
        return Prompt(text=pred.new_instruction)
        return retry_with_backoff(
            _call,
            max_retries=self._max_retries,
            retry_delay_base=self._retry_delay_base,
        )
    @staticmethod
    def _format_failures(trajectories: list[Trajectory]) -> str:
        """Convert trajectories into a structured textual report."""
--- a/src/prometheus/infrastructure/retry.py
+++ b/src/prometheus/infrastructure/retry.py
@@ -1,9 +1,10 @@
 """Retry with exponential backoff for transient LLM errors."""
 from __future__ import annotations
 import asyncio
 import logging
 import time
-from typing import Any, Callable, TypeVar
+from typing import Any, Callable, Coroutine, TypeVar
 logger = logging.getLogger(__name__)
@@ -71,3 +72,31 @@ def retry_with_backoff(
            time.sleep(delay)
    # Should not reach here, but satisfy type-checker.
    raise TransientError(str(last_exc)) from last_exc
 async def async_retry_with_backoff(
    fn: Callable[..., Coroutine[Any, Any, T]],
    *args: Any,
    max_retries: int = 3,
    retry_delay_base: float = 1.0,
    **kwargs: Any,
 ) -> T:
    """Async version of retry_with_backoff — uses asyncio.sleep instead of time.sleep."""
    last_exc: Exception | None = None
    for attempt in range(max_retries + 1):
        try:
            return await fn(*args, **kwargs)
        except Exception as exc:
            last_exc = exc
            if not is_transient_error(exc) or attempt == max_retries:
                raise
            delay = retry_delay_base * (2 ** attempt)
            logger.warning(
                "Transient error (attempt %d/%d): %s — retrying in %.1fs",
                attempt + 1,
                max_retries + 1,
                exc,
                delay,
            )
            await asyncio.sleep(delay)
    raise TransientError(str(last_exc)) from last_exc
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,7 +1,7 @@
 """Shared test fixtures."""
 from __future__ import annotations
-from unittest.mock import MagicMock
+from unittest.mock import AsyncMock, MagicMock
 import pytest
@@ -66,17 +66,17 @@ def mock_eval_result() -> EvalResult:
@pytest.fixture
-def mock_llm_port() -> MagicMock:
+def mock_llm_port() -> AsyncMock:
    """Mock LLMPort that returns canned responses."""
-    port = MagicMock()
+    port = AsyncMock()
    port.execute.return_value = "This is a mock response."
    return port
@pytest.fixture
-def mock_judge_port() -> MagicMock:
+def mock_judge_port() -> AsyncMock:
    """Mock JudgePort that returns moderate scores."""
-    port = MagicMock()
+    port = AsyncMock()
    port.judge_batch.return_value = [
        (0.5, "Moderate quality, needs improvement."),
    ] * 5
@@ -84,9 +84,9 @@ def mock_judge_port() -> MagicMock:
@pytest.fixture
-def mock_proposer_port() -> MagicMock:
+def mock_proposer_port() -> AsyncMock:
    """Mock ProposerPort that returns a slightly modified prompt."""
-    port = MagicMock()
+    port = AsyncMock()
    port.propose.return_value = Prompt(
        text="You are a very helpful assistant. Answer the question precisely."
    )
--- a/tests/unit/test_adapter_config.py
+++ b/tests/unit/test_adapter_config.py
@@ -57,23 +57,25 @@ def synth_lm() -> dspy.LM:
 class TestDSPyLLMAdapterOwnLM:
    """Bug #2 fix: DSPyLLMAdapter must use the LM it receives, not the global one."""
-    def test_uses_provided_lm_not_global(self) -> None:
+    @pytest.mark.asyncio
    async def test_uses_provided_lm_not_global(self) -> None:
        local_lm = dspy.utils.DummyLM([{"output": "local response"}])
        global_lm = dspy.utils.DummyLM([{"output": "global response"}])
        dspy.configure(lm=global_lm)
        adapter = DSPyLLMAdapter(lm=local_lm)
-        result = adapter.execute(Prompt(text="test"), "input")
+        result = await adapter.execute(Prompt(text="test"), "input")
        assert result == "local response"
-    def test_does_not_affect_global_lm(self) -> None:
+    @pytest.mark.asyncio
    async def test_does_not_affect_global_lm(self) -> None:
        local_lm = dspy.utils.DummyLM([{"output": "local response"}])
        global_lm = dspy.utils.DummyLM([{"output": "global response"}])
        dspy.configure(lm=global_lm)
        adapter = DSPyLLMAdapter(lm=local_lm)
-        adapter.execute(Prompt(text="test"), "input")
+        await adapter.execute(Prompt(text="test"), "input")
        # Global LM should still be the same
        assert dspy.settings.lm is global_lm
@@ -82,9 +84,10 @@ class TestDSPyLLMAdapterOwnLM:
 class TestDSPyJudgeAdapterOwnLM:
    """DSPyJudgeAdapter must use its own LM instance."""
-    def test_uses_provided_lm(self, judge_lm: dspy.LM) -> None:
+    @pytest.mark.asyncio
    async def test_uses_provided_lm(self, judge_lm: dspy.LM) -> None:
        adapter = DSPyJudgeAdapter(lm=judge_lm)
-        results = adapter.judge_batch(
+        results = await adapter.judge_batch(
            task_description="Test task",
            pairs=[("input 1", "output 1")],
        )
@@ -93,7 +96,8 @@ class TestDSPyJudgeAdapterOwnLM:
        assert score == 0.8
        assert feedback == "Good response."
-    def test_does_not_use_global_lm(self) -> None:
+    @pytest.mark.asyncio
    async def test_does_not_use_global_lm(self) -> None:
        judge_lm = dspy.utils.DummyLM(
            [{"reasoning": "ok", "score": "0.9", "feedback": "Judge-specific response"}]
        )
@@ -101,14 +105,15 @@ class TestDSPyJudgeAdapterOwnLM:
        dspy.configure(lm=global_lm)
        adapter = DSPyJudgeAdapter(lm=judge_lm)
-        results = adapter.judge_batch("task", [("in", "out")])
+        results = await adapter.judge_batch("task", [("in", "out")])
        assert results[0][0] == 0.9
 class TestDSPyProposerAdapterOwnLM:
    """DSPyProposerAdapter must use its own LM instance."""
-    def test_uses_provided_lm(self, proposer_lm: dspy.LM) -> None:
+    @pytest.mark.asyncio
    async def test_uses_provided_lm(self, proposer_lm: dspy.LM) -> None:
        adapter = DSPyProposerAdapter(lm=proposer_lm)
        trajectories = [
            Trajectory(
@@ -119,14 +124,15 @@ class TestDSPyProposerAdapterOwnLM:
                prompt_used="old prompt",
            )
        ]
-        result = adapter.propose(
+        result = await adapter.propose(
            current_prompt=Prompt(text="old prompt"),
            trajectories=trajectories,
            task_description="Test task",
        )
        assert "Improved prompt" in result.text
-    def test_does_not_use_global_lm(self) -> None:
+    @pytest.mark.asyncio
    async def test_does_not_use_global_lm(self) -> None:
        proposer_lm = dspy.utils.DummyLM(
            [{"reasoning": "ok", "new_instruction": "proposer-specific"}]
        )
@@ -136,7 +142,7 @@ class TestDSPyProposerAdapterOwnLM:
        dspy.configure(lm=global_lm)
        adapter = DSPyProposerAdapter(lm=proposer_lm)
-        result = adapter.propose(
+        result = await adapter.propose(
            current_prompt=Prompt(text="test"),
            trajectories=[],
            task_description="task",
--- a/tests/unit/test_error_handling.py
+++ b/tests/unit/test_error_handling.py
@@ -1,7 +1,7 @@
 """Unit tests for error handling: retry, circuit breaker, per-call isolation."""
 from __future__ import annotations
-from unittest.mock import MagicMock, patch
+from unittest.mock import AsyncMock, MagicMock, patch
 import pytest
@@ -96,7 +96,8 @@ def _make_eval_result(scores, feedbacks=None):
 class TestCircuitBreaker:
-    def test_trips_on_consecutive_failures(self):
+    @pytest.mark.asyncio
    async def test_trips_on_consecutive_failures(self):
        """Loop stops when consecutive failures reach the threshold."""
        initial_eval = _make_eval_result([0.3, 0.4])
        evaluator = MagicMock()
@@ -109,8 +110,9 @@ class TestCircuitBreaker:
                return initial_eval  # seed eval succeeds
            raise RuntimeError("LLM down")
-        evaluator.evaluate.side_effect = _evaluate
+        evaluator.evaluate = AsyncMock(side_effect=_evaluate)
        proposer = MagicMock()
        proposer.propose = AsyncMock()
        bootstrap = MagicMock(spec=SyntheticBootstrap)
        loop = EvolutionLoop(
@@ -123,7 +125,7 @@ class TestCircuitBreaker:
            error_strategy="skip",
        )
        with patch.object(loop, "_log"):
-            state = loop.run(
+            state = await loop.run(
                Prompt("test"),
                [SyntheticExample("in", id=0), SyntheticExample("in2", id=1)],
                "task",
@@ -135,7 +137,8 @@ class TestCircuitBreaker:
        assert len(cb_events) == 1
        assert state.iteration < 10  # stopped early
-    def test_abort_raises_on_first_error(self):
+    @pytest.mark.asyncio
    async def test_abort_raises_on_first_error(self):
        """With error_strategy=abort, the first error raises immediately."""
        initial_eval = _make_eval_result([0.3, 0.4])
        evaluator = MagicMock()
@@ -148,8 +151,9 @@ class TestCircuitBreaker:
                return initial_eval
            raise RuntimeError("LLM down")
-        evaluator.evaluate.side_effect = _evaluate
+        evaluator.evaluate = AsyncMock(side_effect=_evaluate)
        proposer = MagicMock()
        proposer.propose = AsyncMock()
        bootstrap = MagicMock(spec=SyntheticBootstrap)
        loop = EvolutionLoop(
@@ -163,13 +167,14 @@ class TestCircuitBreaker:
        )
        with patch.object(loop, "_log"):
            with pytest.raises(RuntimeError, match="LLM down"):
-                loop.run(
+                await loop.run(
                    Prompt("test"),
                    [SyntheticExample("in", id=0), SyntheticExample("in2", id=1)],
                    "task",
                )
-    def test_resets_on_success(self):
+    @pytest.mark.asyncio
    async def test_resets_on_success(self):
        """Consecutive failure counter resets after a successful iteration."""
        initial_eval = _make_eval_result([0.3, 0.4])
        good_eval = _make_eval_result([0.8, 0.9])
@@ -194,9 +199,9 @@ class TestCircuitBreaker:
                return initial_eval  # current eval
            return good_eval  # new eval
-        evaluator.evaluate.side_effect = _evaluate
+        evaluator.evaluate = AsyncMock(side_effect=_evaluate)
        proposer = MagicMock()
-        proposer.propose.return_value = Prompt("better prompt")
+        proposer.propose = AsyncMock(return_value=Prompt("better prompt"))
        bootstrap = MagicMock(spec=SyntheticBootstrap)
        bootstrap.sample_minibatch.return_value = [
            SyntheticExample(f"in{i}", id=i) for i in range(2)
@@ -212,7 +217,7 @@ class TestCircuitBreaker:
            error_strategy="skip",
        )
        with patch.object(loop, "_log"):
-            state = loop.run(
+            state = await loop.run(
                Prompt("test"),
                [SyntheticExample("in", id=0), SyntheticExample("in2", id=1)],
                "task",
@@ -230,23 +235,24 @@ class TestCircuitBreaker:
 class TestPerCallIsolation:
-    def test_evaluator_isolates_execution_failure(self):
+    @pytest.mark.asyncio
    async def test_evaluator_isolates_execution_failure(self):
        """A failing execution produces a sentinel output, not a crash."""
        executor = MagicMock()
-        executor.execute.side_effect = [
+        executor.execute = AsyncMock(side_effect=[
            "good output",
            RuntimeError("API error"),
            "another good output",
-        ]
+        ])
        judge = MagicMock()
-        judge.judge_batch.return_value = [
+        judge.judge_batch = AsyncMock(return_value=[
            (0.8, "good"),
            (0.0, "[judge error]"),
            (0.7, "ok"),
-        ]
+        ])
        evaluator = PromptEvaluator(executor, judge)
-        result = evaluator.evaluate(
+        result = await evaluator.evaluate(
            Prompt("test"),
            [
                SyntheticExample("in0", id=0),
@@ -261,7 +267,8 @@ class TestPerCallIsolation:
        assert "execution error" in result.trajectories[1].output_text
        assert result.scores[0] == 0.8  # other items unaffected
-    def test_judge_adapter_isolates_single_failure(self):
+    @pytest.mark.asyncio
    async def test_judge_adapter_isolates_single_failure(self):
        """DSPyJudgeAdapter returns sentinel for a failed item, not crash."""
        from prometheus.infrastructure.judge_adapter import DSPyJudgeAdapter
@@ -269,6 +276,7 @@ class TestPerCallIsolation:
        adapter._lm = MagicMock()
        adapter._max_retries = 1
        adapter._retry_delay_base = 0
        adapter._semaphore = __import__("asyncio").Semaphore(5)
        # Mock _judge to fail on first call, succeed on second
        call_count = 0
@@ -289,9 +297,10 @@ class TestPerCallIsolation:
        with patch("prometheus.infrastructure.judge_adapter.dspy.context"):
            with patch(
-                "prometheus.infrastructure.retry.time.sleep"
+                "prometheus.infrastructure.retry.asyncio.sleep",
                new=AsyncMock(),
            ):
-                results = adapter.judge_batch(
+                results = await adapter.judge_batch(
                    "task", [("input1", "output1"), ("input2", "output2")]
                )
--- a/tests/unit/test_evaluator.py
+++ b/tests/unit/test_evaluator.py
@@ -1,7 +1,7 @@
 """Unit tests for PromptEvaluator.evaluate()."""
 from __future__ import annotations
-from unittest.mock import MagicMock
+from unittest.mock import AsyncMock, MagicMock
 import pytest
@@ -14,22 +14,23 @@ class TestPromptEvaluatorEvaluate:
    """Tests for the evaluate() pipeline: execute → judge → trajectories."""
    @pytest.fixture
-    def executor(self) -> MagicMock:
+    def executor(self) -> AsyncMock:
-        return MagicMock(spec=LLMPort)
+        return AsyncMock(spec=LLMPort)
    @pytest.fixture
-    def judge(self) -> MagicMock:
+    def judge(self) -> AsyncMock:
-        return MagicMock(spec=JudgePort)
+        return AsyncMock(spec=JudgePort)
    @pytest.fixture
-    def evaluator(self, executor: MagicMock, judge: MagicMock) -> PromptEvaluator:
+    def evaluator(self, executor: AsyncMock, judge: AsyncMock) -> PromptEvaluator:
        return PromptEvaluator(executor=executor, judge=judge)
-    def test_happy_path_builds_correct_trajectories(
+    @pytest.mark.asyncio
    async def test_happy_path_builds_correct_trajectories(
        self,
        evaluator: PromptEvaluator,
-        executor: MagicMock,
+        executor: AsyncMock,
-        judge: MagicMock,
+        judge: AsyncMock,
    ) -> None:
        prompt = Prompt(text="Answer the question.")
        examples = [
@@ -42,7 +43,7 @@ class TestPromptEvaluatorEvaluate:
            (0.8, "Mostly correct."),
        ]
-        result = evaluator.evaluate(prompt, examples, "math and geography")
+        result = await evaluator.evaluate(prompt, examples, "math and geography")
        assert isinstance(result, EvalResult)
        assert result.scores == [0.9, 0.8]
@@ -55,14 +56,15 @@ class TestPromptEvaluatorEvaluate:
        assert result.trajectories[0].prompt_used == "Answer the question."
        assert result.trajectories[1].prompt_used == "Answer the question."
-    def test_empty_minibatch_returns_empty_result(
+    @pytest.mark.asyncio
    async def test_empty_minibatch_returns_empty_result(
        self,
        evaluator: PromptEvaluator,
-        executor: MagicMock,
+        executor: AsyncMock,
-        judge: MagicMock,
+        judge: AsyncMock,
    ) -> None:
        prompt = Prompt(text="test")
-        result = evaluator.evaluate(prompt, [], "task")
+        result = await evaluator.evaluate(prompt, [], "task")
        assert result.scores == []
        assert result.feedbacks == []
@@ -71,41 +73,44 @@ class TestPromptEvaluatorEvaluate:
        # judge_batch is called with empty pairs list
        judge.judge_batch.assert_called_once_with("task", [])
-    def test_executor_called_with_correct_prompt(
+    @pytest.mark.asyncio
    async def test_executor_called_with_correct_prompt(
        self,
        evaluator: PromptEvaluator,
-        executor: MagicMock,
+        executor: AsyncMock,
-        judge: MagicMock,
+        judge: AsyncMock,
    ) -> None:
        prompt = Prompt(text="Summarize this.")
        examples = [SyntheticExample(input_text="Long text here", id=0)]
        executor.execute.return_value = "Summary."
        judge.judge_batch.return_value = [(0.7, "Good summary.")]
-        evaluator.evaluate(prompt, examples, "summarization")
+        await evaluator.evaluate(prompt, examples, "summarization")
        executor.execute.assert_called_once_with(prompt, "Long text here")
-    def test_trajectories_prompt_used_matches_input_prompt(
+    @pytest.mark.asyncio
    async def test_trajectories_prompt_used_matches_input_prompt(
        self,
        evaluator: PromptEvaluator,
-        executor: MagicMock,
+        executor: AsyncMock,
-        judge: MagicMock,
+        judge: AsyncMock,
    ) -> None:
        prompt = Prompt(text="Translate to French.")
        examples = [SyntheticExample(input_text="Hello", id=0)]
        executor.execute.return_value = "Bonjour"
        judge.judge_batch.return_value = [(1.0, "Perfect.")]
-        result = evaluator.evaluate(prompt, examples, "translation")
+        result = await evaluator.evaluate(prompt, examples, "translation")
        assert result.trajectories[0].prompt_used == "Translate to French."
-    def test_scores_feedbacks_trajectories_lists_sized_correctly(
+    @pytest.mark.asyncio
    async def test_scores_feedbacks_trajectories_lists_sized_correctly(
        self,
        evaluator: PromptEvaluator,
-        executor: MagicMock,
+        executor: AsyncMock,
-        judge: MagicMock,
+        judge: AsyncMock,
    ) -> None:
        prompt = Prompt(text="test prompt")
        examples = [SyntheticExample(input_text=f"q{i}", id=i) for i in range(4)]
@@ -114,7 +119,7 @@ class TestPromptEvaluatorEvaluate:
            (0.1 * i, f"fb{i}") for i in range(4)
        ]
-        result = evaluator.evaluate(prompt, examples, "task")
+        result = await evaluator.evaluate(prompt, examples, "task")
        assert len(result.scores) == 4
        assert len(result.feedbacks) == 4
--- a/tests/unit/test_evolution.py
+++ b/tests/unit/test_evolution.py
@@ -1,7 +1,9 @@
 """Unit tests for the evolution loop — with full mocking."""
 from __future__ import annotations
-from unittest.mock import MagicMock, patch
+from unittest.mock import AsyncMock, MagicMock, patch
 import pytest
 from prometheus.application.bootstrap import SyntheticBootstrap
 from prometheus.application.evaluator import PromptEvaluator
@@ -10,14 +12,15 @@ from prometheus.domain.entities import EvalResult, Prompt, SyntheticExample, Tra
 class TestEvolutionLoop:
-    def test_accepts_improvement(
+    @pytest.mark.asyncio
    async def test_accepts_improvement(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
-        mock_llm_port: MagicMock,
+        mock_llm_port: AsyncMock,
-        mock_judge_port: MagicMock,
+        mock_judge_port: AsyncMock,
-        mock_proposer_port: MagicMock,
+        mock_proposer_port: AsyncMock,
    ) -> None:
        """When the new prompt improves the score, the best candidate is updated."""
        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
@@ -45,7 +48,7 @@ class TestEvolutionLoop:
            feedbacks=["good"] * 5,
            trajectories=[],
        )
-        evaluator.evaluate = MagicMock(side_effect=[initial_eval, old_eval, new_eval])
+        evaluator.evaluate = AsyncMock(side_effect=[initial_eval, old_eval, new_eval])
        loop = EvolutionLoop(
            evaluator=evaluator,
@@ -55,19 +58,20 @@ class TestEvolutionLoop:
            minibatch_size=5,
        )
        with patch.object(loop, "_log"):
-            state = loop.run(seed_prompt, synthetic_pool, task_description)
+            state = await loop.run(seed_prompt, synthetic_pool, task_description)
        assert state.best_candidate is not None
        assert state.best_candidate.best_score > 0
-    def test_rejects_regression(
+    @pytest.mark.asyncio
    async def test_rejects_regression(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
-        mock_llm_port: MagicMock,
+        mock_llm_port: AsyncMock,
-        mock_judge_port: MagicMock,
+        mock_judge_port: AsyncMock,
-        mock_proposer_port: MagicMock,
+        mock_proposer_port: AsyncMock,
    ) -> None:
        """When the new prompt degrades the score, the best candidate stays unchanged."""
        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
@@ -95,7 +99,7 @@ class TestEvolutionLoop:
            feedbacks=["bad"] * 5,
            trajectories=[],
        )
-        evaluator.evaluate = MagicMock(side_effect=[initial_eval, old_eval, new_eval])
+        evaluator.evaluate = AsyncMock(side_effect=[initial_eval, old_eval, new_eval])
        loop = EvolutionLoop(
            evaluator=evaluator,
@@ -105,19 +109,20 @@ class TestEvolutionLoop:
            minibatch_size=5,
        )
        with patch.object(loop, "_log"):
-            state = loop.run(seed_prompt, synthetic_pool, task_description)
+            state = await loop.run(seed_prompt, synthetic_pool, task_description)
        assert state.best_candidate is not None
        assert state.best_candidate.prompt.text == seed_prompt.text
-    def test_skips_perfect_scores(
+    @pytest.mark.asyncio
    async def test_skips_perfect_scores(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
-        mock_llm_port: MagicMock,
+        mock_llm_port: AsyncMock,
-        mock_judge_port: MagicMock,
+        mock_judge_port: AsyncMock,
-        mock_proposer_port: MagicMock,
+        mock_proposer_port: AsyncMock,
    ) -> None:
        """When all scores are perfect, no proposition is made."""
        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
@@ -132,7 +137,7 @@ class TestEvolutionLoop:
                for i in range(5)
            ],
        )
-        evaluator.evaluate = MagicMock(return_value=perfect_eval)
+        evaluator.evaluate = AsyncMock(return_value=perfect_eval)
        loop = EvolutionLoop(
            evaluator=evaluator,
@@ -142,6 +147,6 @@ class TestEvolutionLoop:
            minibatch_size=5,
        )
        with patch.object(loop, "_log"):
-            loop.run(seed_prompt, synthetic_pool, task_description)
+            await loop.run(seed_prompt, synthetic_pool, task_description)
        mock_proposer_port.propose.assert_not_called()