Initial commit: PROMETHEUS v0.1.0 - Prompt optimizer

- Clean architecture (domain/application/infrastructure)
- DSPy-based evolution engine with scoring
- CLI via pyproject.toml entry point
- Unit + integration tests (~300 tests)
- Configs for glm-5.1 and glm-4.5-air models
- Z.AI endpoint integration
2026-03-29 11:44:03 +00:00
commit 837a44970f
49 changed files with 6599 additions and 0 deletions
--- a/src/prometheus/__init__.py
+++ b/src/prometheus/__init__.py
@@ -0,0 +1,3 @@
+"""PROMETHEUS — Prompt evolution without reference data."""
+
+# Package version string; AppSettings.version in config.py repeats the same value.
+__version__ = "0.1.0"
--- a/src/prometheus/application/__init__.py
+++ b/src/prometheus/application/__init__.py
--- a/src/prometheus/application/bootstrap.py
+++ b/src/prometheus/application/bootstrap.py
@@ -0,0 +1,42 @@
+"""
+Bootstrap — synthetic input generation.
+
+Creates a pool of test inputs from the task description.
+This replaces the need for a labelled dataset.
+"""
+from __future__ import annotations
+
+import random
+
+from prometheus.domain.entities import SyntheticExample
+from prometheus.domain.ports import SyntheticGeneratorPort
+
+
+class SyntheticBootstrap:
+    """Orchestrates synthetic input generation and minibatch sampling.
+
+    Depends only on the abstract port, not on DSPy directly. All randomness
+    goes through a single ``random.Random`` seeded at construction, so the
+    shuffle and the sampled minibatches are reproducible for a given seed.
+    """
+
+    def __init__(self, generator: SyntheticGeneratorPort, seed: int = 42):
+        self._generator = generator
+        self._rng = random.Random(seed)
+
+    def run(self, task_description: str, n_examples: int) -> list[SyntheticExample]:
+        """Generate the synthetic pool in a single call.
+
+        Single call minimizes LLM cost (1 call instead of N),
+        and the LLM can ensure diversity in a single generation.
+
+        Args:
+            task_description: Task description forwarded to the generator.
+            n_examples: Number of examples requested from the generator.
+
+        Returns:
+            A new list holding the generated examples in shuffled order.
+        """
+        # Copy before shuffling: shuffling in place would reorder a list the
+        # generator may still hold, and would fail on a tuple.
+        examples = list(
+            self._generator.generate_inputs(task_description, n_examples)
+        )
+        self._rng.shuffle(examples)
+        return examples
+
+    def sample_minibatch(
+        self,
+        pool: list[SyntheticExample],
+        size: int,
+    ) -> list[SyntheticExample]:
+        """Sample a minibatch (without replacement) from the synthetic pool.
+
+        Args:
+            pool: Examples to draw from.
+            size: Requested batch size; clamped to ``[0, len(pool)]``.
+
+        Returns:
+            A list of at most ``size`` distinct pool entries.
+        """
+        # random.sample raises ValueError for a negative size or a size
+        # larger than the population, so clamp on both sides.
+        size = max(0, min(size, len(pool)))
+        return self._rng.sample(pool, size)
--- a/src/prometheus/application/dto.py
+++ b/src/prometheus/application/dto.py
@@ -0,0 +1,47 @@
+"""Data Transfer Objects — configuration and results."""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+
+@dataclass
+class OptimizationConfig:
+    """Complete configuration for a PROMETHEUS run.
+
+    Only ``seed_prompt`` and ``task_description`` are required; every other
+    field carries a default.
+    """
+
+    # --- Prompt ---
+    seed_prompt: str  # text of the starting prompt to optimize
+    task_description: str  # task description given to bootstrap and the evolution loop
+
+    # --- Models ---
+    # NOTE(review): the CLI in this commit builds its dspy.LM from task_model
+    # only; confirm where the other three model names are consumed.
+    task_model: str = "openai/gpt-4o-mini"
+    judge_model: str = "openai/gpt-4o"
+    proposer_model: str = "openai/gpt-4o"
+    synth_model: str = "openai/gpt-4o"
+
+    # --- Evolution parameters ---
+    max_iterations: int = 30  # number of evolution iterations to run
+    n_synthetic_inputs: int = 20  # size of the synthetic pool requested at bootstrap
+    minibatch_size: int = 5  # examples sampled from the pool per evaluation
+    perfect_score: float = 1.0  # per-example score at or above which an output counts as perfect
+
+    # --- Reproducibility ---
+    seed: int = 42  # seed for the bootstrap's random number generator
+
+    # --- Output ---
+    output_path: str = "output.yaml"  # where the result YAML is written
+    verbose: bool = False  # enables progress logging
+
+
+@dataclass
+class OptimizationResult:
+    """Result of a complete optimization.
+
+    Built by ``OptimizePromptUseCase.execute`` from the final optimization
+    state and serialized by the CLI with ``dataclasses.asdict``.
+    """
+
+    optimized_prompt: str  # text of the best prompt found (seed prompt if none)
+    initial_prompt: str  # the seed prompt text from the config
+    iterations_used: int  # last iteration number reached by the loop
+    total_llm_calls: int  # loop call count plus one for the bootstrap call
+    initial_score: float  # score of the first recorded candidate (the seed)
+    final_score: float  # score of the best candidate
+    improvement: float  # final_score - initial_score
+    history: list[dict[str, Any]] = field(default_factory=list)  # per-iteration event records
--- a/src/prometheus/application/evaluator.py
+++ b/src/prometheus/application/evaluator.py
@@ -0,0 +1,75 @@
+"""
+Evaluator — execution + judgement.
+
+Produces a quality signal without ground truth.
+Combines candidate prompt execution + LLM-as-Judge evaluation.
+"""
+from __future__ import annotations
+
+from prometheus.domain.entities import (
+    EvalResult,
+    Prompt,
+    SyntheticExample,
+    Trajectory,
+)
+from prometheus.domain.ports import JudgePort, LLMPort
+
+
+class PromptEvaluator:
+    """Evaluates a prompt on a minibatch of synthetic inputs.
+
+    Pipeline: execute → judge → build trajectories.
+    Replaces GEPA's EvaluatorFn. Instead of comparing to ground truth,
+    uses an LLM-as-Judge.
+    """
+
+    def __init__(self, executor: LLMPort, judge: JudgePort):
+        self._executor = executor
+        self._judge = judge
+
+    def evaluate(
+        self,
+        prompt: Prompt,
+        minibatch: list[SyntheticExample],
+        task_description: str,
+    ) -> EvalResult:
+        """Evaluate the prompt on the minibatch.
+
+        Steps:
+        1. Execute the prompt on each input in the minibatch
+        2. Judge each (input, output) pair
+        3. Build trajectories with feedback
+
+        Args:
+            prompt: Candidate prompt to run.
+            minibatch: Synthetic examples to run the prompt on.
+            task_description: Task description handed to the judge.
+
+        Returns:
+            An ``EvalResult`` whose scores, feedbacks and trajectories are
+            in the same order as ``minibatch``.
+
+        Raises:
+            ValueError: If the judge returns a different number of results
+                than the number of pairs submitted.
+        """
+        # Step 1: Execution — one executor call per example, in order.
+        outputs: list[str] = [
+            self._executor.execute(prompt, example.input_text)
+            for example in minibatch
+        ]
+
+        # Step 2: Judgement
+        pairs = [(ex.input_text, out) for ex, out in zip(minibatch, outputs)]
+        judge_results = self._judge.judge_batch(task_description, pairs)
+        # Results are matched to pairs by position, so a length mismatch
+        # would attach scores to the wrong outputs; fail loudly instead.
+        if len(judge_results) != len(pairs):
+            raise ValueError(
+                f"Judge returned {len(judge_results)} results "
+                f"for {len(pairs)} pairs."
+            )
+
+        # Step 3: Build trajectories
+        scores: list[float] = []
+        feedbacks: list[str] = []
+        trajectories: list[Trajectory] = []
+        for (input_text, output), (score, feedback) in zip(pairs, judge_results):
+            scores.append(score)
+            feedbacks.append(feedback)
+            trajectories.append(
+                Trajectory(
+                    input_text=input_text,
+                    output_text=output,
+                    score=score,
+                    feedback=feedback,
+                    prompt_used=prompt.text,
+                )
+            )
+
+        return EvalResult(
+            scores=scores,
+            feedbacks=feedbacks,
+            trajectories=trajectories,
+        )
--- a/src/prometheus/application/evolution.py
+++ b/src/prometheus/application/evolution.py
@@ -0,0 +1,174 @@
+"""
+Evolution loop — core PROMETHEUS engine.
+
+Orchestrates the select → evaluate → propose → accept cycle.
+Equivalent to GEPAEngine.run(), adapted to work without a valset.
+"""
+from __future__ import annotations
+
+import logging
+
+from prometheus.application.bootstrap import SyntheticBootstrap
+from prometheus.application.evaluator import PromptEvaluator
+from prometheus.domain.entities import (
+    Candidate,
+    OptimizationState,
+    Prompt,
+    SyntheticExample,
+)
+from prometheus.domain.ports import ProposerPort
+from prometheus.domain.scoring import should_accept
+
+# Module-level logger; EvolutionLoop._log writes to it only when verbose is set.
+logger = logging.getLogger(__name__)
+
+
+class EvolutionLoop:
+    """Main evolution loop.
+
+    Design:
+    - Keeps only the best candidate (no full population).
+    - Simplifies vs GEPA (no Pareto, no merge).
+    - Population support deferred to v2.
+
+    Each iteration samples a minibatch, scores the current best prompt on
+    it, asks the proposer for a mutation, scores the mutation on the same
+    minibatch, and keeps the mutation only if ``should_accept`` approves.
+    """
+
+    def __init__(
+        self,
+        evaluator: PromptEvaluator,
+        proposer: ProposerPort,
+        bootstrap: SyntheticBootstrap,
+        max_iterations: int = 30,
+        minibatch_size: int = 5,
+        perfect_score: float = 1.0,
+        verbose: bool = False,
+    ):
+        self._evaluator = evaluator
+        self._proposer = proposer
+        self._bootstrap = bootstrap
+        self._max_iterations = max_iterations
+        self._minibatch_size = minibatch_size
+        self._perfect_score = perfect_score
+        self._verbose = verbose
+
+    def run(
+        self,
+        seed_prompt: Prompt,
+        synthetic_pool: list[SyntheticExample],
+        task_description: str,
+    ) -> OptimizationState:
+        """Execute the complete evolution loop.
+
+        Args:
+            seed_prompt: Starting prompt; becomes the generation-0 candidate.
+            synthetic_pool: Pool of synthetic inputs minibatches are drawn from.
+            task_description: Task description passed to evaluator and proposer.
+
+        Returns:
+            The final ``OptimizationState`` with the best candidate, all
+            accepted candidates, the event history and the LLM call count.
+
+        Notes:
+            An exception while evaluating the seed prompt propagates to the
+            caller. An exception inside an iteration is recorded in the
+            history as an ``"error"`` event and the loop moves on.
+            With an empty pool every minibatch is empty, so the "all scores
+            perfect" check is vacuously true and each iteration is recorded
+            as ``"skip_perfect"``.
+        """
+        state = OptimizationState()
+
+        # Evaluate the seed
+        initial_batch = self._bootstrap.sample_minibatch(
+            synthetic_pool, self._minibatch_size
+        )
+        initial_eval = self._evaluate(
+            state, seed_prompt, initial_batch, task_description
+        )
+
+        best_candidate = Candidate(
+            prompt=seed_prompt,
+            best_score=initial_eval.total_score,
+            generation=0,
+        )
+        state.best_candidate = best_candidate
+        state.candidates.append(best_candidate)
+        self._log(f"Initial score: {initial_eval.total_score:.2f}")
+
+        # Main loop
+        for i in range(1, self._max_iterations + 1):
+            state.iteration = i
+
+            try:
+                # 1. Sample a fresh minibatch
+                batch = self._bootstrap.sample_minibatch(
+                    synthetic_pool, self._minibatch_size
+                )
+
+                # 2. Evaluate the current candidate
+                current_eval = self._evaluate(
+                    state, best_candidate.prompt, batch, task_description
+                )
+
+                # 3. Skip if perfect
+                if all(s >= self._perfect_score for s in current_eval.scores):
+                    self._log(f"Iter {i}: All scores perfect, skipping.")
+                    state.history.append(
+                        {
+                            "iteration": i,
+                            "event": "skip_perfect",
+                            "current_score": current_eval.total_score,
+                        }
+                    )
+                    continue
+
+                # 4. Propose a new prompt (reflective mutation)
+                new_prompt = self._proposer.propose(
+                    best_candidate.prompt,
+                    current_eval.trajectories,
+                    task_description,
+                )
+                state.total_llm_calls += 1  # 1 proposition call
+
+                # 5. Evaluate the new prompt on the same minibatch
+                new_eval = self._evaluate(
+                    state, new_prompt, batch, task_description
+                )
+
+                # 6. Accept or reject
+                if should_accept(current_eval, new_eval):
+                    best_candidate = Candidate(
+                        prompt=new_prompt,
+                        best_score=new_eval.total_score,
+                        generation=i,
+                        # id() of the previous best Candidate object.
+                        parent_id=id(best_candidate),
+                    )
+                    state.best_candidate = best_candidate
+                    state.candidates.append(best_candidate)
+                    self._log(
+                        f"Iter {i}: ACCEPTED "
+                        f"({current_eval.total_score:.2f} -> {new_eval.total_score:.2f})"
+                    )
+                    state.history.append(
+                        {
+                            "iteration": i,
+                            "event": "accepted",
+                            "old_score": current_eval.total_score,
+                            "new_score": new_eval.total_score,
+                            "improvement": new_eval.total_score
+                            - current_eval.total_score,
+                        }
+                    )
+                else:
+                    self._log(
+                        f"Iter {i}: REJECTED "
+                        f"({new_eval.total_score:.2f} <= {current_eval.total_score:.2f})"
+                    )
+                    state.history.append(
+                        {
+                            "iteration": i,
+                            "event": "rejected",
+                            "old_score": current_eval.total_score,
+                            "new_score": new_eval.total_score,
+                        }
+                    )
+
+            except Exception as exc:
+                # Deliberate best-effort: one failed iteration (e.g. an LLM
+                # call error) is recorded and must not abort the whole run.
+                self._log(f"Iter {i}: ERROR — {exc}. Skipping iteration.")
+                state.history.append(
+                    {
+                        "iteration": i,
+                        "event": "error",
+                        "error": str(exc),
+                    }
+                )
+                continue
+
+        return state
+
+    def _evaluate(
+        self,
+        state: OptimizationState,
+        prompt: Prompt,
+        batch: list[SyntheticExample],
+        task_description: str,
+    ):
+        """Evaluate ``prompt`` on ``batch`` and add the LLM calls to ``state``."""
+        result = self._evaluator.evaluate(prompt, batch, task_description)
+        # Count from the real batch length: sample_minibatch clamps to the
+        # pool size, so the batch can be smaller than the configured size.
+        state.total_llm_calls += 2 * len(batch)  # N executions + N judge calls
+        return result
+
+    def _log(self, msg: str) -> None:
+        """Emit ``msg`` at INFO level, only when verbose mode is on."""
+        if self._verbose:
+            logger.info("[PROMETHEUS] %s", msg)
--- a/src/prometheus/application/use_cases.py
+++ b/src/prometheus/application/use_cases.py
@@ -0,0 +1,77 @@
+"""
+Main use case — high-level orchestration.
+
+Entry point for business logic. Coordinates bootstrap → evolution → result.
+Contains no technical logic, only orchestration.
+"""
+from __future__ import annotations
+
+from prometheus.application.bootstrap import SyntheticBootstrap
+from prometheus.application.dto import OptimizationConfig, OptimizationResult
+from prometheus.application.evaluator import PromptEvaluator
+from prometheus.application.evolution import EvolutionLoop
+from prometheus.domain.entities import Prompt
+from prometheus.domain.ports import ProposerPort
+
+
+class OptimizePromptUseCase:
+    """The single use case of the MVP: optimize one prompt end to end.
+
+    Collaborators are handed in through the constructor (dependency
+    injection); this class only wires them together.
+    """
+
+    def __init__(
+        self,
+        evaluator: PromptEvaluator,
+        proposer: ProposerPort,
+        bootstrap: SyntheticBootstrap,
+    ):
+        self._evaluator = evaluator
+        self._proposer = proposer
+        self._bootstrap = bootstrap
+
+    def execute(self, config: OptimizationConfig) -> OptimizationResult:
+        """Run bootstrap, then evolution, then package the outcome.
+
+        Args:
+            config: Run configuration (prompt, task and loop parameters).
+
+        Returns:
+            An ``OptimizationResult`` summarizing the run.
+        """
+        # Phase 0: build the synthetic input pool.
+        pool = self._bootstrap.run(
+            task_description=config.task_description,
+            n_examples=config.n_synthetic_inputs,
+        )
+
+        # Phase 1: run the evolution loop from the seed prompt.
+        engine = EvolutionLoop(
+            evaluator=self._evaluator,
+            proposer=self._proposer,
+            bootstrap=self._bootstrap,
+            max_iterations=config.max_iterations,
+            minibatch_size=config.minibatch_size,
+            perfect_score=config.perfect_score,
+            verbose=config.verbose,
+        )
+        state = engine.run(
+            Prompt(text=config.seed_prompt), pool, config.task_description
+        )
+
+        # Phase 2: derive the summary figures, falling back to neutral
+        # values when the state holds no candidates.
+        if state.candidates:
+            initial_score = state.candidates[0].best_score
+        else:
+            initial_score = 0.0
+
+        if state.best_candidate:
+            final_score = state.best_candidate.best_score
+        else:
+            final_score = 0.0
+
+        if state.best_candidate:
+            optimized_text = state.best_candidate.prompt.text
+        else:
+            optimized_text = config.seed_prompt
+
+        return OptimizationResult(
+            optimized_prompt=optimized_text,
+            initial_prompt=config.seed_prompt,
+            iterations_used=state.iteration,
+            total_llm_calls=state.total_llm_calls + 1,  # the bootstrap call
+            initial_score=initial_score,
+            final_score=final_score,
+            improvement=final_score - initial_score,
+            history=state.history,
+        )
--- a/src/prometheus/cli/__init__.py
+++ b/src/prometheus/cli/__init__.py
--- a/src/prometheus/cli/app.py
+++ b/src/prometheus/cli/app.py
@@ -0,0 +1,168 @@
+"""
+CLI — user entry point.
+
+Typer interface with -i (input) and -o (output) options.
+"""
+from __future__ import annotations
+
+import logging
+import os
+from dataclasses import asdict
+
+import dspy
+import typer
+from rich.console import Console
+from rich.panel import Panel
+from rich.table import Table
+
+from prometheus.application.bootstrap import SyntheticBootstrap
+from prometheus.application.dto import OptimizationConfig, OptimizationResult
+from prometheus.application.evaluator import PromptEvaluator
+from prometheus.application.use_cases import OptimizePromptUseCase
+from prometheus.infrastructure.file_io import YamlPersistence
+from prometheus.infrastructure.judge_adapter import DSPyJudgeAdapter
+from prometheus.infrastructure.llm_adapter import DSPyLLMAdapter
+from prometheus.infrastructure.proposer_adapter import DSPyProposerAdapter
+from prometheus.infrastructure.synth_adapter import DSPySyntheticAdapter
+
+# Typer application object; commands are registered on it with @app.command().
+app = typer.Typer(
+    name="prometheus",
+    help="PROMETHEUS — Prompt evolution without reference data.",
+    no_args_is_help=True,
+)
+
+# Shared Rich console used for all terminal output in this module.
+console = Console()
+
+
+@app.command()
+def optimize(
+    input: str = typer.Option(
+        ...,
+        "-i",
+        "--input",
+        help="Path to input YAML config file.",
+        exists=True,
+        readable=True,
+    ),
+    output: str = typer.Option(
+        "output.yaml",
+        "-o",
+        "--output",
+        help="Path to output YAML result file.",
+    ),
+    verbose: bool = typer.Option(
+        False,
+        "-v",
+        "--verbose",
+        help="Print detailed progress.",
+    ),
+) -> None:
+    """Optimize a prompt without any reference data.
+
+    Usage:
+        prometheus optimize -i config.yaml -o result.yaml
+    """
+    # The docstring above doubles as the command's help text, so
+    # implementation notes live in comments instead.
+    # Flow: load YAML config -> configure DSPy -> wire adapters -> run the
+    # use case -> print a summary -> write the result YAML.
+
+    # Configure verbose logging
+    if verbose:
+        logging.basicConfig(level=logging.INFO, format="[PROMETHEUS] %(message)s")
+
+    console.print(
+        Panel.fit(
+            "PROMETHEUS — Prompt Evolution Engine",
+            subtitle="No reference data required",
+        )
+    )
+
+    # 1. Load config
+    persistence = YamlPersistence()
+    raw_config = persistence.read_config(input)
+    # An empty YAML document loads as None and a list/scalar document is not
+    # a mapping; report both as a usage error rather than a traceback.
+    if not isinstance(raw_config, dict):
+        raise typer.BadParameter(
+            "Config file must contain a YAML mapping.", param_hint="--input"
+        )
+    missing = [
+        key for key in ("seed_prompt", "task_description") if key not in raw_config
+    ]
+    if missing:
+        raise typer.BadParameter(
+            f"Config file is missing required key(s): {', '.join(missing)}.",
+            param_hint="--input",
+        )
+    config = OptimizationConfig(
+        seed_prompt=raw_config["seed_prompt"],
+        task_description=raw_config["task_description"],
+        task_model=raw_config.get("task_model", "openai/gpt-4o-mini"),
+        judge_model=raw_config.get("judge_model", "openai/gpt-4o"),
+        proposer_model=raw_config.get("proposer_model", "openai/gpt-4o"),
+        synth_model=raw_config.get("synth_model", "openai/gpt-4o"),
+        max_iterations=raw_config.get("max_iterations", 30),
+        n_synthetic_inputs=raw_config.get("n_synthetic_inputs", 20),
+        minibatch_size=raw_config.get("minibatch_size", 5),
+        # Previously never read from the file, so a configured value was
+        # silently ignored; the default matches OptimizationConfig.
+        perfect_score=raw_config.get("perfect_score", 1.0),
+        seed=raw_config.get("seed", 42),
+        output_path=output,
+        verbose=verbose,
+    )
+    console.print(f"[dim]Task: {config.task_description[:80]}...[/dim]")
+    console.print(f"[dim]Seed prompt: {config.seed_prompt[:80]}...[/dim]")
+
+    # 2. Configure DSPy with optional api_base/api_key from config
+    # NOTE(review): only task_model is turned into a dspy.LM here; confirm
+    # whether judge/proposer/synth models are meant to get their own LM.
+    lm_kwargs: dict = {}
+    api_base = raw_config.get("api_base")
+    api_key_env = raw_config.get("api_key_env")
+    if api_base:
+        lm_kwargs["api_base"] = api_base
+    if api_key_env:
+        if api_key_env not in os.environ:
+            # Keep the original behaviour (empty key) but make the likely
+            # misconfiguration visible before the first LLM call fails.
+            console.print(
+                f"[yellow]Warning: environment variable {api_key_env} is not set.[/yellow]"
+            )
+        lm_kwargs["api_key"] = os.environ.get(api_key_env, "")
+    task_lm = dspy.LM(config.task_model, **lm_kwargs)
+    dspy.configure(lm=task_lm)
+
+    # 3. Build adapters (Dependency Injection)
+    synth_adapter = DSPySyntheticAdapter()
+    llm_adapter = DSPyLLMAdapter(model=config.task_model)
+    judge_adapter = DSPyJudgeAdapter()
+    proposer_adapter = DSPyProposerAdapter()
+    bootstrap = SyntheticBootstrap(generator=synth_adapter, seed=config.seed)
+    evaluator = PromptEvaluator(executor=llm_adapter, judge=judge_adapter)
+    use_case = OptimizePromptUseCase(
+        evaluator=evaluator,
+        proposer=proposer_adapter,
+        bootstrap=bootstrap,
+    )
+
+    # 4. Execute
+    with console.status("[bold green]Evolving prompt..."):
+        result = use_case.execute(config)
+
+    # 5. Display results
+    _display_result(result)
+
+    # 6. Save
+    _save_result(persistence, output, result)
+    console.print(f"\n[green]Results saved to {output}[/green]")
+
+
+def _display_result(result: OptimizationResult) -> None:
+    """Print the optimized prompt and a metrics table to the terminal.
+
+    Args:
+        result: Outcome of the optimization run to summarize.
+    """
+    console.print()
+
+    # Panel with the final prompt text.
+    prompt_body = (
+        f"[bold green]Optimized Prompt[/bold green]\n\n{result.optimized_prompt}"
+    )
+    console.print(Panel(prompt_body, title="Result"))
+
+    # Two-column table, one row per metric, in display order.
+    metrics = Table(title="Metrics")
+    metrics.add_column("Metric", style="cyan")
+    metrics.add_column("Value", style="bold")
+    rows = (
+        ("Initial Score", f"{result.initial_score:.2f}"),
+        ("Final Score", f"{result.final_score:.2f}"),
+        ("Improvement", f"{result.improvement:+.2f}"),
+        ("Iterations", str(result.iterations_used)),
+        ("LLM Calls", str(result.total_llm_calls)),
+    )
+    for label, value in rows:
+        metrics.add_row(label, value)
+    console.print(metrics)
+
+
+def _save_result(
+    persistence: YamlPersistence,
+    path: str,
+    result: OptimizationResult,
+) -> None:
+    """Convert ``result`` to a plain dict and write it through ``persistence``.
+
+    Args:
+        persistence: Writer exposing ``write_result(path, data)``.
+        path: Destination file path.
+        result: Dataclass instance to serialize.
+    """
+    payload = asdict(result)
+    persistence.write_result(path, payload)
+
+
+# Hidden no-op command: registering a second command keeps Typer in
+# multi-command mode, so "optimize" must be named on the command line.
+@app.command(hidden=True)
+def _help() -> None:
+    """Internal placeholder to force multi-command Typer behavior."""
+
+
+# Allow running this module directly as a script.
+if __name__ == "__main__":
+    app()
--- a/src/prometheus/config.py
+++ b/src/prometheus/config.py
@@ -0,0 +1,12 @@
+"""Application settings."""
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass
+class AppSettings:
+    """Non-sensitive settings, hardcoded for the MVP."""
+
+    app_name: str = "prometheus"  # application name
+    # Same value as the package-level __version__; keep the two in sync.
+    version: str = "0.1.0"
--- a/src/prometheus/domain/__init__.py
+++ b/src/prometheus/domain/__init__.py
--- a/src/prometheus/domain/entities.py
+++ b/src/prometheus/domain/entities.py
@@ -0,0 +1,87 @@
+"""Domain entities — pure data, zero dependencies."""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+
+@dataclass(frozen=True)
+class Prompt:
+    """Represents a candidate prompt.
+
+    frozen=True → immutable, safe for Pareto tracking.
+
+    Instances are hashable: the hash is derived from ``text`` only, while
+    equality still compares both ``text`` and ``metadata``.
+
+    Attributes:
+        text: The prompt text.
+        metadata: Free-form extra data attached to the prompt.
+    """
+
+    text: str
+    # hash=False: a dict is unhashable, so including it in the generated
+    # __hash__ would make hash(prompt) raise TypeError and prevent use in
+    # sets or as dict keys. Equal prompts still hash equal (same text).
+    metadata: dict[str, Any] = field(default_factory=dict, hash=False)
+
+    def __len__(self) -> int:
+        """Return the length of the prompt text in characters.
+
+        Because ``__len__`` is defined, a prompt with empty text is falsy.
+        """
+        return len(self.text)
+
+
+@dataclass(frozen=True)
+class SyntheticExample:
+    """A synthetic example: an input generated from the task description.
+
+    No expected output — the judge will evaluate the output directly.
+    Immutable (frozen dataclass).
+    """
+
+    input_text: str  # the input text that a candidate prompt is run on
+    category: str = "default"  # for future stratified sampling
+    id: int = 0  # example identifier; defaults to 0
+
+
+@dataclass
+class Trajectory:
+    """Execution trace of a prompt on an input.
+
+    Used by reflective mutation to understand failures. Built by
+    ``PromptEvaluator.evaluate``, one per minibatch example.
+    """
+
+    input_text: str  # the example input the prompt was run on
+    output_text: str  # raw output returned by the executor
+    score: float  # score assigned by the judge
+    feedback: str  # textual feedback from the judge
+    prompt_used: str  # text of the prompt that produced the output
+
+
+@dataclass
+class EvalResult:
+    """Outcome of scoring one prompt on one minibatch.
+
+    The three lists are parallel: index ``i`` of each refers to the same
+    minibatch example.
+    """
+
+    scores: list[float]
+    feedbacks: list[str]
+    trajectories: list[Trajectory]
+
+    @property
+    def total_score(self) -> float:
+        """Sum of all per-example scores (0 for an empty result)."""
+        return sum(self.scores)
+
+    @property
+    def mean_score(self) -> float:
+        """Average per-example score, or 0.0 when there are no scores."""
+        if not self.scores:
+            return 0.0
+        return sum(self.scores) / len(self.scores)
+
+
+@dataclass
+class Candidate:
+    """A candidate in the evolution pool.
+
+    Contains the prompt, the total score it obtained on the minibatch it was
+    evaluated on when created, and lineage information.
+    """
+
+    prompt: Prompt  # the prompt this candidate carries
+    best_score: float = 0.0  # total minibatch score recorded at creation
+    generation: int = 0  # at which iteration it was created
+    # The evolution loop stores id() of the parent Candidate object here, so
+    # the value is only meaningful within the process that produced it.
+    parent_id: int | None = None
+
+
+@dataclass
+class OptimizationState:
+    """Complete optimization state — serializable snapshot."""
+
+    iteration: int = 0  # number of the last iteration started
+    best_candidate: Candidate | None = None  # current best candidate, if any
+    candidates: list[Candidate] = field(default_factory=list)  # seed plus every accepted candidate
+    # NOTE(review): nothing in the evolution loop shown in this commit
+    # assigns this field — confirm whether it is populated elsewhere.
+    synthetic_pool: list[SyntheticExample] = field(default_factory=list)
+    history: list[dict[str, Any]] = field(default_factory=list)  # one event dict per iteration
+    total_llm_calls: int = 0  # running count of LLM calls made by the loop
--- a/src/prometheus/domain/ports.py
+++ b/src/prometheus/domain/ports.py
@@ -0,0 +1,85 @@
+"""
+Domain ports — abstract interfaces that infrastructure implements.
+Uses ABC (abstract base classes) for the loose coupling.
+"""
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+from typing import Any
+
+from prometheus.domain.entities import Prompt, SyntheticExample, Trajectory
+
+
+class LLMPort(ABC):
+    """Abstract executor that runs a prompt against a single input.
+
+    The infrastructure layer supplies the concrete implementation via DSPy.
+    """
+
+    @abstractmethod
+    def execute(self, prompt: Prompt, input_text: str) -> str:
+        """Run ``prompt`` on ``input_text`` and return the raw response."""
+
+
+class JudgePort(ABC):
+    """Abstract LLM-as-Judge evaluator.
+
+    Receives the task description together with (input, output) pairs and
+    yields one score plus textual feedback for every pair.
+    """
+
+    @abstractmethod
+    def judge_batch(
+        self,
+        task_description: str,
+        pairs: list[tuple[str, str]],
+    ) -> list[tuple[float, str]]:
+        """Score each (input, output) pair.
+
+        Returns:
+            A list of ``(score, feedback)`` tuples.
+        """
+
+
+class ProposerPort(ABC):
+    """Abstract proposer of improved prompts.
+
+    Implementations draw on evaluation trajectories to suggest a better
+    prompt.
+    """
+
+    @abstractmethod
+    def propose(
+        self,
+        current_prompt: Prompt,
+        trajectories: list[Trajectory],
+        task_description: str,
+    ) -> Prompt:
+        """Return a new prompt derived from the failure trajectories."""
+
+
+class SyntheticGeneratorPort(ABC):
+    """Abstract source of synthetic inputs."""
+
+    @abstractmethod
+    def generate_inputs(
+        self,
+        task_description: str,
+        n_examples: int,
+    ) -> list[SyntheticExample]:
+        """Produce ``n_examples`` diverse synthetic inputs for the task."""
+
+
+class PersistencePort(ABC):
+    """Abstract reader/writer for config and result files."""
+
+    @abstractmethod
+    def read_config(self, path: str) -> dict[str, Any]:
+        """Load the configuration stored at ``path`` as a dictionary."""
+
+    @abstractmethod
+    def write_result(self, path: str, data: dict[str, Any]) -> None:
+        """Persist ``data`` to the file at ``path``."""
--- a/src/prometheus/domain/scoring.py
+++ b/src/prometheus/domain/scoring.py
@@ -0,0 +1,21 @@
+"""Scoring logic and acceptance criteria — pure domain."""
+from __future__ import annotations
+
+from prometheus.domain.entities import EvalResult
+
+
+def should_accept(
+    old_result: EvalResult,
+    new_result: EvalResult,
+    min_improvement: float = 0.0,
+) -> bool:
+    """Decide whether a new candidate replaces the current one.
+
+    The comparison is strict: the new total score must exceed the old total
+    score plus ``min_improvement``; a tie is rejected.
+    """
+    threshold = old_result.total_score + min_improvement
+    return new_result.total_score > threshold
+
+
+def normalize_score(raw: float, min_val: float = 0.0, max_val: float = 1.0) -> float:
+    """Clamp ``raw`` into the range ``[min_val, max_val]``."""
+    # Cap at the upper bound first, then lift to the lower bound.
+    capped = raw if raw < max_val else max_val
+    return capped if capped > min_val else min_val
--- a/src/prometheus/infrastructure/__init__.py
+++ b/src/prometheus/infrastructure/__init__.py
--- a/src/prometheus/infrastructure/dspy_modules.py
+++ b/src/prometheus/infrastructure/dspy_modules.py
@@ -0,0 +1,92 @@
+"""
+DSPy Modules — signature composition.
+
+Declarative LLM call orchestration via DSPy.
+"""
+from __future__ import annotations
+
+import json
+import re
+
+import dspy
+
+from prometheus.infrastructure.dspy_signatures import (
+    GenerateSyntheticInputs,
+    JudgeOutput,
+    ProposeInstruction,
+)
+
+
+class SyntheticInputGenerator(dspy.Module):
+    """Generates synthetic inputs in a single batch call.
+
+    Uses ChainOfThought for better diversity. The model is asked for a JSON
+    array of strings; ``forward`` parses that text leniently so malformed
+    output degrades to a best-effort list instead of raising.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.generate = dspy.ChainOfThought(GenerateSyntheticInputs)
+
+    def forward(self, task_description: str, n_examples: int) -> dspy.Prediction:
+        """Call the generator and return a prediction whose ``examples`` is a list.
+
+        Args:
+            task_description: Description of the task to generate inputs for.
+            n_examples: Number of examples requested from the model.
+
+        Returns:
+            ``dspy.Prediction(examples=[...])``.
+        """
+        result = self.generate(
+            task_description=task_description,
+            n_examples=n_examples,
+        )
+        return dspy.Prediction(examples=self._parse_examples(result.examples))
+
+    @classmethod
+    def _parse_examples(cls, raw: object) -> list:
+        """Turn the raw ``examples`` field into a list.
+
+        Tries, in order: an already-parsed list, the whole text as JSON, the
+        outermost ``[...]`` span as JSON (covers fenced or prefixed output),
+        and finally the quoted-string fallback.
+        """
+        if isinstance(raw, list):
+            return raw
+        # json.loads(None) raises TypeError, which the original did not
+        # catch; normalize to text first.
+        text = "" if raw is None else str(raw)
+
+        start, end = text.find("["), text.rfind("]")
+        attempts = [text]
+        if 0 <= start < end:
+            attempts.append(text[start : end + 1])
+        for attempt in attempts:
+            try:
+                parsed = json.loads(attempt)
+            except json.JSONDecodeError:
+                continue
+            # Valid JSON that is not an array (object, string, number) is
+            # not a usable example list; keep trying.
+            if isinstance(parsed, list):
+                return parsed
+        return cls._parse_fallback(text)
+
+    @staticmethod
+    def _parse_fallback(text: str) -> list[str]:
+        """Extract strings from non-JSON output.
+
+        Returns every double-quoted substring, or the whole text as a single
+        example when no quoted substring is found.
+        """
+        matches = re.findall(r'"([^"]+)"', text)
+        return matches if matches else [text]
+
+
+class OutputJudge(dspy.Module):
+    """Judges a single output. Called in batch by JudgeAdapter."""
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.judge = dspy.ChainOfThought(JudgeOutput)
+
+    def forward(
+        self, task_description: str, input_text: str, output_text: str
+    ) -> dspy.Prediction:
+        """Judge one (input, output) pair.
+
+        Args:
+            task_description: What the assistant is supposed to do.
+            input_text: The input given to the assistant.
+            output_text: The assistant's response to evaluate.
+
+        Returns:
+            ``dspy.Prediction(score=..., feedback=...)`` where ``score`` is a
+            float in ``[0.0, 1.0]``; an unparseable or NaN score becomes the
+            neutral value 0.5.
+        """
+        import math
+
+        result = self.judge(
+            task_description=task_description,
+            input_text=input_text,
+            output_text=output_text,
+        )
+        try:
+            score = float(result.score)
+        except (ValueError, TypeError):
+            score = 0.5  # neutral fallback
+        # float("nan") parses successfully, and max(0.0, min(1.0, nan))
+        # evaluates to 1.0 — a garbage score would count as perfect.
+        if math.isnan(score):
+            score = 0.5
+        score = max(0.0, min(1.0, score))
+        return dspy.Prediction(score=score, feedback=result.feedback)
+
+
+class InstructionProposer(dspy.Module):
+    """Suggests an improved prompt based on failure trajectories.
+
+    Plays the role of GEPA's InstructionProposalSignature.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.propose = dspy.ChainOfThought(ProposeInstruction)
+
+    def forward(
+        self,
+        current_instruction: str,
+        task_description: str,
+        failure_examples: str,
+    ) -> dspy.Prediction:
+        """Ask the model for a new instruction and return only that field.
+
+        Returns:
+            ``dspy.Prediction(new_instruction=...)``.
+        """
+        proposal = self.propose(
+            current_instruction=current_instruction,
+            task_description=task_description,
+            failure_examples=failure_examples,
+        )
+        improved = proposal.new_instruction
+        return dspy.Prediction(new_instruction=improved)
--- a/src/prometheus/infrastructure/dspy_signatures.py
+++ b/src/prometheus/infrastructure/dspy_signatures.py
@@ -0,0 +1,79 @@
+"""
+DSPy Signatures — declarative LLM contracts.
+
+Defines WHAT each LLM call does, not HOW.
+DSPy Signature = input_fields → output_fields + instruction.
+DSPy handles prompting, parsing, and structuring.
+"""
+from __future__ import annotations
+
+import dspy
+
+
+# Signature for the bootstrap step: task description + count -> examples.
+# The `examples` output is declared as a plain string; the calling module
+# (SyntheticInputGenerator.forward) parses it with json.loads.
+# NOTE(review): DSPy is understood to send the class docstring and the field
+# `desc` strings to the model as instructions, so treat edits to them as
+# behavior changes — confirm against the DSPy documentation.
+class GenerateSyntheticInputs(dspy.Signature):
+    """Generate diverse, realistic input examples for a given task."""
+
+    task_description: str = dspy.InputField(
+        desc="Description of the task the prompt should accomplish."
+    )
+    n_examples: int = dspy.InputField(
+        desc="Number of examples to generate."
+    )
+    examples: str = dspy.OutputField(
+        desc=(
+            "A JSON array of strings, each being a realistic input "
+            "for the task. Cover: normal cases, edge cases, long inputs, "
+            "short inputs, ambiguous cases, and tricky scenarios."
+        ),
+    )
+
+
+# Signature for the LLM-as-Judge step: (task, input, output) -> score + feedback.
+# The consuming module (OutputJudge.forward) coerces `score` with float() and
+# clamps it to [0.0, 1.0], so out-of-range model answers are tolerated.
+# NOTE(review): the docstring and `desc` strings below are presumably part of
+# the prompt DSPy builds — confirm before rewording them.
+class JudgeOutput(dspy.Signature):
+    """Evaluate the quality of an LLM output for a given task and input.
+
+    Score: 0.0 (completely wrong) to 1.0 (perfect).
+    Feedback: specific, actionable criticism.
+    """
+
+    task_description: str = dspy.InputField(
+        desc="What the assistant is supposed to do."
+    )
+    input_text: str = dspy.InputField(
+        desc="The input provided to the assistant."
+    )
+    output_text: str = dspy.InputField(
+        desc="The assistant's response to evaluate."
+    )
+    score: float = dspy.OutputField(
+        desc="Quality score from 0.0 (wrong) to 1.0 (perfect)."
+    )
+    feedback: str = dspy.OutputField(
+        desc=(
+            "Specific, actionable feedback explaining what's wrong "
+            "with the output and how to improve it. Be critical."
+        ),
+    )
+
+
+# Signature for the mutation step: (current prompt, task, failure examples)
+# -> improved prompt. Wrapped in dspy.ChainOfThought by InstructionProposer.
+# `failure_examples` is a single pre-formatted string, not a list.
+# NOTE(review): the docstring and `desc` strings below are presumably part of
+# the prompt DSPy builds — confirm before rewording them.
+class ProposeInstruction(dspy.Signature):
+    """Given a current prompt and examples of where it fails with feedback,
+    propose an improved version of the prompt.
+
+    The new prompt should address all the issues identified in the feedback.
+    """
+
+    current_instruction: str = dspy.InputField(
+        desc="The current prompt/instruction to improve."
+    )
+    task_description: str = dspy.InputField(
+        desc="Description of the task."
+    )
+    failure_examples: str = dspy.InputField(
+        desc=(
+            "Examples of inputs, outputs, scores, and feedback "
+            "showing where the current instruction fails."
+        ),
+    )
+    new_instruction: str = dspy.OutputField(
+        desc="An improved version of the instruction."
+    )
--- a/src/prometheus/infrastructure/file_io.py
+++ b/src/prometheus/infrastructure/file_io.py
@@ -0,0 +1,42 @@
+"""
+File I/O — read/write config and result files.
+
+Implements the PersistencePort with YAML.
+"""
+from __future__ import annotations
+
+from typing import Any
+
+import yaml
+
+from prometheus.domain.ports import PersistencePort
+
+
+class YamlPersistence(PersistencePort):
+    """Reads and writes YAML files as UTF-8 text."""
+
+    def read_config(self, path: str) -> dict[str, Any]:
+        """Load the YAML mapping stored at ``path``.
+
+        Returns an empty dict for an empty file, because ``yaml.safe_load``
+        yields ``None`` for an empty document and callers are promised a dict.
+
+        Raises:
+            ValueError: If the top-level YAML node is not a mapping.
+        """
+        with open(path, encoding="utf-8") as f:
+            loaded = yaml.safe_load(f)
+        if loaded is None:
+            return {}
+        if not isinstance(loaded, dict):
+            raise ValueError(
+                f"Expected a YAML mapping at the top level of {path!r}, "
+                f"got {type(loaded).__name__}"
+            )
+        return loaded
+
+    def write_result(self, path: str, data: dict[str, Any]) -> None:
+        """Dump ``data`` to ``path`` as block-style YAML, replacing any existing file."""
+        with open(path, "w", encoding="utf-8") as f:
+            # allow_unicode writes non-ASCII characters literally rather than escaped.
+            yaml.dump(data, f, default_flow_style=False, allow_unicode=True)
--- a/src/prometheus/infrastructure/judge_adapter.py
+++ b/src/prometheus/infrastructure/judge_adapter.py
@@ -0,0 +1,37 @@
+"""
+Adapter: LLM-as-Judge.
+
+Implements the JudgePort via the DSPy OutputJudge module.
+"""
+from __future__ import annotations
+
+from prometheus.domain.ports import JudgePort
+from prometheus.infrastructure.dspy_modules import OutputJudge
+
+
+class DSPyJudgeAdapter(JudgePort):
+    """Scores (input, output) pairs with the OutputJudge module, one call per pair.
+
+    Calls run sequentially for the MVP. Future: parallelize via dspy.Parallel.
+    """
+
+    def __init__(self) -> None:
+        self._judge = OutputJudge()
+
+    def judge_batch(
+        self,
+        task_description: str,
+        pairs: list[tuple[str, str]],
+    ) -> list[tuple[float, str]]:
+        """Return one ``(score, feedback)`` tuple per pair, in input order."""
+
+        def grade(source: str, response: str) -> tuple[float, str]:
+            # Score and feedback are passed through exactly as the judge reports them.
+            verdict = self._judge(
+                task_description=task_description,
+                input_text=source,
+                output_text=response,
+            )
+            return verdict.score, verdict.feedback
+
+        return [grade(source, response) for source, response in pairs]
--- a/src/prometheus/infrastructure/llm_adapter.py
+++ b/src/prometheus/infrastructure/llm_adapter.py
@@ -0,0 +1,34 @@
+"""
+Adapter: Execute a prompt on an input.
+
+Implements the LLMPort via DSPy.
+"""
+from __future__ import annotations
+
+import dspy
+
+from prometheus.domain.entities import Prompt
+from prometheus.domain.ports import LLMPort
+
+
+class DSPyLLMAdapter(LLMPort):
+    """Runs a prompt against an input through a ``dspy.Predict`` predictor."""
+
+    class _ExecuteSignature(dspy.Signature):
+        """Execute the instruction on the given input."""
+
+        # NOTE: the docstring above and the ``desc`` strings are prompt content for
+        # DSPy, so their wording is left untouched.
+        instruction: str = dspy.InputField(desc="The instruction/prompt to follow.")
+        input_text: str = dspy.InputField(desc="The input to process.")
+        output: str = dspy.OutputField(desc="The response following the instruction.")
+
+    def __init__(self, model: str) -> None:
+        # NOTE(review): ``model`` is never read in this class; presumably the LM is
+        # configured globally elsewhere — confirm whether it should be selected here.
+        self._predictor = dspy.Predict(self._ExecuteSignature)
+
+    def execute(self, prompt: Prompt, input_text: str) -> str:
+        """Run ``prompt.text`` on ``input_text`` and return the output as a string."""
+        prediction = self._predictor(instruction=prompt.text, input_text=input_text)
+        return str(prediction.output)
--- a/src/prometheus/infrastructure/proposer_adapter.py
+++ b/src/prometheus/infrastructure/proposer_adapter.py
@@ -0,0 +1,45 @@
+"""
+Adapter: Reflective Mutation Proposer.
+
+Implements the ProposerPort via the DSPy InstructionProposer.
+Converts trajectories into readable format for the LLM proposer.
+"""
+from __future__ import annotations
+
+from prometheus.domain.entities import Prompt, Trajectory
+from prometheus.domain.ports import ProposerPort
+from prometheus.infrastructure.dspy_modules import InstructionProposer
+
+
+class DSPyProposerAdapter(ProposerPort):
+    """Turns evaluation trajectories into a failure report and asks for a new prompt."""
+
+    def __init__(self) -> None:
+        self._proposer = InstructionProposer()
+
+    def propose(
+        self,
+        current_prompt: Prompt,
+        trajectories: list[Trajectory],
+        task_description: str,
+    ) -> Prompt:
+        """Return a new Prompt built from the proposer's ``new_instruction``."""
+        report = self._format_failures(trajectories)
+        suggestion = self._proposer(
+            current_instruction=current_prompt.text,
+            task_description=task_description,
+            failure_examples=report,
+        )
+        return Prompt(text=suggestion.new_instruction)
+
+    @staticmethod
+    def _format_failures(trajectories: list[Trajectory]) -> str:
+        """Render trajectories as numbered sections separated by ``---`` lines."""
+        return "\n---\n".join(
+            f"# Example {number}\n"
+            f"## Input\n{entry.input_text}\n\n"
+            f"## Generated Output\n{entry.output_text}\n\n"
+            f"## Score\n{entry.score:.2f}\n\n"
+            f"## Feedback\n{entry.feedback}\n"
+            for number, entry in enumerate(trajectories, 1)
+        )
--- a/src/prometheus/infrastructure/synth_adapter.py
+++ b/src/prometheus/infrastructure/synth_adapter.py
@@ -0,0 +1,55 @@
+"""
+Adapter: Synthetic input generation.
+
+Implements the SyntheticGeneratorPort via DSPy.
+"""
+from __future__ import annotations
+
+import json
+
+from prometheus.domain.entities import SyntheticExample
+from prometheus.domain.ports import SyntheticGeneratorPort
+from prometheus.infrastructure.dspy_modules import SyntheticInputGenerator
+
+
+class DSPySyntheticAdapter(SyntheticGeneratorPort):
+    """Generates synthetic inputs in a single batch call via DSPy."""
+
+    def __init__(self) -> None:
+        self._generator = SyntheticInputGenerator()
+
+    def generate_inputs(
+        self,
+        task_description: str,
+        n_examples: int,
+    ) -> list[SyntheticExample]:
+        """Generate up to ``n_examples`` synthetic inputs for the task.
+
+        The generator is called once; its ``examples`` are truncated to
+        ``n_examples`` and numbered from 0 in the order returned.
+
+        Raises:
+            ValueError: If ``examples`` is a string that is not a JSON array
+                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
+        """
+        pred = self._generator(
+            task_description=task_description,
+            n_examples=n_examples,
+        )
+        texts = pred.examples
+        if isinstance(texts, str):
+            # The generation signature appears to request a JSON array held
+            # in a string. Slicing that string would yield one "example" per
+            # character, so decode it first. NOTE(review): the generator
+            # module may already decode — this is only a safety net.
+            texts = json.loads(texts)
+            if not isinstance(texts, list):
+                raise ValueError(
+                    f"Expected a JSON array of examples, got {type(texts).__name__}"
+                )
+        # Clamp so a negative count cannot turn into a "drop from the end" slice.
+        limit = max(n_examples, 0)
+        return [
+            SyntheticExample(input_text=text, id=i)
+            for i, text in enumerate(texts[:limit])
+        ]