Initial commit: PROMETHEUS v0.1.0 - Prompt optimizer

- Clean architecture (domain/application/infrastructure)
- DSPy-based evolution engine with scoring
- CLI via pyproject.toml entry point
- Unit + integration tests (~300 tests)
- Configs for glm-5.1 and glm-4.5-air models
- Z.AI endpoint integration
This commit is contained in:
2026-03-29 11:44:03 +00:00
commit 837a44970f
49 changed files with 6599 additions and 0 deletions

View File

@@ -0,0 +1,3 @@
"""PROMETHEUS — Prompt evolution without reference data."""
__version__ = "0.1.0"

View File

View File

@@ -0,0 +1,42 @@
"""
Bootstrap — synthetic input generation.
Creates a pool of test inputs from the task description.
This replaces the need for a labelled dataset.
"""
from __future__ import annotations
import random
from prometheus.domain.entities import SyntheticExample
from prometheus.domain.ports import SyntheticGeneratorPort
class SyntheticBootstrap:
    """Orchestrates synthetic input generation and minibatch sampling.

    Depends only on the abstract generator port, not on DSPy directly.
    All randomness goes through a single ``random.Random`` seeded in the
    constructor, so shuffling and sampling are reproducible for a given seed.
    """

    def __init__(self, generator: SyntheticGeneratorPort, seed: int = 42):
        """Store the generator port and create the seeded RNG.

        Args:
            generator: Port used to produce the synthetic inputs.
            seed: Seed for the private random number generator.
        """
        self._generator = generator
        self._rng = random.Random(seed)

    def run(self, task_description: str, n_examples: int) -> list[SyntheticExample]:
        """Generate the synthetic pool in a single generator call and shuffle it.

        Args:
            task_description: Description of the task, forwarded to the generator.
            n_examples: Number of examples requested from the generator.

        Returns:
            A new, shuffled list containing the generated examples.
        """
        # Copy before shuffling: the generator's own list must not be reordered
        # in place, and any sequence type it returns (e.g. a tuple) is accepted.
        examples = list(self._generator.generate_inputs(task_description, n_examples))
        self._rng.shuffle(examples)
        return examples

    def sample_minibatch(
        self,
        pool: list[SyntheticExample],
        size: int,
    ) -> list[SyntheticExample]:
        """Sample a minibatch (without replacement) from the synthetic pool.

        Args:
            pool: Examples to draw from.
            size: Requested batch size; capped at ``len(pool)``.

        Returns:
            A list of at most ``size`` distinct elements of ``pool``.
        """
        size = min(size, len(pool))
        return self._rng.sample(pool, size)

View File

@@ -0,0 +1,47 @@
"""Data Transfer Objects — configuration and results."""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
@dataclass
class OptimizationConfig:
    """All settings for one PROMETHEUS optimization run.

    Only ``seed_prompt`` and ``task_description`` are required; every other
    field has a default.
    """

    # Prompt to start from and the task it is meant to solve.
    seed_prompt: str
    task_description: str

    # Model identifiers, one per role.
    task_model: str = "openai/gpt-4o-mini"
    judge_model: str = "openai/gpt-4o"
    proposer_model: str = "openai/gpt-4o"
    synth_model: str = "openai/gpt-4o"

    # Evolution loop parameters.
    max_iterations: int = 30
    n_synthetic_inputs: int = 20
    minibatch_size: int = 5
    perfect_score: float = 1.0

    # Seed for reproducible sampling.
    seed: int = 42

    # Where the result is written and how chatty the run is.
    output_path: str = "output.yaml"
    verbose: bool = False
@dataclass
class OptimizationResult:
    """Outcome of a complete optimization run."""

    # Final and starting prompt texts.
    optimized_prompt: str
    initial_prompt: str

    # Run accounting.
    iterations_used: int
    total_llm_calls: int

    # Scores of the first and of the best candidate, and their difference.
    initial_score: float
    final_score: float
    improvement: float

    # Per-iteration event records; a fresh list per instance.
    history: list[dict[str, Any]] = field(default_factory=list)

View File

@@ -0,0 +1,75 @@
"""
Evaluator — execution + judgement.
Produces a quality signal without ground truth.
Combines candidate prompt execution + LLM-as-Judge evaluation.
"""
from __future__ import annotations
from prometheus.domain.entities import (
EvalResult,
Prompt,
SyntheticExample,
Trajectory,
)
from prometheus.domain.ports import JudgePort, LLMPort
class PromptEvaluator:
    """Scores a prompt on a minibatch of synthetic inputs.

    Pipeline: execute -> judge -> build trajectories.
    Stands in for GEPA's EvaluatorFn: rather than comparing against ground
    truth, the outputs are rated by an LLM-as-Judge.
    """

    def __init__(self, executor: LLMPort, judge: JudgePort):
        self._executor = executor
        self._judge = judge

    def evaluate(
        self,
        prompt: Prompt,
        minibatch: list[SyntheticExample],
        task_description: str,
    ) -> EvalResult:
        """Run ``prompt`` on every example, judge the outputs, collect the results.

        All executions happen first, then one ``judge_batch`` call rates the
        (input, output) pairs, and finally one trajectory is built per example.
        """
        # Execute the prompt once per example and pair each input with its output.
        pairs = [
            (example.input_text, self._executor.execute(prompt, example.input_text))
            for example in minibatch
        ]

        # Judge all pairs in one batch call.
        verdicts = self._judge.judge_batch(task_description, pairs)

        # One trajectory per pair; verdicts are matched by position.
        trajectories: list[Trajectory] = []
        for position, (input_text, output_text) in enumerate(pairs):
            score, feedback = verdicts[position]
            trajectories.append(
                Trajectory(
                    input_text=input_text,
                    output_text=output_text,
                    score=score,
                    feedback=feedback,
                    prompt_used=prompt.text,
                )
            )

        return EvalResult(
            scores=[t.score for t in trajectories],
            feedbacks=[t.feedback for t in trajectories],
            trajectories=trajectories,
        )

View File

@@ -0,0 +1,174 @@
"""
Evolution loop — core PROMETHEUS engine.
Orchestrates the select → evaluate → propose → accept cycle.
Equivalent to GEPAEngine.run(), adapted to work without a valset.
"""
from __future__ import annotations
import logging
from prometheus.application.bootstrap import SyntheticBootstrap
from prometheus.application.evaluator import PromptEvaluator
from prometheus.domain.entities import (
Candidate,
OptimizationState,
Prompt,
SyntheticExample,
)
from prometheus.domain.ports import ProposerPort
from prometheus.domain.scoring import should_accept
logger = logging.getLogger(__name__)
class EvolutionLoop:
    """Main evolution loop.

    Design:
    - Keeps only the best candidate (no full population).
    - Simplifies vs GEPA (no Pareto, no merge).
    - Population support deferred to v2.
    """

    def __init__(
        self,
        evaluator: PromptEvaluator,
        proposer: ProposerPort,
        bootstrap: SyntheticBootstrap,
        max_iterations: int = 30,
        minibatch_size: int = 5,
        perfect_score: float = 1.0,
        verbose: bool = False,
    ):
        """Store collaborators and loop parameters.

        Args:
            evaluator: Scores a prompt on a minibatch.
            proposer: Produces a new prompt from evaluation trajectories.
            bootstrap: Used here only to sample minibatches from the pool.
            max_iterations: Number of loop iterations to run.
            minibatch_size: Requested number of examples per minibatch.
            perfect_score: Per-example score at or above which no mutation is tried.
            verbose: When true, progress messages are logged at INFO level.
        """
        self._evaluator = evaluator
        self._proposer = proposer
        self._bootstrap = bootstrap
        self._max_iterations = max_iterations
        self._minibatch_size = minibatch_size
        self._perfect_score = perfect_score
        self._verbose = verbose

    def run(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
    ) -> OptimizationState:
        """Execute the complete evolution loop.

        Args:
            seed_prompt: Prompt the search starts from.
            synthetic_pool: Inputs that minibatches are sampled from.
            task_description: Task description forwarded to evaluator and proposer.

        Returns:
            The final state: best candidate, every accepted candidate, the
            per-iteration history and the LLM call count.
        """
        state = OptimizationState()

        # Evaluate the seed
        initial_batch = self._bootstrap.sample_minibatch(
            synthetic_pool, self._minibatch_size
        )
        initial_eval = self._evaluator.evaluate(
            seed_prompt, initial_batch, task_description
        )
        state.total_llm_calls += self._evaluation_calls(initial_batch)
        best_candidate = Candidate(
            prompt=seed_prompt,
            best_score=initial_eval.total_score,
            generation=0,
        )
        state.best_candidate = best_candidate
        state.candidates.append(best_candidate)
        self._log(f"Initial score: {initial_eval.total_score:.2f}")

        # With nothing to evaluate, `all()` over an empty score list would be
        # true and every iteration would be reported as "perfect". Stop here
        # and say why instead.
        if not initial_batch:
            logger.warning(
                "[PROMETHEUS] Empty minibatch (empty synthetic pool or "
                "minibatch_size of 0); skipping evolution."
            )
            state.history.append({"iteration": 0, "event": "skip_empty_batch"})
            return state

        # Main loop
        for i in range(1, self._max_iterations + 1):
            state.iteration = i
            try:
                # 1. Sample a fresh minibatch
                batch = self._bootstrap.sample_minibatch(
                    synthetic_pool, self._minibatch_size
                )

                # 2. Evaluate the current candidate
                current_eval = self._evaluator.evaluate(
                    best_candidate.prompt, batch, task_description
                )
                state.total_llm_calls += self._evaluation_calls(batch)

                # 3. Skip if perfect
                if all(s >= self._perfect_score for s in current_eval.scores):
                    self._log(f"Iter {i}: All scores perfect, skipping.")
                    state.history.append(
                        {
                            "iteration": i,
                            "event": "skip_perfect",
                            "current_score": current_eval.total_score,
                        }
                    )
                    continue

                # 4. Propose a new prompt (reflective mutation)
                new_prompt = self._proposer.propose(
                    best_candidate.prompt,
                    current_eval.trajectories,
                    task_description,
                )
                state.total_llm_calls += 1  # 1 proposition call

                # 5. Evaluate the new prompt on the same minibatch, so both
                #    scores are directly comparable.
                new_eval = self._evaluator.evaluate(
                    new_prompt, batch, task_description
                )
                state.total_llm_calls += self._evaluation_calls(batch)

                # 6. Accept or reject
                if should_accept(current_eval, new_eval):
                    best_candidate = Candidate(
                        prompt=new_prompt,
                        best_score=new_eval.total_score,
                        generation=i,
                        # id() of the previous best at the time of replacement
                        parent_id=id(best_candidate),
                    )
                    state.best_candidate = best_candidate
                    state.candidates.append(best_candidate)
                    self._log(
                        f"Iter {i}: ACCEPTED "
                        f"({current_eval.total_score:.2f} -> {new_eval.total_score:.2f})"
                    )
                    state.history.append(
                        {
                            "iteration": i,
                            "event": "accepted",
                            "old_score": current_eval.total_score,
                            "new_score": new_eval.total_score,
                            "improvement": new_eval.total_score
                            - current_eval.total_score,
                        }
                    )
                else:
                    self._log(
                        f"Iter {i}: REJECTED "
                        f"({new_eval.total_score:.2f} <= {current_eval.total_score:.2f})"
                    )
                    state.history.append(
                        {
                            "iteration": i,
                            "event": "rejected",
                            "old_score": current_eval.total_score,
                            "new_score": new_eval.total_score,
                        }
                    )
            except Exception as exc:
                # Best effort per iteration: one failed call must not abort the
                # run. Always warn (not only in verbose mode) so a run in which
                # every iteration fails is not mistaken for "no improvement".
                logger.warning(
                    "[PROMETHEUS] Iter %d: ERROR — %s. Skipping iteration.", i, exc
                )
                state.history.append(
                    {
                        "iteration": i,
                        "event": "error",
                        "error": str(exc),
                    }
                )
        return state

    @staticmethod
    def _evaluation_calls(batch: list[SyntheticExample]) -> int:
        """Return the LLM calls attributed to evaluating ``batch``.

        One execution plus one judge call per example. Uses the real batch
        length because sampling caps the batch at the pool size, which can be
        smaller than the configured minibatch size.
        """
        return 2 * len(batch)

    def _log(self, msg: str) -> None:
        """Log ``msg`` at INFO level, only when verbose mode is on."""
        if self._verbose:
            logger.info("[PROMETHEUS] %s", msg)

View File

@@ -0,0 +1,77 @@
"""
Main use case — high-level orchestration.
Entry point for business logic. Coordinates bootstrap → evolution → result.
Contains no technical logic, only orchestration.
"""
from __future__ import annotations
from prometheus.application.bootstrap import SyntheticBootstrap
from prometheus.application.dto import OptimizationConfig, OptimizationResult
from prometheus.application.evaluator import PromptEvaluator
from prometheus.application.evolution import EvolutionLoop
from prometheus.domain.entities import Prompt
from prometheus.domain.ports import ProposerPort
class OptimizePromptUseCase:
    """The single MVP use case: optimize one prompt end to end.

    Collaborators are supplied through the constructor (dependency injection).
    """

    def __init__(
        self,
        evaluator: PromptEvaluator,
        proposer: ProposerPort,
        bootstrap: SyntheticBootstrap,
    ):
        self._evaluator = evaluator
        self._proposer = proposer
        self._bootstrap = bootstrap

    def execute(self, config: OptimizationConfig) -> OptimizationResult:
        """Run bootstrap, then evolution, then package the outcome.

        1. Bootstrap: generate the synthetic input pool.
        2. Evolution: run the optimization loop from the seed prompt.
        3. Result: summarize the final state as an ``OptimizationResult``.
        """
        # Phase 0: synthetic pool
        pool = self._bootstrap.run(
            task_description=config.task_description,
            n_examples=config.n_synthetic_inputs,
        )

        # Phase 1: evolution
        engine = EvolutionLoop(
            evaluator=self._evaluator,
            proposer=self._proposer,
            bootstrap=self._bootstrap,
            max_iterations=config.max_iterations,
            minibatch_size=config.minibatch_size,
            perfect_score=config.perfect_score,
            verbose=config.verbose,
        )
        state = engine.run(
            Prompt(text=config.seed_prompt), pool, config.task_description
        )

        # Phase 2: summarize; fall back to neutral values when the state is empty.
        if state.candidates:
            initial_score = state.candidates[0].best_score
        else:
            initial_score = 0.0

        best = state.best_candidate
        if best:
            final_score = best.best_score
            optimized_text = best.prompt.text
        else:
            final_score = 0.0
            optimized_text = config.seed_prompt

        return OptimizationResult(
            optimized_prompt=optimized_text,
            initial_prompt=config.seed_prompt,
            iterations_used=state.iteration,
            total_llm_calls=state.total_llm_calls + 1,  # +1 for bootstrap
            initial_score=initial_score,
            final_score=final_score,
            improvement=final_score - initial_score,
            history=state.history,
        )

View File

168
src/prometheus/cli/app.py Normal file
View File

@@ -0,0 +1,168 @@
"""
CLI — user entry point.
Typer interface with -i (input) and -o (output) options.
"""
from __future__ import annotations
import logging
import os
from dataclasses import asdict
import dspy
import typer
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from prometheus.application.bootstrap import SyntheticBootstrap
from prometheus.application.dto import OptimizationConfig, OptimizationResult
from prometheus.application.evaluator import PromptEvaluator
from prometheus.application.use_cases import OptimizePromptUseCase
from prometheus.infrastructure.file_io import YamlPersistence
from prometheus.infrastructure.judge_adapter import DSPyJudgeAdapter
from prometheus.infrastructure.llm_adapter import DSPyLLMAdapter
from prometheus.infrastructure.proposer_adapter import DSPyProposerAdapter
from prometheus.infrastructure.synth_adapter import DSPySyntheticAdapter
# Typer application object; the functions decorated with @app.command() in this
# module are registered on it. With no arguments the help text is shown.
app = typer.Typer(
    name="prometheus",
    help="PROMETHEUS — Prompt evolution without reference data.",
    no_args_is_help=True,
)
# Module-level Rich console used for all terminal output of this CLI.
console = Console()
@app.command()
def optimize(
    input: str = typer.Option(
        ...,
        "-i",
        "--input",
        help="Path to input YAML config file.",
        exists=True,
        readable=True,
    ),
    output: str = typer.Option(
        "output.yaml",
        "-o",
        "--output",
        help="Path to output YAML result file.",
    ),
    verbose: bool = typer.Option(
        False,
        "-v",
        "--verbose",
        help="Print detailed progress.",
    ),
) -> None:
    """Optimize a prompt without any reference data.
    Usage:
    prometheus optimize -i config.yaml -o result.yaml
    """
    # Implementation notes live in comments: the docstring above is the text
    # Typer shows as this command's help.
    #
    # Flow: load the YAML config, configure DSPy, wire adapters into the use
    # case, run it, then display and save the result.
    #
    # Local import: user-supplied text is escaped before being embedded in
    # Rich markup, otherwise "[...]" in a prompt is parsed as a style tag.
    from rich.markup import escape

    # Configure verbose logging
    if verbose:
        logging.basicConfig(level=logging.INFO, format="[PROMETHEUS] %(message)s")
    console.print(
        Panel.fit(
            "PROMETHEUS — Prompt Evolution Engine",
            subtitle="No reference data required",
        )
    )

    # 1. Load config
    persistence = YamlPersistence()
    raw_config = persistence.read_config(input)
    # An empty YAML file loads as None and a top-level list/scalar is not a
    # mapping; report both as a usage error instead of a raw TypeError.
    if not isinstance(raw_config, dict):
        raise typer.BadParameter(
            "config file must contain a YAML mapping", param_hint="--input"
        )
    missing = [
        key for key in ("seed_prompt", "task_description") if key not in raw_config
    ]
    if missing:
        raise typer.BadParameter(
            f"missing required config key(s): {', '.join(missing)}",
            param_hint="--input",
        )

    config = OptimizationConfig(
        seed_prompt=raw_config["seed_prompt"],
        task_description=raw_config["task_description"],
        task_model=raw_config.get("task_model", "openai/gpt-4o-mini"),
        judge_model=raw_config.get("judge_model", "openai/gpt-4o"),
        proposer_model=raw_config.get("proposer_model", "openai/gpt-4o"),
        synth_model=raw_config.get("synth_model", "openai/gpt-4o"),
        max_iterations=raw_config.get("max_iterations", 30),
        n_synthetic_inputs=raw_config.get("n_synthetic_inputs", 20),
        minibatch_size=raw_config.get("minibatch_size", 5),
        # Read like the other evolution parameters; the default matches the
        # OptimizationConfig default.
        perfect_score=raw_config.get("perfect_score", 1.0),
        seed=raw_config.get("seed", 42),
        output_path=output,
        verbose=verbose,
    )
    console.print(f"[dim]Task: {escape(config.task_description[:80])}...[/dim]")
    console.print(f"[dim]Seed prompt: {escape(config.seed_prompt[:80])}...[/dim]")

    # 2. Configure DSPy with optional api_base/api_key from config
    lm_kwargs: dict[str, str] = {}
    api_base = raw_config.get("api_base")
    api_key_env = raw_config.get("api_key_env")
    if api_base:
        lm_kwargs["api_base"] = api_base
    if api_key_env:
        api_key = os.environ.get(api_key_env, "")
        if not api_key:
            # Keep passing the empty key as before, but make the likely cause
            # of a later authentication failure visible.
            console.print(
                f"[yellow]Warning: environment variable {escape(str(api_key_env))} "
                "is not set or empty.[/yellow]"
            )
        lm_kwargs["api_key"] = api_key
    task_lm = dspy.LM(config.task_model, **lm_kwargs)
    dspy.configure(lm=task_lm)
    # NOTE(review): only task_lm is configured here and the adapters below are
    # built without a model, so judge_model / proposer_model / synth_model from
    # the config are not applied in this function — confirm whether intended.

    # 3. Build adapters (Dependency Injection)
    synth_adapter = DSPySyntheticAdapter()
    llm_adapter = DSPyLLMAdapter(model=config.task_model)
    judge_adapter = DSPyJudgeAdapter()
    proposer_adapter = DSPyProposerAdapter()
    bootstrap = SyntheticBootstrap(generator=synth_adapter, seed=config.seed)
    evaluator = PromptEvaluator(executor=llm_adapter, judge=judge_adapter)
    use_case = OptimizePromptUseCase(
        evaluator=evaluator,
        proposer=proposer_adapter,
        bootstrap=bootstrap,
    )

    # 4. Execute
    with console.status("[bold green]Evolving prompt..."):
        result = use_case.execute(config)

    # 5. Display results
    _display_result(result)

    # 6. Save
    _save_result(persistence, output, result)
    console.print(f"\n[green]Results saved to {escape(output)}[/green]")
def _display_result(result: OptimizationResult) -> None:
    """Print the optimized prompt and a metrics table to the terminal.

    Args:
        result: Outcome of the optimization run to display.
    """
    # Local import: the prompt is arbitrary text and often contains square
    # brackets (e.g. "[INPUT]"). Unescaped, Rich would parse those as markup
    # tags and either drop them or raise on an unmatched closing tag.
    from rich.markup import escape

    console.print()
    console.print(
        Panel(
            "[bold green]Optimized Prompt[/bold green]\n\n"
            f"{escape(result.optimized_prompt)}",
            title="Result",
        )
    )
    table = Table(title="Metrics")
    table.add_column("Metric", style="cyan")
    table.add_column("Value", style="bold")
    table.add_row("Initial Score", f"{result.initial_score:.2f}")
    table.add_row("Final Score", f"{result.final_score:.2f}")
    table.add_row("Improvement", f"{result.improvement:+.2f}")
    table.add_row("Iterations", str(result.iterations_used))
    table.add_row("LLM Calls", str(result.total_llm_calls))
    console.print(table)
def _save_result(
persistence: YamlPersistence,
path: str,
result: OptimizationResult,
) -> None:
"""Save the result as YAML."""
persistence.write_result(path, asdict(result))
@app.command(hidden=True)
def _help() -> None:
    """Internal placeholder to force multi-command Typer behavior."""
    # Intentionally a no-op: the command only needs to be registered.
    return None
# Run the Typer application when this module is executed as a script.
if __name__ == "__main__":
    app()

12
src/prometheus/config.py Normal file
View File

@@ -0,0 +1,12 @@
"""Application settings."""
from __future__ import annotations
from dataclasses import dataclass
@dataclass
class AppSettings:
    """Static, non-sensitive application settings (hardcoded for the MVP)."""

    # Program name.
    app_name: str = "prometheus"
    # Application version string.
    version: str = "0.1.0"

View File

View File

@@ -0,0 +1,87 @@
"""Domain entities — pure data, zero dependencies."""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
@dataclass(frozen=True)
class Prompt:
    """Represents a candidate prompt.

    frozen=True → fields cannot be reassigned, and instances are hashable so
    they can be used as set members or dict keys.
    """

    # The prompt text itself.
    text: str
    # Free-form annotations. Excluded from the hash because a dict is
    # unhashable (the generated __hash__ would raise TypeError otherwise);
    # it still takes part in equality, so equal prompts keep equal hashes.
    metadata: dict[str, Any] = field(default_factory=dict, hash=False)

    def __len__(self) -> int:
        """Return the length of the prompt text in characters."""
        return len(self.text)
@dataclass(frozen=True)
class SyntheticExample:
    """One synthetic input produced from the task description.

    There is no expected output: the judge rates the generated output directly.
    """

    # The input text fed to the prompt under evaluation.
    input_text: str
    # Grouping label, reserved for future stratified sampling.
    category: str = "default"
    # Numeric identifier of the example.
    id: int = 0
@dataclass
class Trajectory:
    """Record of one prompt execution on one input, with the judge's verdict.

    Consumed by reflective mutation to understand failures.
    """

    # What went in and what came out.
    input_text: str
    output_text: str
    # The judge's numeric rating and its textual feedback.
    score: float
    feedback: str
    # Text of the prompt that produced the output.
    prompt_used: str
@dataclass
class EvalResult:
    """Scores, feedback and trajectories from evaluating one minibatch."""

    # One entry per evaluated example.
    scores: list[float]
    feedbacks: list[str]
    trajectories: list[Trajectory]

    @property
    def total_score(self) -> float:
        """Sum of all scores."""
        return sum(self.scores)

    @property
    def mean_score(self) -> float:
        """Average score, or 0.0 when there are no scores."""
        if not self.scores:
            return 0.0
        return sum(self.scores) / len(self.scores)
@dataclass
class Candidate:
    """A prompt kept by the evolution loop, together with its score."""

    # The candidate prompt.
    prompt: Prompt
    # Score recorded for this candidate.
    best_score: float = 0.0
    # Iteration at which the candidate was created.
    generation: int = 0
    # Identifier of the candidate this one was derived from, if any.
    parent_id: int | None = None
@dataclass
class OptimizationState:
    """Mutable snapshot of an optimization run."""

    # Number of the most recent iteration.
    iteration: int = 0
    # Best candidate so far; None until one is recorded.
    best_candidate: Candidate | None = None
    # Recorded candidates, in order of creation.
    candidates: list[Candidate] = field(default_factory=list)
    # Synthetic inputs associated with the run.
    synthetic_pool: list[SyntheticExample] = field(default_factory=list)
    # One event dict per recorded step.
    history: list[dict[str, Any]] = field(default_factory=list)
    # Running count of LLM calls.
    total_llm_calls: int = 0

View File

@@ -0,0 +1,85 @@
"""
Domain ports — abstract interfaces that infrastructure implements.
Uses ABC (abstract base classes) for the loose coupling.
"""
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any
from prometheus.domain.entities import Prompt, SyntheticExample, Trajectory
class LLMPort(ABC):
    """Abstract contract for running a prompt against a single input.

    The infrastructure layer supplies the concrete implementation (via DSPy).
    """

    @abstractmethod
    def execute(self, prompt: Prompt, input_text: str) -> str:
        """Run ``prompt`` on ``input_text`` and return the raw response."""
        ...
class JudgePort(ABC):
    """Abstract contract for LLM-as-Judge evaluation.

    Receives the task description and (input, output) pairs; yields one
    score and one textual feedback per pair.
    """

    @abstractmethod
    def judge_batch(
        self,
        task_description: str,
        pairs: list[tuple[str, str]],
    ) -> list[tuple[float, str]]:
        """Rate a batch of (input, output) pairs.

        Returns a list of (score, feedback) tuples.
        """
        ...
class ProposerPort(ABC):
    """Abstract contract for proposing a new prompt.

    Implementations use evaluation trajectories to suggest an improvement.
    """

    @abstractmethod
    def propose(
        self,
        current_prompt: Prompt,
        trajectories: list[Trajectory],
        task_description: str,
    ) -> Prompt:
        """Return a new prompt derived from the given failure trajectories."""
        ...
class SyntheticGeneratorPort(ABC):
    """Abstract contract for producing synthetic inputs."""

    @abstractmethod
    def generate_inputs(
        self,
        task_description: str,
        n_examples: int,
    ) -> list[SyntheticExample]:
        """Produce ``n_examples`` diverse synthetic inputs for the task."""
        ...
class PersistencePort(ABC):
    """Abstract contract for reading config files and writing result files."""

    @abstractmethod
    def read_config(self, path: str) -> dict[str, Any]:
        """Load the configuration stored at ``path``."""
        ...

    @abstractmethod
    def write_result(self, path: str, data: dict[str, Any]) -> None:
        """Write ``data`` to ``path``."""
        ...

View File

@@ -0,0 +1,21 @@
"""Scoring logic and acceptance criteria — pure domain."""
from __future__ import annotations
from prometheus.domain.entities import EvalResult
def should_accept(
    old_result: EvalResult,
    new_result: EvalResult,
    min_improvement: float = 0.0,
) -> bool:
    """Decide whether a new candidate replaces the current one.

    The new total score must be strictly greater than the old total score
    plus ``min_improvement``; a tie is rejected.
    """
    threshold = old_result.total_score + min_improvement
    return new_result.total_score > threshold
def normalize_score(raw: float, min_val: float = 0.0, max_val: float = 1.0) -> float:
    """Clamp a score within [min_val, max_val].

    Args:
        raw: Score to clamp.
        min_val: Lower bound of the range.
        max_val: Upper bound of the range.

    Returns:
        ``raw`` limited to the range. NaN maps to ``min_val``: every
        comparison with NaN is false, so ``min(max_val, nan)`` would return
        ``max_val`` and turn an invalid score into the best possible one.
    """
    import math

    if math.isnan(raw):
        return min_val
    return max(min_val, min(max_val, raw))

View File

@@ -0,0 +1,92 @@
"""
DSPy Modules — signature composition.
Declarative LLM call orchestration via DSPy.
"""
from __future__ import annotations
import json
import re
import dspy
from prometheus.infrastructure.dspy_signatures import (
GenerateSyntheticInputs,
JudgeOutput,
ProposeInstruction,
)
class SyntheticInputGenerator(dspy.Module):
    """Generates synthetic inputs in a single batch call.

    Uses ChainOfThought for better diversity. The model is asked for a JSON
    array of strings; the raw answer is normalized to ``list[str]`` here so
    callers can slice and iterate it safely.
    """

    def __init__(self) -> None:
        super().__init__()
        self.generate = dspy.ChainOfThought(GenerateSyntheticInputs)

    def forward(self, task_description: str, n_examples: int) -> dspy.Prediction:
        """Call the LLM once and return a prediction whose ``examples`` is a list of strings."""
        result = self.generate(
            task_description=task_description,
            n_examples=n_examples,
        )
        return dspy.Prediction(examples=self._parse_examples(result.examples))

    @classmethod
    def _parse_examples(cls, raw: object) -> list[str]:
        """Turn the model's ``examples`` field into a list of strings.

        Tried in order: an already-parsed list/tuple, the whole text as JSON,
        the outermost ``[...]`` span as JSON (covers answers wrapped in code
        fences or surrounding prose), and finally the quoted-string fallback.
        """
        if isinstance(raw, (list, tuple)):
            return cls._stringify(raw)
        # None would make json.loads raise TypeError, which is not caught below.
        text = "" if raw is None else str(raw)

        candidates = [text]
        start, end = text.find("["), text.rfind("]")
        if 0 <= start < end:
            candidates.append(text[start : end + 1])

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, list):
                return cls._stringify(parsed)
            if isinstance(parsed, str):
                # A bare JSON string is one example, not a sequence of
                # characters to be sliced downstream.
                return [parsed]
            # Any other valid JSON (object, number, ...) is not a usable
            # example list; try the next candidate, then the fallback.
        return cls._parse_fallback(text)

    @staticmethod
    def _stringify(items: list | tuple) -> list[str]:
        """Keep string items as-is and JSON-encode every non-string item."""
        return [
            item if isinstance(item, str) else json.dumps(item, ensure_ascii=False)
            for item in items
        ]

    @staticmethod
    def _parse_fallback(text: str) -> list[str]:
        """Extract double-quoted strings from non-JSON output.

        Returns the whole text as a single example when nothing is quoted.
        """
        matches = re.findall(r'"([^"]+)"', text)
        return matches if matches else [text]
class OutputJudge(dspy.Module):
    """Judges a single output. Called in batch by JudgeAdapter."""

    def __init__(self) -> None:
        super().__init__()
        self.judge = dspy.ChainOfThought(JudgeOutput)

    def forward(
        self, task_description: str, input_text: str, output_text: str
    ) -> dspy.Prediction:
        """Judge one (input, output) pair.

        Returns:
            A prediction with ``score`` clamped to [0.0, 1.0] and the judge's
            ``feedback`` passed through unchanged.
        """
        import math

        neutral = 0.5  # used whenever the judge's score is not a usable number
        result = self.judge(
            task_description=task_description,
            input_text=input_text,
            output_text=output_text,
        )
        try:
            score = float(result.score)
        except (ValueError, TypeError):
            score = neutral
        # float("nan") parses without error, and min(1.0, nan) returns 1.0, so
        # without this check a NaN score would be clamped to a perfect 1.0.
        if math.isnan(score):
            score = neutral
        score = max(0.0, min(1.0, score))
        return dspy.Prediction(score=score, feedback=result.feedback)
class InstructionProposer(dspy.Module):
    """Asks the LLM for an improved prompt, given failure trajectories.

    Counterpart of GEPA's InstructionProposalSignature.
    """

    def __init__(self) -> None:
        super().__init__()
        self.propose = dspy.ChainOfThought(ProposeInstruction)

    def forward(
        self,
        current_instruction: str,
        task_description: str,
        failure_examples: str,
    ) -> dspy.Prediction:
        """Run the proposal call and return only the ``new_instruction`` field."""
        proposal = self.propose(
            current_instruction=current_instruction,
            task_description=task_description,
            failure_examples=failure_examples,
        )
        improved = proposal.new_instruction
        return dspy.Prediction(new_instruction=improved)

View File

@@ -0,0 +1,79 @@
"""
DSPy Signatures — declarative LLM contracts.
Defines WHAT each LLM call does, not HOW.
DSPy Signature = input_fields → output_fields + instruction.
DSPy handles prompting, parsing, and structuring.
"""
from __future__ import annotations
import dspy
# Signature for the synthetic-input generation call.
# Inputs: task_description, n_examples. Output: examples (a string).
# NOTE(review): the class docstring and the `desc` strings are presumably sent
# to the LLM as part of the prompt — treat them as runtime text and confirm
# before rewording.
class GenerateSyntheticInputs(dspy.Signature):
    """Generate diverse, realistic input examples for a given task."""

    task_description: str = dspy.InputField(
        desc="Description of the task the prompt should accomplish."
    )
    n_examples: int = dspy.InputField(
        desc="Number of examples to generate."
    )
    # Declared as str: the JSON array is requested as text via `desc`.
    examples: str = dspy.OutputField(
        desc=(
            "A JSON array of strings, each being a realistic input "
            "for the task. Cover: normal cases, edge cases, long inputs, "
            "short inputs, ambiguous cases, and tricky scenarios."
        ),
    )
# Signature for the LLM-as-Judge call.
# Inputs: task_description, input_text, output_text. Outputs: score, feedback.
# NOTE(review): the class docstring and the `desc` strings are presumably sent
# to the LLM as part of the prompt — treat them as runtime text and confirm
# before rewording.
class JudgeOutput(dspy.Signature):
    """Evaluate the quality of an LLM output for a given task and input.
    Score: 0.0 (completely wrong) to 1.0 (perfect).
    Feedback: specific, actionable criticism.
    """

    task_description: str = dspy.InputField(
        desc="What the assistant is supposed to do."
    )
    input_text: str = dspy.InputField(
        desc="The input provided to the assistant."
    )
    output_text: str = dspy.InputField(
        desc="The assistant's response to evaluate."
    )
    # Numeric rating; the requested range is stated in `desc`.
    score: float = dspy.OutputField(
        desc="Quality score from 0.0 (wrong) to 1.0 (perfect)."
    )
    # Free-text criticism accompanying the score.
    feedback: str = dspy.OutputField(
        desc=(
            "Specific, actionable feedback explaining what's wrong "
            "with the output and how to improve it. Be critical."
        ),
    )
# Signature for the prompt-improvement (reflective mutation) call.
# Inputs: current_instruction, task_description, failure_examples.
# Output: new_instruction.
# NOTE(review): the class docstring and the `desc` strings are presumably sent
# to the LLM as part of the prompt — treat them as runtime text and confirm
# before rewording.
class ProposeInstruction(dspy.Signature):
    """Given a current prompt and examples of where it fails with feedback,
    propose an improved version of the prompt.
    The new prompt should address all the issues identified in the feedback.
    """

    current_instruction: str = dspy.InputField(
        desc="The current prompt/instruction to improve."
    )
    task_description: str = dspy.InputField(
        desc="Description of the task."
    )
    # A single pre-formatted text report, not a structured list.
    failure_examples: str = dspy.InputField(
        desc=(
            "Examples of inputs, outputs, scores, and feedback "
            "showing where the current instruction fails."
        ),
    )
    new_instruction: str = dspy.OutputField(
        desc="An improved version of the instruction."
    )

View File

@@ -0,0 +1,25 @@
"""
File I/O — read/write config and result files.
Implements the PersistencePort with YAML.
"""
from __future__ import annotations
from typing import Any
import yaml
from prometheus.domain.ports import PersistencePort
class YamlPersistence(PersistencePort):
    """Reads and writes YAML files."""

    def read_config(self, path: str) -> dict[str, Any]:
        """Load a YAML mapping from ``path``.

        Returns:
            The parsed mapping; an empty dict when the file is empty.

        Raises:
            ValueError: If the top-level YAML value is not a mapping.
        """
        with open(path, encoding="utf-8") as f:
            data = yaml.safe_load(f)
        # safe_load returns None for an empty document; callers expect a dict.
        if data is None:
            return {}
        if not isinstance(data, dict):
            raise ValueError(
                f"{path}: expected a YAML mapping at the top level, "
                f"got {type(data).__name__}"
            )
        return data

    def write_result(self, path: str, data: dict[str, Any]) -> None:
        """Write ``data`` to ``path`` as block-style YAML, keeping non-ASCII text readable."""
        with open(path, "w", encoding="utf-8") as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True)

View File

@@ -0,0 +1,34 @@
"""
Adapter: LLM-as-Judge.
Implements the JudgePort via the DSPy OutputJudge module.
"""
from __future__ import annotations
from prometheus.domain.ports import JudgePort
from prometheus.infrastructure.dspy_modules import OutputJudge
class DSPyJudgeAdapter(JudgePort):
    """Rates a batch of (input, output) pairs with one judge call per pair.

    Sequential for the MVP. Future: parallelize via dspy.Parallel.
    """

    def __init__(self) -> None:
        self._judge = OutputJudge()

    def judge_batch(
        self,
        task_description: str,
        pairs: list[tuple[str, str]],
    ) -> list[tuple[float, str]]:
        """Judge every pair in order and return (score, feedback) tuples."""
        return [
            self._judge_pair(task_description, source_text, response_text)
            for source_text, response_text in pairs
        ]

    def _judge_pair(
        self, task_description: str, source_text: str, response_text: str
    ) -> tuple[float, str]:
        """Run the judge module on one pair and unpack its prediction."""
        verdict = self._judge(
            task_description=task_description,
            input_text=source_text,
            output_text=response_text,
        )
        return (verdict.score, verdict.feedback)

View File

@@ -0,0 +1,32 @@
"""
Adapter: Execute a prompt on an input.
Implements the LLMPort via DSPy.
"""
from __future__ import annotations
import dspy
from prometheus.domain.entities import Prompt
from prometheus.domain.ports import LLMPort
class DSPyLLMAdapter(LLMPort):
    """Runs a prompt through ``dspy.Predict`` using a minimal signature."""

    class _ExecuteSignature(dspy.Signature):
        """Execute the instruction on the given input."""

        instruction: str = dspy.InputField(desc="The instruction/prompt to follow.")
        input_text: str = dspy.InputField(desc="The input to process.")
        output: str = dspy.OutputField(desc="The response following the instruction.")

    def __init__(self, model: str) -> None:
        # `model` is accepted but not used by this constructor.
        self._predictor = dspy.Predict(self._ExecuteSignature)

    def execute(self, prompt: Prompt, input_text: str) -> str:
        """Apply ``prompt`` to ``input_text`` and return the output as a string."""
        prediction = self._predictor(
            instruction=prompt.text,
            input_text=input_text,
        )
        response = prediction.output
        return str(response)

View File

@@ -0,0 +1,47 @@
"""
Adapter: Reflective Mutation Proposer.
Implements the ProposerPort via the DSPy InstructionProposer.
Converts trajectories into readable format for the LLM proposer.
"""
from __future__ import annotations
from prometheus.domain.entities import Prompt, Trajectory
from prometheus.domain.ports import ProposerPort
from prometheus.infrastructure.dspy_modules import InstructionProposer
class DSPyProposerAdapter(ProposerPort):
    """Builds a textual report from trajectories and asks for a new prompt."""

    def __init__(self) -> None:
        self._proposer = InstructionProposer()

    def propose(
        self,
        current_prompt: Prompt,
        trajectories: list[Trajectory],
        task_description: str,
    ) -> Prompt:
        """Format the trajectories, call the proposer, wrap the answer in a Prompt."""
        report = self._format_failures(trajectories)
        proposal = self._proposer(
            current_instruction=current_prompt.text,
            task_description=task_description,
            failure_examples=report,
        )
        return Prompt(text=proposal.new_instruction)

    @staticmethod
    def _format_failures(trajectories: list[Trajectory]) -> str:
        """Render every trajectory as a numbered section, separated by rules."""
        return "\n---\n".join(
            (
                f"# Example {number}\n"
                f"## Input\n{trace.input_text}\n\n"
                f"## Generated Output\n{trace.output_text}\n\n"
                f"## Score\n{trace.score:.2f}\n\n"
                f"## Feedback\n{trace.feedback}\n"
            )
            for number, trace in enumerate(trajectories, 1)
        )

View File

@@ -0,0 +1,34 @@
"""
Adapter: Synthetic input generation.
Implements the SyntheticGeneratorPort via DSPy.
"""
from __future__ import annotations
from prometheus.domain.entities import SyntheticExample
from prometheus.domain.ports import SyntheticGeneratorPort
from prometheus.infrastructure.dspy_modules import SyntheticInputGenerator
class DSPySyntheticAdapter(SyntheticGeneratorPort):
    """Produces synthetic inputs with a single DSPy generator call."""

    def __init__(self) -> None:
        self._generator = SyntheticInputGenerator()

    def generate_inputs(
        self,
        task_description: str,
        n_examples: int,
    ) -> list[SyntheticExample]:
        """Generate inputs and wrap at most ``n_examples`` of them as entities.

        Each example's ``id`` is its position in the generated list.
        """
        prediction = self._generator(
            task_description=task_description,
            n_examples=n_examples,
        )
        # The model may return more items than requested; keep the first N.
        kept = prediction.examples[:n_examples]
        wrapped: list[SyntheticExample] = []
        for position, text in enumerate(kept):
            wrapped.append(SyntheticExample(input_text=text, id=position))
        return wrapped