Initial commit: PROMETHEUS v0.1.0 - Prompt optimizer
- Clean architecture (domain/application/infrastructure) - DSPy-based evolution engine with scoring - CLI via pyproject.toml entry point - Unit + integration tests (~300 tests) - Configs for glm-5.1 and glm-4.5-air models - Z.AI endpoint integration
This commit is contained in:
3
src/prometheus/__init__.py
Normal file
3
src/prometheus/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
"""PROMETHEUS — Prompt evolution without reference data."""
|
||||
|
||||
# Package version string; AppSettings.version in config.py carries the same value.
__version__ = "0.1.0"
|
||||
0
src/prometheus/application/__init__.py
Normal file
0
src/prometheus/application/__init__.py
Normal file
42
src/prometheus/application/bootstrap.py
Normal file
42
src/prometheus/application/bootstrap.py
Normal file
@@ -0,0 +1,42 @@
|
||||
"""
|
||||
Bootstrap — synthetic input generation.
|
||||
|
||||
Creates a pool of test inputs from the task description.
|
||||
This replaces the need for a labelled dataset.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import random
|
||||
|
||||
from prometheus.domain.entities import SyntheticExample
|
||||
from prometheus.domain.ports import SyntheticGeneratorPort
|
||||
|
||||
|
||||
class SyntheticBootstrap:
    """Orchestrate synthetic input generation and minibatch sampling.

    Depends only on the abstract generator port, not on DSPy directly.
    All randomness goes through a single ``random.Random`` seeded in the
    constructor, so the pool order and the sampled minibatches are
    reproducible for a given seed and call sequence.
    """

    def __init__(self, generator: SyntheticGeneratorPort, seed: int = 42):
        """Store the generator port and create the seeded RNG.

        Args:
            generator: Port used to produce the synthetic inputs.
            seed: Seed for the private random number generator.
        """
        self._generator = generator
        self._rng = random.Random(seed)

    def run(self, task_description: str, n_examples: int) -> list[SyntheticExample]:
        """Generate the synthetic pool in a single generator call.

        A single call minimizes LLM cost (1 call instead of N) and lets the
        LLM ensure diversity within one generation.

        Args:
            task_description: Description of the task to generate inputs for.
            n_examples: Number of examples requested from the generator.

        Returns:
            A new list holding the generated examples in shuffled order.
        """
        # Copy before shuffling: the shuffle is in place, so working on the
        # generator's own object would mutate a list it may still hold and
        # would fail outright on an immutable sequence such as a tuple.
        examples = list(self._generator.generate_inputs(task_description, n_examples))
        self._rng.shuffle(examples)
        return examples

    def sample_minibatch(
        self,
        pool: list[SyntheticExample],
        size: int,
    ) -> list[SyntheticExample]:
        """Sample a minibatch (without replacement) from the synthetic pool.

        Args:
            pool: Examples to draw from.
            size: Requested minibatch size; capped at ``len(pool)``.

        Returns:
            A new list of at most ``size`` distinct pool entries.
        """
        # Cap the request so a pool smaller than the minibatch still works.
        size = min(size, len(pool))
        return self._rng.sample(pool, size)
|
||||
47
src/prometheus/application/dto.py
Normal file
47
src/prometheus/application/dto.py
Normal file
@@ -0,0 +1,47 @@
|
||||
"""Data Transfer Objects — configuration and results."""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass
class OptimizationConfig:
    """All settings for one PROMETHEUS run.

    Only ``seed_prompt`` and ``task_description`` are required; every other
    field has a default.
    """

    # Prompt to optimize and the task it is meant to solve.
    seed_prompt: str
    task_description: str

    # Model identifiers, one per role.
    task_model: str = "openai/gpt-4o-mini"
    judge_model: str = "openai/gpt-4o"
    proposer_model: str = "openai/gpt-4o"
    synth_model: str = "openai/gpt-4o"

    # Evolution parameters.
    max_iterations: int = 30
    n_synthetic_inputs: int = 20
    minibatch_size: int = 5
    perfect_score: float = 1.0

    # Reproducibility.
    seed: int = 42

    # Output.
    output_path: str = "output.yaml"
    verbose: bool = False
|
||||
|
||||
|
||||
@dataclass
class OptimizationResult:
    """Outcome of one complete optimization run."""

    # Prompt texts: the final one and the one the run started from.
    optimized_prompt: str
    initial_prompt: str

    # Run accounting.
    iterations_used: int
    total_llm_calls: int

    # Scores at the start and end of the run, and their difference.
    initial_score: float
    final_score: float
    improvement: float

    # Per-iteration event records; each instance gets its own list.
    history: list[dict[str, Any]] = field(default_factory=list)
|
||||
75
src/prometheus/application/evaluator.py
Normal file
75
src/prometheus/application/evaluator.py
Normal file
@@ -0,0 +1,75 @@
|
||||
"""
|
||||
Evaluator — execution + judgement.
|
||||
|
||||
Produces a quality signal without ground truth.
|
||||
Combines candidate prompt execution + LLM-as-Judge evaluation.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from prometheus.domain.entities import (
|
||||
EvalResult,
|
||||
Prompt,
|
||||
SyntheticExample,
|
||||
Trajectory,
|
||||
)
|
||||
from prometheus.domain.ports import JudgePort, LLMPort
|
||||
|
||||
|
||||
class PromptEvaluator:
    """Evaluate a prompt on a minibatch of synthetic inputs.

    Pipeline: execute → judge → build trajectories.
    Replaces GEPA's EvaluatorFn. Instead of comparing to ground truth,
    it uses an LLM-as-Judge.
    """

    def __init__(self, executor: LLMPort, judge: JudgePort):
        """Store the executor and judge ports.

        Args:
            executor: Port that runs a prompt on one input.
            judge: Port that scores a batch of (input, output) pairs.
        """
        self._executor = executor
        self._judge = judge

    def evaluate(
        self,
        prompt: Prompt,
        minibatch: list[SyntheticExample],
        task_description: str,
    ) -> EvalResult:
        """Evaluate the prompt on the minibatch.

        Steps:
            1. Execute the prompt on each input in the minibatch.
            2. Judge all (input, output) pairs in one ``judge_batch`` call.
            3. Build one trajectory per example, carrying score and feedback.

        Args:
            prompt: Candidate prompt to run.
            minibatch: Examples whose ``input_text`` is fed to the executor.
            task_description: Task description handed to the judge.

        Returns:
            An ``EvalResult`` whose scores, feedbacks and trajectories are
            in minibatch order.

        Raises:
            ValueError: If the judge does not return exactly one
                (score, feedback) result per pair.
        """
        # Step 1: execution, one executor call per example.
        outputs: list[str] = [
            self._executor.execute(prompt, example.input_text)
            for example in minibatch
        ]

        # Step 2: judgement of every (input, output) pair.
        pairs = [(ex.input_text, out) for ex, out in zip(minibatch, outputs)]
        judge_results = self._judge.judge_batch(task_description, pairs)

        # Fail loudly on a count mismatch. Indexing would otherwise raise an
        # opaque IndexError when results are missing, and silently drop any
        # surplus results.
        if len(judge_results) != len(pairs):
            raise ValueError(
                f"Judge returned {len(judge_results)} result(s) "
                f"for {len(pairs)} (input, output) pair(s)."
            )

        # Step 3: build trajectories.
        scores: list[float] = []
        feedbacks: list[str] = []
        trajectories: list[Trajectory] = []
        for (input_text, output), (score, feedback) in zip(pairs, judge_results):
            scores.append(score)
            feedbacks.append(feedback)
            trajectories.append(
                Trajectory(
                    input_text=input_text,
                    output_text=output,
                    score=score,
                    feedback=feedback,
                    prompt_used=prompt.text,
                )
            )

        return EvalResult(
            scores=scores,
            feedbacks=feedbacks,
            trajectories=trajectories,
        )
|
||||
174
src/prometheus/application/evolution.py
Normal file
174
src/prometheus/application/evolution.py
Normal file
@@ -0,0 +1,174 @@
|
||||
"""
|
||||
Evolution loop — core PROMETHEUS engine.
|
||||
|
||||
Orchestrates the select → evaluate → propose → accept cycle.
|
||||
Equivalent to GEPAEngine.run(), adapted to work without a valset.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from prometheus.application.bootstrap import SyntheticBootstrap
|
||||
from prometheus.application.evaluator import PromptEvaluator
|
||||
from prometheus.domain.entities import (
|
||||
Candidate,
|
||||
OptimizationState,
|
||||
Prompt,
|
||||
SyntheticExample,
|
||||
)
|
||||
from prometheus.domain.ports import ProposerPort
|
||||
from prometheus.domain.scoring import should_accept
|
||||
|
||||
# Module-level logger named after this module; EvolutionLoop._log writes to it.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EvolutionLoop:
    """Main evolution loop: sample → evaluate → propose → accept.

    Design:
        - Keeps only the best candidate (no full population).
        - Simplifies vs GEPA (no Pareto, no merge).
        - Population support deferred to v2.
    """

    def __init__(
        self,
        evaluator: PromptEvaluator,
        proposer: ProposerPort,
        bootstrap: SyntheticBootstrap,
        max_iterations: int = 30,
        minibatch_size: int = 5,
        perfect_score: float = 1.0,
        verbose: bool = False,
    ):
        """Store collaborators and loop parameters.

        Args:
            evaluator: Runs and judges a prompt on a minibatch.
            proposer: Proposes a new prompt from evaluation trajectories.
            bootstrap: Used here only to sample minibatches from the pool.
            max_iterations: Number of loop iterations to run.
            minibatch_size: Requested size of every sampled minibatch.
            perfect_score: Per-example score at or above which an example
                counts as perfect.
            verbose: When true, progress messages are logged at INFO level.
        """
        self._evaluator = evaluator
        self._proposer = proposer
        self._bootstrap = bootstrap
        self._max_iterations = max_iterations
        self._minibatch_size = minibatch_size
        self._perfect_score = perfect_score
        self._verbose = verbose

    def run(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
    ) -> OptimizationState:
        """Execute the complete evolution loop.

        The seed prompt is evaluated once on its own minibatch (outside the
        per-iteration error handling, so a failure there propagates). Each
        iteration then samples a fresh minibatch, re-evaluates the current
        best prompt on it, asks the proposer for a new prompt, evaluates
        that on the same minibatch, and keeps it only if ``should_accept``
        agrees. Any exception inside an iteration is recorded in the
        history as an ``"error"`` event and the loop moves on.

        Args:
            seed_prompt: Starting prompt.
            synthetic_pool: Pool the minibatches are sampled from.
            task_description: Passed through to evaluator and proposer.

        Returns:
            The final ``OptimizationState`` (best candidate, accepted
            candidates, history and LLM-call count).
        """
        state = OptimizationState()

        # Evaluate the seed.
        initial_batch = self._bootstrap.sample_minibatch(
            synthetic_pool, self._minibatch_size
        )
        initial_eval = self._evaluate(
            state, seed_prompt, initial_batch, task_description
        )

        best_candidate = Candidate(
            prompt=seed_prompt,
            best_score=initial_eval.total_score,
            generation=0,
        )
        state.best_candidate = best_candidate
        state.candidates.append(best_candidate)
        self._log(f"Initial score: {initial_eval.total_score:.2f}")

        # Main loop.
        for i in range(1, self._max_iterations + 1):
            state.iteration = i

            try:
                # 1. Sample a fresh minibatch.
                batch = self._bootstrap.sample_minibatch(
                    synthetic_pool, self._minibatch_size
                )

                # 2. Evaluate the current candidate.
                current_eval = self._evaluate(
                    state, best_candidate.prompt, batch, task_description
                )

                # 3. Skip if perfect: nothing for the proposer to fix.
                if all(s >= self._perfect_score for s in current_eval.scores):
                    self._log(f"Iter {i}: All scores perfect, skipping.")
                    state.history.append(
                        {
                            "iteration": i,
                            "event": "skip_perfect",
                            "current_score": current_eval.total_score,
                        }
                    )
                    continue

                # 4. Propose a new prompt (reflective mutation).
                new_prompt = self._proposer.propose(
                    best_candidate.prompt,
                    current_eval.trajectories,
                    task_description,
                )
                state.total_llm_calls += 1  # 1 proposition call

                # 5. Evaluate the new prompt on the same minibatch, so both
                #    scores are measured on identical inputs.
                new_eval = self._evaluate(
                    state, new_prompt, batch, task_description
                )

                # 6. Accept or reject.
                if should_accept(current_eval, new_eval):
                    best_candidate = Candidate(
                        prompt=new_prompt,
                        best_score=new_eval.total_score,
                        generation=i,
                        # id() of the previous best: an in-process object
                        # identity, not a persistent identifier.
                        parent_id=id(best_candidate),
                    )
                    state.best_candidate = best_candidate
                    state.candidates.append(best_candidate)
                    self._log(
                        f"Iter {i}: ACCEPTED "
                        f"({current_eval.total_score:.2f} -> {new_eval.total_score:.2f})"
                    )
                    state.history.append(
                        {
                            "iteration": i,
                            "event": "accepted",
                            "old_score": current_eval.total_score,
                            "new_score": new_eval.total_score,
                            "improvement": new_eval.total_score
                            - current_eval.total_score,
                        }
                    )
                else:
                    self._log(
                        f"Iter {i}: REJECTED "
                        f"({new_eval.total_score:.2f} <= {current_eval.total_score:.2f})"
                    )
                    state.history.append(
                        {
                            "iteration": i,
                            "event": "rejected",
                            "old_score": current_eval.total_score,
                            "new_score": new_eval.total_score,
                        }
                    )

            except Exception as exc:
                # Best-effort loop: a failed iteration is recorded in the
                # history and the run continues with the current best.
                self._log(f"Iter {i}: ERROR — {exc}. Skipping iteration.")
                state.history.append(
                    {
                        "iteration": i,
                        "event": "error",
                        "error": str(exc),
                    }
                )
                continue

        return state

    def _evaluate(
        self,
        state: OptimizationState,
        prompt: Prompt,
        batch: list[SyntheticExample],
        task_description: str,
    ):
        """Evaluate ``prompt`` on ``batch`` and account for the LLM calls.

        The count is added only after the evaluation returns, so a failed
        evaluation contributes nothing. It is based on the actual batch
        length rather than the configured minibatch size, because sampling
        caps the batch at the pool size.

        Returns:
            Whatever the evaluator's ``evaluate`` returns.
        """
        result = self._evaluator.evaluate(prompt, batch, task_description)
        # One execution plus one judge call per example in the batch.
        state.total_llm_calls += 2 * len(batch)
        return result

    def _log(self, msg: str) -> None:
        """Log ``msg`` at INFO level, but only when verbose mode is on."""
        if self._verbose:
            logger.info("[PROMETHEUS] %s", msg)
|
||||
77
src/prometheus/application/use_cases.py
Normal file
77
src/prometheus/application/use_cases.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""
|
||||
Main use case — high-level orchestration.
|
||||
|
||||
Entry point for business logic. Coordinates bootstrap → evolution → result.
|
||||
Contains no technical logic, only orchestration.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from prometheus.application.bootstrap import SyntheticBootstrap
|
||||
from prometheus.application.dto import OptimizationConfig, OptimizationResult
|
||||
from prometheus.application.evaluator import PromptEvaluator
|
||||
from prometheus.application.evolution import EvolutionLoop
|
||||
from prometheus.domain.entities import Prompt
|
||||
from prometheus.domain.ports import ProposerPort
|
||||
|
||||
|
||||
class OptimizePromptUseCase:
    """Single MVP use case: bootstrap a pool, evolve the prompt, report.

    Collaborators are injected through the constructor (dependency
    injection); this class only orchestrates them.
    """

    def __init__(
        self,
        evaluator: PromptEvaluator,
        proposer: ProposerPort,
        bootstrap: SyntheticBootstrap,
    ):
        self._evaluator = evaluator
        self._proposer = proposer
        self._bootstrap = bootstrap

    def execute(self, config: OptimizationConfig) -> OptimizationResult:
        """Run the full pipeline and summarize it.

        Phases:
            0. Bootstrap → generate the synthetic input pool.
            1. Evolution → run the optimization loop from the seed prompt.
            2. Result → fold the final state into an ``OptimizationResult``.
        """
        # Phase 0: Bootstrap
        pool = self._bootstrap.run(
            task_description=config.task_description,
            n_examples=config.n_synthetic_inputs,
        )

        # Phase 1: Evolution
        engine = EvolutionLoop(
            evaluator=self._evaluator,
            proposer=self._proposer,
            bootstrap=self._bootstrap,
            max_iterations=config.max_iterations,
            minibatch_size=config.minibatch_size,
            perfect_score=config.perfect_score,
            verbose=config.verbose,
        )
        state = engine.run(
            Prompt(text=config.seed_prompt), pool, config.task_description
        )

        # Phase 2: Result
        # The first recorded candidate is the seed; fall back to 0.0 / the
        # seed text when the state holds no candidates.
        best = state.best_candidate
        if state.candidates:
            initial_score = state.candidates[0].best_score
        else:
            initial_score = 0.0
        final_score = best.best_score if best else 0.0
        optimized_text = best.prompt.text if best else config.seed_prompt

        return OptimizationResult(
            optimized_prompt=optimized_text,
            initial_prompt=config.seed_prompt,
            iterations_used=state.iteration,
            total_llm_calls=state.total_llm_calls + 1,  # +1 for bootstrap
            initial_score=initial_score,
            final_score=final_score,
            improvement=final_score - initial_score,
            history=state.history,
        )
|
||||
0
src/prometheus/cli/__init__.py
Normal file
0
src/prometheus/cli/__init__.py
Normal file
168
src/prometheus/cli/app.py
Normal file
168
src/prometheus/cli/app.py
Normal file
@@ -0,0 +1,168 @@
|
||||
"""
|
||||
CLI — user entry point.
|
||||
|
||||
Typer interface with -i (input) and -o (output) options.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
from dataclasses import asdict
|
||||
|
||||
import dspy
|
||||
import typer
|
||||
from rich.console import Console
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
|
||||
from prometheus.application.bootstrap import SyntheticBootstrap
|
||||
from prometheus.application.dto import OptimizationConfig, OptimizationResult
|
||||
from prometheus.application.evaluator import PromptEvaluator
|
||||
from prometheus.application.use_cases import OptimizePromptUseCase
|
||||
from prometheus.infrastructure.file_io import YamlPersistence
|
||||
from prometheus.infrastructure.judge_adapter import DSPyJudgeAdapter
|
||||
from prometheus.infrastructure.llm_adapter import DSPyLLMAdapter
|
||||
from prometheus.infrastructure.proposer_adapter import DSPyProposerAdapter
|
||||
from prometheus.infrastructure.synth_adapter import DSPySyntheticAdapter
|
||||
|
||||
# Typer application object; the commands below register themselves on it.
app = typer.Typer(
    name="prometheus",
    help="PROMETHEUS — Prompt evolution without reference data.",
    no_args_is_help=True,
)

# Shared Rich console used for all terminal output in this module.
console = Console()
|
||||
|
||||
|
||||
@app.command()
def optimize(
    input: str = typer.Option(
        ...,
        "-i",
        "--input",
        help="Path to input YAML config file.",
        exists=True,
        readable=True,
    ),
    output: str = typer.Option(
        "output.yaml",
        "-o",
        "--output",
        help="Path to output YAML result file.",
    ),
    verbose: bool = typer.Option(
        False,
        "-v",
        "--verbose",
        help="Print detailed progress.",
    ),
) -> None:
    """Optimize a prompt without any reference data.

    Usage:
        prometheus optimize -i config.yaml -o result.yaml
    """
    # NOTE: the docstring above doubles as the command's help text, so
    # maintainer-facing documentation lives in comments instead.
    #
    # Flow: load and validate the YAML config → configure DSPy → wire the
    # adapters into the use case → run it → display and save the result.

    # Configure verbose logging
    if verbose:
        logging.basicConfig(level=logging.INFO, format="[PROMETHEUS] %(message)s")

    console.print(
        Panel.fit(
            "PROMETHEUS — Prompt Evolution Engine",
            subtitle="No reference data required",
        )
    )

    # 1. Load config
    persistence = YamlPersistence()
    raw_config = persistence.read_config(input)

    # An empty or non-mapping YAML file would otherwise surface as an opaque
    # TypeError on the first key access; report it as a usage error instead.
    if not isinstance(raw_config, dict):
        raise typer.BadParameter(
            "Config file must contain a YAML mapping at the top level.",
            param_hint="--input",
        )
    missing = [
        key for key in ("seed_prompt", "task_description") if key not in raw_config
    ]
    if missing:
        raise typer.BadParameter(
            f"Config file is missing required key(s): {', '.join(missing)}",
            param_hint="--input",
        )

    config = OptimizationConfig(
        seed_prompt=raw_config["seed_prompt"],
        task_description=raw_config["task_description"],
        task_model=raw_config.get("task_model", "openai/gpt-4o-mini"),
        judge_model=raw_config.get("judge_model", "openai/gpt-4o"),
        proposer_model=raw_config.get("proposer_model", "openai/gpt-4o"),
        synth_model=raw_config.get("synth_model", "openai/gpt-4o"),
        max_iterations=raw_config.get("max_iterations", 30),
        n_synthetic_inputs=raw_config.get("n_synthetic_inputs", 20),
        minibatch_size=raw_config.get("minibatch_size", 5),
        # Previously not read from the file, so it could never be overridden.
        perfect_score=raw_config.get("perfect_score", 1.0),
        seed=raw_config.get("seed", 42),
        output_path=output,
        verbose=verbose,
    )
    console.print(f"[dim]Task: {config.task_description[:80]}...[/dim]")
    console.print(f"[dim]Seed prompt: {config.seed_prompt[:80]}...[/dim]")

    # 2. Configure DSPy with optional api_base/api_key from config
    lm_kwargs: dict = {}
    api_base = raw_config.get("api_base")
    api_key_env = raw_config.get("api_key_env")
    if api_base:
        lm_kwargs["api_base"] = api_base
    if api_key_env:
        api_key = os.environ.get(api_key_env, "")
        if not api_key:
            # Keep the original best-effort behavior (empty key), but make
            # the likely misconfiguration visible.
            console.print(
                f"[yellow]Warning: environment variable {api_key_env} "
                "is not set; using an empty API key.[/yellow]"
            )
        lm_kwargs["api_key"] = api_key

    # NOTE(review): only task_model is turned into a dspy.LM and installed as
    # the global LM here. judge_model, proposer_model and synth_model are
    # stored in the config but not used in this function — confirm whether
    # the adapters are meant to pick per-role models.
    task_lm = dspy.LM(config.task_model, **lm_kwargs)
    dspy.configure(lm=task_lm)

    # 3. Build adapters (Dependency Injection)
    synth_adapter = DSPySyntheticAdapter()
    llm_adapter = DSPyLLMAdapter(model=config.task_model)
    judge_adapter = DSPyJudgeAdapter()
    proposer_adapter = DSPyProposerAdapter()
    bootstrap = SyntheticBootstrap(generator=synth_adapter, seed=config.seed)
    evaluator = PromptEvaluator(executor=llm_adapter, judge=judge_adapter)
    use_case = OptimizePromptUseCase(
        evaluator=evaluator,
        proposer=proposer_adapter,
        bootstrap=bootstrap,
    )

    # 4. Execute
    with console.status("[bold green]Evolving prompt..."):
        result = use_case.execute(config)

    # 5. Display results
    _display_result(result)

    # 6. Save
    _save_result(persistence, output, result)
    console.print(f"\n[green]Results saved to {output}[/green]")
|
||||
|
||||
|
||||
def _display_result(result: OptimizationResult) -> None:
    """Print the optimized prompt and a metrics table to the console."""
    console.print()

    prompt_panel = Panel(
        f"[bold green]Optimized Prompt[/bold green]\n\n{result.optimized_prompt}",
        title="Result",
    )
    console.print(prompt_panel)

    metrics = Table(title="Metrics")
    metrics.add_column("Metric", style="cyan")
    metrics.add_column("Value", style="bold")

    # (label, formatted value) pairs, in display order.
    rows = (
        ("Initial Score", f"{result.initial_score:.2f}"),
        ("Final Score", f"{result.final_score:.2f}"),
        ("Improvement", f"{result.improvement:+.2f}"),
        ("Iterations", str(result.iterations_used)),
        ("LLM Calls", str(result.total_llm_calls)),
    )
    for label, value in rows:
        metrics.add_row(label, value)

    console.print(metrics)
|
||||
|
||||
|
||||
def _save_result(
|
||||
persistence: YamlPersistence,
|
||||
path: str,
|
||||
result: OptimizationResult,
|
||||
) -> None:
|
||||
"""Save the result as YAML."""
|
||||
persistence.write_result(path, asdict(result))
|
||||
|
||||
|
||||
@app.command(hidden=True)
def _help() -> None:
    """Internal placeholder to force multi-command Typer behavior."""
    # Deliberately a no-op: it exists so the app has a second registered
    # command, and it is hidden from the help listing.
    return None
|
||||
|
||||
|
||||
# Run the Typer app when this module is executed as a script.
if __name__ == "__main__":
    app()
|
||||
12
src/prometheus/config.py
Normal file
12
src/prometheus/config.py
Normal file
@@ -0,0 +1,12 @@
|
||||
"""Application settings."""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class AppSettings:
    """Non-sensitive application settings, hardcoded for the MVP."""

    # Application name.
    app_name: str = "prometheus"
    # Application version string.
    version: str = "0.1.0"
|
||||
0
src/prometheus/domain/__init__.py
Normal file
0
src/prometheus/domain/__init__.py
Normal file
87
src/prometheus/domain/entities.py
Normal file
87
src/prometheus/domain/entities.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""Domain entities — pure data, zero dependencies."""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Prompt:
    """Represents a candidate prompt.

    ``frozen=True`` makes the fields non-reassignable, and the class is
    hashable so prompts can be used in sets and as dict keys (e.g. for
    Pareto tracking). Equality compares both ``text`` and ``metadata``;
    the hash is derived from ``text`` only.

    Note that, because ``__len__`` is defined and there is no ``__bool__``,
    a prompt with empty text is falsy.
    """

    text: str
    # A dict is unhashable, so including it in the generated __hash__ would
    # make hash(prompt) raise TypeError. hash=False keeps it in equality
    # comparisons while leaving it out of the hash, which stays consistent
    # (equal prompts always share the same text).
    metadata: dict[str, Any] = field(default_factory=dict, hash=False)

    def __len__(self) -> int:
        """Return the length of the prompt text in characters."""
        return len(self.text)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SyntheticExample:
    """One synthetic input generated from the task description.

    There is no expected output; the judge evaluates the produced output
    directly.
    """

    # The generated input text.
    input_text: str
    # Category label, reserved for future stratified sampling.
    category: str = "default"
    # Numeric identifier of the example.
    id: int = 0
|
||||
|
||||
|
||||
@dataclass
class Trajectory:
    """Execution trace of one prompt on one input.

    Reflective mutation uses these traces to understand failures.
    """

    # Input given to the prompt.
    input_text: str
    # Output produced for that input.
    output_text: str
    # Score assigned to the output.
    score: float
    # Textual feedback from the judge.
    feedback: str
    # Text of the prompt that produced the output.
    prompt_used: str
|
||||
|
||||
|
||||
@dataclass
class EvalResult:
    """Scores, feedback and traces from evaluating one minibatch."""

    scores: list[float]
    feedbacks: list[str]
    trajectories: list[Trajectory]

    @property
    def total_score(self) -> float:
        """Sum of all per-example scores (0 for an empty minibatch)."""
        return sum(self.scores)

    @property
    def mean_score(self) -> float:
        """Average per-example score, or 0.0 when there are no scores."""
        if not self.scores:
            return 0.0
        return sum(self.scores) / len(self.scores)
|
||||
|
||||
|
||||
@dataclass
class Candidate:
    """A candidate in the evolution pool: a prompt plus its bookkeeping."""

    # The prompt this candidate carries.
    prompt: Prompt
    # Score recorded for this candidate.
    best_score: float = 0.0
    # Iteration at which the candidate was created.
    generation: int = 0
    # Identifier of the parent candidate, if any.
    parent_id: int | None = None
|
||||
|
||||
|
||||
@dataclass
class OptimizationState:
    """Complete optimization state — serializable snapshot."""

    # Index of the latest iteration.
    iteration: int = 0
    # Current best candidate, or None before one has been recorded.
    best_candidate: Candidate | None = None
    # Recorded candidates, in order of insertion.
    candidates: list[Candidate] = field(default_factory=list)
    # Pool of synthetic examples.
    synthetic_pool: list[SyntheticExample] = field(default_factory=list)
    # Event records, one dict per entry.
    history: list[dict[str, Any]] = field(default_factory=list)
    # Running count of LLM calls.
    total_llm_calls: int = 0
|
||||
85
src/prometheus/domain/ports.py
Normal file
85
src/prometheus/domain/ports.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""
|
||||
Domain ports — abstract interfaces that infrastructure implements.
|
||||
Uses ABCs (abstract base classes) for loose coupling.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from typing import Any
|
||||
|
||||
from prometheus.domain.entities import Prompt, SyntheticExample, Trajectory
|
||||
|
||||
|
||||
class LLMPort(ABC):
    """Port for executing a prompt on an input.

    The infrastructure layer supplies the concrete implementation (via
    DSPy).
    """

    @abstractmethod
    def execute(self, prompt: Prompt, input_text: str) -> str:
        """Run ``prompt`` on ``input_text`` and return the raw response."""
|
||||
|
||||
|
||||
class JudgePort(ABC):
    """Port for LLM-as-Judge evaluation.

    Receives the task description together with (input, output) pairs and
    returns one score plus textual feedback per pair.
    """

    @abstractmethod
    def judge_batch(
        self,
        task_description: str,
        pairs: list[tuple[str, str]],
    ) -> list[tuple[float, str]]:
        """Evaluate a batch of (input, output) pairs.

        Returns:
            A list of (score, feedback) tuples.
        """
|
||||
|
||||
|
||||
class ProposerPort(ABC):
    """Port for proposing a new prompt.

    Implementations use evaluation trajectories to suggest an improvement.
    """

    @abstractmethod
    def propose(
        self,
        current_prompt: Prompt,
        trajectories: list[Trajectory],
        task_description: str,
    ) -> Prompt:
        """Propose a new prompt based on failure trajectories."""
|
||||
|
||||
|
||||
class SyntheticGeneratorPort(ABC):
    """Port for generating synthetic inputs from a task description."""

    @abstractmethod
    def generate_inputs(
        self,
        task_description: str,
        n_examples: int,
    ) -> list[SyntheticExample]:
        """Generate N diverse synthetic inputs."""
|
||||
|
||||
|
||||
class PersistencePort(ABC):
    """Port for reading config files and writing result files."""

    @abstractmethod
    def read_config(self, path: str) -> dict[str, Any]:
        """Load the configuration stored at ``path`` as a dict."""

    @abstractmethod
    def write_result(self, path: str, data: dict[str, Any]) -> None:
        """Write the ``data`` dict to ``path``."""
|
||||
21
src/prometheus/domain/scoring.py
Normal file
21
src/prometheus/domain/scoring.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""Scoring logic and acceptance criteria — pure domain."""
|
||||
from __future__ import annotations
|
||||
|
||||
from prometheus.domain.entities import EvalResult
|
||||
|
||||
|
||||
def should_accept(
    old_result: EvalResult,
    new_result: EvalResult,
    min_improvement: float = 0.0,
) -> bool:
    """Strict acceptance criterion.

    The new candidate is accepted only when its total score is strictly
    greater than the old total score plus ``min_improvement``; a tie is a
    rejection.
    """
    threshold = old_result.total_score + min_improvement
    return new_result.total_score > threshold
|
||||
|
||||
|
||||
def normalize_score(raw: float, min_val: float = 0.0, max_val: float = 1.0) -> float:
    """Clamp a score within [min_val, max_val].

    Args:
        raw: Score to clamp.
        min_val: Lower bound of the allowed range.
        max_val: Upper bound of the allowed range.

    Returns:
        ``raw`` limited to the range. A NaN input returns ``min_val``.
    """
    import math  # local import: only needed for the NaN guard

    # Every ordering comparison with NaN is false, so min(max_val, nan)
    # evaluates to max_val and a NaN would come out as the top score.
    # Map it to the bottom of the range so an invalid score never counts
    # as perfect.
    if math.isnan(raw):
        return min_val
    return max(min_val, min(max_val, raw))
|
||||
0
src/prometheus/infrastructure/__init__.py
Normal file
0
src/prometheus/infrastructure/__init__.py
Normal file
92
src/prometheus/infrastructure/dspy_modules.py
Normal file
92
src/prometheus/infrastructure/dspy_modules.py
Normal file
@@ -0,0 +1,92 @@
|
||||
"""
|
||||
DSPy Modules — signature composition.
|
||||
|
||||
Declarative LLM call orchestration via DSPy.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
|
||||
import dspy
|
||||
|
||||
from prometheus.infrastructure.dspy_signatures import (
|
||||
GenerateSyntheticInputs,
|
||||
JudgeOutput,
|
||||
ProposeInstruction,
|
||||
)
|
||||
|
||||
|
||||
class SyntheticInputGenerator(dspy.Module):
    """Generate synthetic inputs in a single batch call.

    Uses ChainOfThought for better diversity.
    """

    def __init__(self) -> None:
        super().__init__()
        self.generate = dspy.ChainOfThought(GenerateSyntheticInputs)

    def forward(self, task_description: str, n_examples: int) -> dspy.Prediction:
        """Run the generation call and parse its output into a list.

        The ``examples`` output is expected to be a JSON array. Anything
        else — invalid JSON, a non-string payload, or valid JSON that is
        not a list — is handed to the regex fallback, so the returned
        ``examples`` is always a list.

        Args:
            task_description: Task to generate inputs for.
            n_examples: Number of examples requested.

        Returns:
            A prediction whose ``examples`` attribute is a list.
        """
        result = self.generate(
            task_description=task_description,
            n_examples=n_examples,
        )
        raw = result.examples
        try:
            parsed = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            # TypeError covers a payload that is not str/bytes at all.
            parsed = None

        if isinstance(parsed, list):
            examples = parsed
        else:
            # A JSON object, string or number is valid JSON but not the
            # array this step must produce; recover what we can from the
            # raw text instead of passing the wrong type downstream.
            examples = self._parse_fallback(str(raw))
        return dspy.Prediction(examples=examples)

    @staticmethod
    def _parse_fallback(text: str) -> list[str]:
        """Extract double-quoted strings from non-JSON output.

        Returns every non-empty double-quoted substring; when there is
        none, the whole text is returned as a single example.
        """
        matches = re.findall(r'"([^"]+)"', text)
        return matches if matches else [text]
|
||||
|
||||
|
||||
class OutputJudge(dspy.Module):
    """Judge a single output with one ChainOfThought call.

    Batching over several pairs is left to the caller.
    """

    def __init__(self) -> None:
        super().__init__()
        self.judge = dspy.ChainOfThought(JudgeOutput)

    def forward(
        self, task_description: str, input_text: str, output_text: str
    ) -> dspy.Prediction:
        """Score one (input, output) pair.

        Args:
            task_description: What the assistant is supposed to do.
            input_text: Input that was given to the assistant.
            output_text: Assistant response to evaluate.

        Returns:
            A prediction with ``score`` clamped to [0.0, 1.0] and the
            judge's ``feedback`` passed through unchanged. A score that
            cannot be converted to a number, or that is NaN, becomes 0.5.
        """
        import math  # local import: only needed for the NaN guard

        result = self.judge(
            task_description=task_description,
            input_text=input_text,
            output_text=output_text,
        )
        try:
            score = float(result.score)
        except (ValueError, TypeError):
            score = 0.5  # neutral fallback
        # float("nan") parses without error, and min(1.0, nan) evaluates to
        # 1.0, so an unchecked NaN would be clamped to a perfect score.
        if math.isnan(score):
            score = 0.5
        score = max(0.0, min(1.0, score))
        return dspy.Prediction(score=score, feedback=result.feedback)
|
||||
|
||||
|
||||
class InstructionProposer(dspy.Module):
    """Propose a new prompt from failure trajectories.

    Equivalent to GEPA's InstructionProposalSignature.
    """

    def __init__(self) -> None:
        super().__init__()
        self.propose = dspy.ChainOfThought(ProposeInstruction)

    def forward(
        self,
        current_instruction: str,
        task_description: str,
        failure_examples: str,
    ) -> dspy.Prediction:
        """Run the proposal call and return only the new instruction.

        The result is re-wrapped so the returned prediction carries just
        the ``new_instruction`` field.
        """
        proposal = self.propose(
            current_instruction=current_instruction,
            task_description=task_description,
            failure_examples=failure_examples,
        )
        improved = proposal.new_instruction
        return dspy.Prediction(new_instruction=improved)
|
||||
79
src/prometheus/infrastructure/dspy_signatures.py
Normal file
79
src/prometheus/infrastructure/dspy_signatures.py
Normal file
@@ -0,0 +1,79 @@
|
||||
"""
|
||||
DSPy Signatures — declarative LLM contracts.
|
||||
|
||||
Defines WHAT each LLM call does, not HOW.
|
||||
DSPy Signature = input_fields → output_fields + instruction.
|
||||
DSPy handles prompting, parsing, and structuring.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import dspy
|
||||
|
||||
|
||||
class GenerateSyntheticInputs(dspy.Signature):
    """Generate diverse, realistic input examples for a given task."""

    # NOTE: the docstring and every ``desc`` below are prompt text, not
    # documentation; change them only deliberately.

    # Inputs.
    task_description: str = dspy.InputField(
        desc="Description of the task the prompt should accomplish."
    )
    n_examples: int = dspy.InputField(desc="Number of examples to generate.")

    # Output: declared as a string; the JSON is parsed by the caller.
    examples: str = dspy.OutputField(
        desc=(
            "A JSON array of strings, each being a realistic input for the task. "
            "Cover: normal cases, edge cases, long inputs, short inputs, "
            "ambiguous cases, and tricky scenarios."
        ),
    )
|
||||
|
||||
|
||||
class JudgeOutput(dspy.Signature):
    """Evaluate the quality of an LLM output for a given task and input.

    Score: 0.0 (completely wrong) to 1.0 (perfect).
    Feedback: specific, actionable criticism.
    """

    # NOTE: the docstring and every ``desc`` below are prompt text, not
    # documentation; change them only deliberately.

    # Inputs.
    task_description: str = dspy.InputField(
        desc="What the assistant is supposed to do."
    )
    input_text: str = dspy.InputField(desc="The input provided to the assistant.")
    output_text: str = dspy.InputField(desc="The assistant's response to evaluate.")

    # Outputs.
    score: float = dspy.OutputField(
        desc="Quality score from 0.0 (wrong) to 1.0 (perfect)."
    )
    feedback: str = dspy.OutputField(
        desc=(
            "Specific, actionable feedback explaining what's wrong with the output "
            "and how to improve it. Be critical."
        ),
    )
|
||||
|
||||
|
||||
class ProposeInstruction(dspy.Signature):
    """Given a current prompt and examples of where it fails with feedback,
    propose an improved version of the prompt.

    The new prompt should address all the issues identified in the feedback.
    """

    # NOTE: the docstring above is the proposer's instruction and the `desc`
    # strings below are part of the prompt; they are runtime text, not comments.

    # --- Inputs -----------------------------------------------------------
    current_instruction: str = dspy.InputField(
        desc="The current prompt/instruction to improve.",
    )
    task_description: str = dspy.InputField(desc="Description of the task.")
    # A single pre-formatted text report; this signature does not structure it.
    failure_examples: str = dspy.InputField(
        desc=(
            "Examples of inputs, outputs, scores, and feedback showing where "
            "the current instruction fails."
        ),
    )

    # --- Output -----------------------------------------------------------
    new_instruction: str = dspy.OutputField(
        desc="An improved version of the instruction.",
    )
|
||||
25
src/prometheus/infrastructure/file_io.py
Normal file
25
src/prometheus/infrastructure/file_io.py
Normal file
@@ -0,0 +1,25 @@
|
||||
"""
|
||||
File I/O — read/write config and result files.
|
||||
|
||||
Implements the PersistencePort with YAML.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
|
||||
from prometheus.domain.ports import PersistencePort
|
||||
|
||||
|
||||
class YamlPersistence(PersistencePort):
    """Reads configuration from, and writes results to, YAML files."""

    def read_config(self, path: str) -> dict[str, Any]:
        """Load the top-level mapping stored in the YAML file at ``path``.

        Args:
            path: Location of a UTF-8 encoded YAML file.

        Returns:
            The parsed top-level mapping. An empty dict is returned when the
            file contains no YAML document (empty or comment-only file).

        Raises:
            OSError: If the file cannot be opened.
            yaml.YAMLError: If the content is not valid YAML.
            ValueError: If the top-level YAML node is not a mapping.
        """
        with open(path, encoding="utf-8") as f:
            data = yaml.safe_load(f)

        # safe_load yields None for an empty document; honour the declared
        # return type instead of leaking None to callers.
        if data is None:
            return {}
        # A list or scalar at the top level cannot be used as a config
        # mapping; fail here with a clear message rather than further away.
        if not isinstance(data, dict):
            raise ValueError(
                f"Expected a YAML mapping at the top level of {path!r}, "
                f"got {type(data).__name__}"
            )
        return data

    def write_result(self, path: str, data: dict[str, Any]) -> None:
        """Serialize ``data`` as block-style YAML into ``path``.

        The file is created or overwritten and written as UTF-8, with
        non-ASCII characters emitted as-is rather than escaped.

        Args:
            path: Destination file path.
            data: Mapping to serialize.

        Raises:
            OSError: If the file cannot be opened for writing.
        """
        with open(path, "w", encoding="utf-8") as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True)
|
||||
34
src/prometheus/infrastructure/judge_adapter.py
Normal file
34
src/prometheus/infrastructure/judge_adapter.py
Normal file
@@ -0,0 +1,34 @@
|
||||
"""
|
||||
Adapter: LLM-as-Judge.
|
||||
|
||||
Implements the JudgePort via the DSPy OutputJudge module.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from prometheus.domain.ports import JudgePort
|
||||
from prometheus.infrastructure.dspy_modules import OutputJudge
|
||||
|
||||
|
||||
class DSPyJudgeAdapter(JudgePort):
    """JudgePort implementation backed by the DSPy ``OutputJudge`` module.

    Each (input, output) pair is judged with its own call, one after another.
    Sequential for the MVP; a future version could fan out via dspy.Parallel.
    """

    def __init__(self) -> None:
        """Create the underlying judge module."""
        self._judge = OutputJudge()

    def judge_batch(
        self,
        task_description: str,
        pairs: list[tuple[str, str]],
    ) -> list[tuple[float, str]]:
        """Score every (input, output) pair against the task description.

        Args:
            task_description: What the assistant was supposed to do.
            pairs: ``(input_text, output_text)`` tuples to evaluate.

        Returns:
            One ``(score, feedback)`` tuple per pair, in the same order.
        """

        def grade(source: str, response: str) -> tuple[float, str]:
            # One judge invocation per pair; the same task description is
            # passed every time.
            verdict = self._judge(
                task_description=task_description,
                input_text=source,
                output_text=response,
            )
            return (verdict.score, verdict.feedback)

        return [grade(source, response) for source, response in pairs]
|
||||
32
src/prometheus/infrastructure/llm_adapter.py
Normal file
32
src/prometheus/infrastructure/llm_adapter.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""
|
||||
Adapter: Execute a prompt on an input.
|
||||
|
||||
Implements the LLMPort via DSPy.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import dspy
|
||||
|
||||
from prometheus.domain.entities import Prompt
|
||||
from prometheus.domain.ports import LLMPort
|
||||
|
||||
|
||||
class DSPyLLMAdapter(LLMPort):
    """LLMPort implementation that runs a prompt through ``dspy.Predict``."""

    class _ExecuteSignature(dspy.Signature):
        """Execute the instruction on the given input."""

        # The docstring above and the `desc` strings below are prompt text
        # consumed by DSPy at runtime; keep them verbatim.
        instruction: str = dspy.InputField(
            desc="The instruction/prompt to follow.",
        )
        input_text: str = dspy.InputField(
            desc="The input to process.",
        )
        output: str = dspy.OutputField(
            desc="The response following the instruction.",
        )

    def __init__(self, model: str) -> None:
        """Build the predictor used by :meth:`execute`.

        Args:
            model: Model identifier supplied by the caller.
        """
        # NOTE(review): `model` is accepted but never read in this class;
        # presumably the language model is configured elsewhere — confirm.
        self._predictor = dspy.Predict(self._ExecuteSignature)

    def execute(self, prompt: Prompt, input_text: str) -> str:
        """Apply ``prompt`` to ``input_text`` and return the response text.

        Args:
            prompt: Prompt whose ``text`` is passed as the instruction.
            input_text: The input the instruction is applied to.

        Returns:
            The prediction's ``output`` field, coerced to ``str``.
        """
        prediction = self._predictor(instruction=prompt.text, input_text=input_text)
        return str(prediction.output)
|
||||
47
src/prometheus/infrastructure/proposer_adapter.py
Normal file
47
src/prometheus/infrastructure/proposer_adapter.py
Normal file
@@ -0,0 +1,47 @@
|
||||
"""
|
||||
Adapter: Reflective Mutation Proposer.
|
||||
|
||||
Implements the ProposerPort via the DSPy InstructionProposer.
|
||||
Converts trajectories into readable format for the LLM proposer.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from prometheus.domain.entities import Prompt, Trajectory
|
||||
from prometheus.domain.ports import ProposerPort
|
||||
from prometheus.infrastructure.dspy_modules import InstructionProposer
|
||||
|
||||
|
||||
class DSPyProposerAdapter(ProposerPort):
    """ProposerPort implementation backed by the DSPy ``InstructionProposer``.

    Renders evaluation trajectories as a text report and asks the proposer
    for a replacement prompt.
    """

    def __init__(self) -> None:
        """Create the underlying proposer module."""
        self._proposer = InstructionProposer()

    def propose(
        self,
        current_prompt: Prompt,
        trajectories: list[Trajectory],
        task_description: str,
    ) -> Prompt:
        """Ask the proposer for a new prompt based on evaluated trajectories.

        Args:
            current_prompt: The prompt being improved.
            trajectories: Evaluated runs (input, output, score, feedback).
            task_description: Description of the task the prompt serves.

        Returns:
            A new ``Prompt`` wrapping the proposer's ``new_instruction``.
        """
        report = self._format_failures(trajectories)
        proposal = self._proposer(
            current_instruction=current_prompt.text,
            task_description=task_description,
            failure_examples=report,
        )
        return Prompt(text=proposal.new_instruction)

    @staticmethod
    def _format_failures(trajectories: list[Trajectory]) -> str:
        """Render trajectories as numbered, markdown-style sections.

        Sections are numbered from 1 and separated by a ``---`` line; scores
        are printed with two decimals. An empty list yields an empty string.
        """
        rendered = [
            f"# Example {index}\n"
            f"## Input\n{traj.input_text}\n\n"
            f"## Generated Output\n{traj.output_text}\n\n"
            f"## Score\n{traj.score:.2f}\n\n"
            f"## Feedback\n{traj.feedback}\n"
            for index, traj in enumerate(trajectories, start=1)
        ]
        return "\n---\n".join(rendered)
|
||||
34
src/prometheus/infrastructure/synth_adapter.py
Normal file
34
src/prometheus/infrastructure/synth_adapter.py
Normal file
@@ -0,0 +1,34 @@
|
||||
"""
|
||||
Adapter: Synthetic input generation.
|
||||
|
||||
Implements the SyntheticGeneratorPort via DSPy.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from prometheus.domain.entities import SyntheticExample
|
||||
from prometheus.domain.ports import SyntheticGeneratorPort
|
||||
from prometheus.infrastructure.dspy_modules import SyntheticInputGenerator
|
||||
|
||||
|
||||
class DSPySyntheticAdapter(SyntheticGeneratorPort):
    """SyntheticGeneratorPort implementation backed by ``SyntheticInputGenerator``.

    The whole pool is requested with a single generator call.
    """

    def __init__(self) -> None:
        """Create the underlying generator module."""
        self._generator = SyntheticInputGenerator()

    def generate_inputs(
        self,
        task_description: str,
        n_examples: int,
    ) -> list[SyntheticExample]:
        """Generate up to ``n_examples`` synthetic inputs for the task.

        Args:
            task_description: Description of the task to generate inputs for.
            n_examples: Requested number of examples; extra items are dropped.

        Returns:
            ``SyntheticExample`` objects whose ``id`` is the 0-based position
            in the (truncated) generator output.
        """
        prediction = self._generator(
            task_description=task_description,
            n_examples=n_examples,
        )
        # NOTE(review): this slice assumes `prediction.examples` is already a
        # sequence of strings. The GenerateSyntheticInputs signature declares
        # `examples` as a JSON *string*, so presumably SyntheticInputGenerator
        # decodes it first — confirm, otherwise this iterates characters.
        kept = prediction.examples[:n_examples]
        return [
            SyntheticExample(input_text=item, id=position)
            for position, item in enumerate(kept)
        ]
|
||||
Reference in New Issue
Block a user