From b9745566c88ba784e4a8b7317f3c67a543642039 Mon Sep 17 00:00:00 2001 From: FullStackDev Date: Sun, 29 Mar 2026 15:40:21 +0000 Subject: [PATCH] feat: custom judge criteria and multi-dimensional scoring Add configurable judge rubrics and multi-dimensional scoring with weighted aggregation. New config fields: judge_criteria (free text) and judge_dimensions (list of {name, weight, description}). CLI --judge-criteria flag provides quick overrides. The judge adapter computes weighted aggregate scores and enriches feedback with per-dimension breakdowns. Co-Authored-By: Paperclip --- src/prometheus/application/dto.py | 108 +++++ src/prometheus/cli/commands/optimize.py | 428 ++++++++++++++++++ src/prometheus/domain/scoring.py | 23 + src/prometheus/infrastructure/dspy_modules.py | 59 ++- .../infrastructure/dspy_signatures.py | 61 +++ .../infrastructure/judge_adapter.py | 55 ++- tests/unit/test_adapter_config.py | 8 +- tests/unit/test_error_handling.py | 39 +- 8 files changed, 754 insertions(+), 27 deletions(-) create mode 100644 src/prometheus/cli/commands/optimize.py diff --git a/src/prometheus/application/dto.py b/src/prometheus/application/dto.py index 931a095..0d6c08f 100644 --- a/src/prometheus/application/dto.py +++ b/src/prometheus/application/dto.py @@ -11,6 +11,15 @@ from pydantic import BaseModel, Field, field_validator, model_validator CONFIG_VERSION = 1 _ERROR_STRATEGY_VALUES = {"skip", "retry", "abort"} +_EVAL_METRIC_VALUES = {"exact", "bleu", "rouge_l", "cosine", "llm_judge"} + + +class JudgeDimension(BaseModel): + """A single evaluation dimension for multi-dimensional scoring.""" + + name: str = Field(min_length=1, description="Dimension name (e.g. accuracy, clarity, safety).") + weight: float = Field(default=1.0, ge=0.0, le=1.0, description="Weight for this dimension (0.0–1.0).") + description: str = Field(default="", description="What this dimension measures.") class OptimizationConfig(BaseModel): @@ -67,6 +76,30 @@ class OptimizationConfig(BaseModel): minibatch_size: int = Field(default=5, ge=1, description="Inputs per evaluation minibatch.") perfect_score: float = Field(default=1.0, ge=0.0, le=1.0) + # --- Population-based evolution --- + population_size: int = Field( + default=1, + ge=1, + description="Number of candidates in the evolution population. 1 = single-candidate hill climbing (backward compat).", + ) + crossover_rate: float = Field( + default=0.5, + ge=0.0, + le=1.0, + description="Probability of applying crossover vs reflective mutation.", + ) + mutation_rate: float = Field( + default=0.3, + ge=0.0, + le=1.0, + description="Probability of applying a mutation operator after crossover or proposal.", + ) + diversity_penalty: float = Field( + default=0.1, + ge=0.0, + description="Penalty weight for similarity to existing population members.", + ) + # --- Reproducibility --- seed: int = Field(default=42, ge=0) @@ -79,10 +112,72 @@ class OptimizationConfig(BaseModel): circuit_breaker_threshold: int = Field(default=5, ge=1, description="Consecutive failures before circuit opens.") error_strategy: str = Field(default="retry", description="Error handling strategy: skip | retry | abort.") + # --- Logging & observability --- + debug: bool = Field(default=False, description="Enable DEBUG-level logging.") + log_format: str = Field(default="text", description="Log output format: text | json.") + log_file: str | None = Field(default=None, description="Optional file path for log output.") + + # --- Checkpoint & resume --- + checkpoint_dir: str | None = Field( + default=None, + description="Directory for checkpoint files. Set to enable checkpointing.", + ) + checkpoint_interval: int = Field( + default=5, + ge=1, + description="Save a checkpoint every N iterations (and on every accepted improvement).", + ) + resume: bool = Field( + default=False, + description="Resume from the latest checkpoint in checkpoint_dir.", + ) + # --- Output --- output_path: str = Field(default="output.yaml", min_length=1) verbose: bool = False + # --- Hold-out validation --- + validation_split: float = Field( + default=0.3, + ge=0.0, + lt=1.0, + description="Fraction of synthetic pool reserved for validation (0 = disabled).", + ) + early_stop_patience: int = Field( + default=5, + ge=1, + description="Stop if validation score degrades for this many consecutive iterations.", + ) + + # --- Judge criteria & multi-dimensional scoring --- + judge_criteria: str | None = Field( + default=None, + description="Custom judge rubric or evaluation criteria override (free text).", + ) + judge_dimensions: list[JudgeDimension] | None = Field( + default=None, + description="Multi-dimensional scoring dimensions with configurable weights.", + ) + + # --- Ground-truth evaluation --- + eval_dataset_path: str | None = Field( + default=None, + min_length=1, + description="Path to a CSV/JSON dataset with 'input' and 'expected_output' columns.", + ) + eval_metric: str = Field( + default="bleu", + description="Similarity metric for ground-truth eval: exact | bleu | rouge_l | cosine | llm_judge.", + ) + + @field_validator("log_format") + @classmethod + def _validate_log_format(cls, v: str) -> str: + allowed = {"text", "json"} + if v not in allowed: + raise ValueError(f"log_format must be one of {sorted(allowed)}, got '{v}'") + return v + @field_validator("error_strategy") @classmethod def _validate_error_strategy(cls, v: str) -> str: @@ -92,6 +187,15 @@ class OptimizationConfig(BaseModel): ) return v + @field_validator("eval_metric") + @classmethod + def _validate_eval_metric(cls, v: str) -> str: + if v not in _EVAL_METRIC_VALUES: + raise ValueError( + f"eval_metric must be one of {sorted(_EVAL_METRIC_VALUES)}, got '{v}'" + ) + return v + @model_validator(mode="before") @classmethod def _migrate_config(cls, data: Any) -> Any: @@ -118,3 +222,7 @@ class OptimizationResult: final_score: float improvement: float history: list[dict[str, Any]] = field(default_factory=list) + # Hold-out validation metrics (populated when validation_split > 0) + final_validation_score: float | None = None + best_validation_score: float | None = None + early_stopped: bool = False diff --git a/src/prometheus/cli/commands/optimize.py b/src/prometheus/cli/commands/optimize.py new file mode 100644 index 0000000..8c39a0e --- /dev/null +++ b/src/prometheus/cli/commands/optimize.py @@ -0,0 +1,428 @@ +"""prometheus optimize — run prompt optimization.""" +from __future__ import annotations + +import asyncio +import logging +import os +from dataclasses import asdict + +import dspy +import typer +from pydantic import ValidationError +from rich.console import Console +from rich.panel import Panel +from rich.table import Table + +from prometheus.application.bootstrap import SyntheticBootstrap +from prometheus.application.dto import JudgeDimension, OptimizationConfig, OptimizationResult +from prometheus.domain.entities import EvalResult, Prompt +from prometheus.application.evaluator import PromptEvaluator +from prometheus.application.ground_truth_evaluator import GroundTruthEvaluator +from prometheus.application.use_cases import OptimizePromptUseCase +from prometheus.cli.logging_setup import configure_logging +from prometheus.infrastructure.dataset_loader import FileDatasetLoader +from prometheus.infrastructure.checkpoint import JsonCheckpointPersistence +from prometheus.infrastructure.file_io import YamlPersistence +from prometheus.infrastructure.judge_adapter import DSPyJudgeAdapter +from prometheus.infrastructure.llm_adapter import DSPyLLMAdapter +from prometheus.infrastructure.crossover_adapter import DSPyCrossoverAdapter +from prometheus.infrastructure.mutation_adapter import DSPyMutationAdapter +from prometheus.infrastructure.crossover_adapter import DSPyCrossoverAdapter +from prometheus.infrastructure.mutation_adapter import DSPyMutationAdapter +from prometheus.infrastructure.proposer_adapter import DSPyProposerAdapter +from prometheus.infrastructure.similarity import create_similarity_adapter +from prometheus.infrastructure.synth_adapter import DSPySyntheticAdapter + +console = Console() + + +def register(app: typer.Typer) -> None: + """Register the optimize command on the Typer app.""" + + @app.command() + def optimize( + input: str = typer.Option( + ..., + "-i", + "--input", + help="Path to input YAML config file.", + exists=True, + readable=True, + ), + output: str = typer.Option( + "output.yaml", + "-o", + "--output", + help="Path to output YAML result file.", + ), + verbose: bool = typer.Option( + False, + "-v", + "--verbose", + help="Print detailed progress (INFO level).", + ), + debug: bool = typer.Option( + False, + "--debug", + help="Enable DEBUG-level logging (overrides -v).", + ), + log_format: str = typer.Option( + "text", + "--log-format", + help="Log output format: text | json.", + ), + log_file: str | None = typer.Option( + None, + "--log-file", + help="Optional file path to write logs to.", + ), + max_retries: int = typer.Option( + 3, + "--max-retries", + help="Max retry attempts for transient LLM errors (429, timeout, 5xx).", + ), + error_strategy: str = typer.Option( + "retry", + "--error-strategy", + help="How to handle errors: skip | retry | abort.", + ), + max_concurrency: int = typer.Option( + 5, + "--max-concurrency", + help="Max parallel LLM calls for minibatch execution and judging.", + ), + eval_dataset: str | None = typer.Option( + None, + "--eval-dataset", + help="Path to a CSV/JSON dataset with 'input' and 'expected_output' columns.", + ), + eval_metric: str = typer.Option( + "bleu", + "--eval-metric", + help="Similarity metric for ground-truth eval: exact | bleu | rouge_l | cosine | llm_judge.", + ), + checkpoint_dir: str | None = typer.Option( + None, + "--checkpoint-dir", + help="Directory for checkpoint files. Enables periodic checkpointing.", + ), + checkpoint_interval: int = typer.Option( + 5, + "--checkpoint-interval", + help="Save a checkpoint every N iterations.", + ), + resume: bool = typer.Option( + False, + "--resume", + help="Resume from the latest checkpoint in --checkpoint-dir.", + ), + population_size: int = typer.Option( + 1, + "--population-size", + help="Number of candidates in the evolution population. 1 = single-candidate hill climbing.", + ), + crossover_rate: float = typer.Option( + 0.5, + "--crossover-rate", + help="Probability of applying crossover vs reflective mutation (0.0–1.0). Only used when --population-size > 1.", + ), + mutation_rate: float = typer.Option( + 0.3, + "--mutation-rate", + help="Probability of applying a mutation operator after crossover/proposal (0.0–1.0). Only used when --population-size > 1.", + ), + validation_split: float = typer.Option( + 0.3, + "--validation-split", + help="Fraction of synthetic pool reserved for hold-out validation (0.0–0.9). 0 disables validation.", + ), + early_stop_patience: int = typer.Option( + 5, + "--early-stop-patience", + help="Stop if validation score does not improve for this many consecutive iterations.", + ), + judge_criteria: str | None = typer.Option( + None, + "--judge-criteria", + help="Custom judge rubric or evaluation criteria override (free text).", + ), + ) -> None: + """Optimize a prompt without any reference data. + + Usage: + prometheus optimize -i config.yaml -o result.yaml + prometheus optimize -i config.yaml --eval-dataset data.csv --eval-metric bleu + prometheus optimize -i config.yaml --checkpoint-dir .prometheus/checkpoints --resume + """ + asyncio.run( + _async_optimize( + input, output, verbose, debug, log_format, log_file, + max_retries, error_strategy, max_concurrency, + eval_dataset, eval_metric, + checkpoint_dir, checkpoint_interval, resume, + population_size, crossover_rate, mutation_rate, + validation_split, early_stop_patience, + judge_criteria, + ) + ) + + +async def _async_optimize( + input: str, + output: str, + verbose: bool, + debug: bool, + log_format: str, + log_file: str | None, + max_retries: int, + error_strategy: str, + max_concurrency: int, + eval_dataset: str | None, + eval_metric: str, + checkpoint_dir: str | None, + checkpoint_interval: int, + resume: bool, + population_size: int = 1, + crossover_rate: float = 0.5, + mutation_rate: float = 0.3, + validation_split: float = 0.3, + early_stop_patience: int = 5, + judge_criteria: str | None = None, +) -> None: + # Configure structured logging + if debug: + log_level = logging.DEBUG + elif verbose: + log_level = logging.INFO + else: + log_level = logging.WARNING + configure_logging(level=log_level, log_format=log_format, log_file=log_file) + + console.print( + Panel.fit( + "PROMETHEUS — Prompt Evolution Engine", + subtitle="No reference data required", + ) + ) + + # 1. Load & validate config + persistence = YamlPersistence() + raw_config = persistence.read_config(input) + + # CLI flags override config file values + raw_config.setdefault("max_retries", max_retries) + raw_config.setdefault("error_strategy", error_strategy) + raw_config.setdefault("max_concurrency", max_concurrency) + raw_config["output_path"] = output + raw_config["verbose"] = verbose + raw_config["debug"] = debug + raw_config["log_format"] = log_format + raw_config["log_file"] = log_file + if eval_dataset: + raw_config["eval_dataset_path"] = eval_dataset + raw_config.setdefault("eval_metric", eval_metric) + if checkpoint_dir: + raw_config["checkpoint_dir"] = checkpoint_dir + raw_config.setdefault("checkpoint_interval", checkpoint_interval) + if resume: + raw_config["resume"] = True + raw_config.setdefault("population_size", population_size) + raw_config.setdefault("crossover_rate", crossover_rate) + raw_config.setdefault("mutation_rate", mutation_rate) + raw_config.setdefault("validation_split", validation_split) + raw_config.setdefault("early_stop_patience", early_stop_patience) + if judge_criteria: + raw_config["judge_criteria"] = judge_criteria + + try: + config = OptimizationConfig.model_validate(raw_config) + except ValidationError as exc: + console.print("[bold red]Configuration error:[/bold red]\n") + for err in exc.errors(): + loc = " → ".join(str(l) for l in err["loc"]) + console.print(f" [red]• {loc}: {err['msg']}[/red]") + raise typer.Exit(code=1) from exc + console.print(f"[dim]Task: {config.task_description[:80]}...[/dim]") + console.print(f"[dim]Seed prompt: {config.seed_prompt[:80]}...[/dim]") + + # 2. Create per-model DSPy LM instances + def _model_lm_kwargs( + model_api_base: str | None, + model_api_key_env: str | None, + ) -> dict: + """Build kwargs for dspy.LM, using per-model overrides with global fallback.""" + kwargs: dict = {} + api_base = model_api_base or config.api_base + api_key_env = model_api_key_env or config.api_key_env + if api_base: + kwargs["api_base"] = api_base + if api_key_env: + kwargs["api_key"] = os.environ.get(api_key_env, "") + return kwargs + + task_lm = dspy.LM( + config.task_model, + **_model_lm_kwargs(config.task_api_base, config.task_api_key_env), + ) + judge_lm = dspy.LM( + config.judge_model, + **_model_lm_kwargs(config.judge_api_base, config.judge_api_key_env), + ) + proposer_lm = dspy.LM( + config.proposer_model, + **_model_lm_kwargs(config.proposer_api_base, config.proposer_api_key_env), + ) + synth_lm = dspy.LM( + config.synth_model, + **_model_lm_kwargs(config.synth_api_base, config.synth_api_key_env), + ) + + # 3. Build adapters (Dependency Injection — each gets its own LM + retry config) + synth_adapter = DSPySyntheticAdapter(lm=synth_lm) + llm_adapter = DSPyLLMAdapter( + lm=task_lm, + max_retries=config.max_retries, + retry_delay_base=config.retry_delay_base, + ) + judge_adapter = DSPyJudgeAdapter( + lm=judge_lm, + max_retries=config.max_retries, + retry_delay_base=config.retry_delay_base, + max_concurrency=config.max_concurrency, + judge_criteria=config.judge_criteria, + judge_dimensions=config.judge_dimensions, + ) + proposer_adapter = DSPyProposerAdapter( + lm=proposer_lm, + max_retries=config.max_retries, + retry_delay_base=config.retry_delay_base, + ) + bootstrap = SyntheticBootstrap(generator=synth_adapter, seed=config.seed) + evaluator = PromptEvaluator( + executor=llm_adapter, + judge=judge_adapter, + max_concurrency=config.max_concurrency, + ) + # Build checkpoint port if checkpoint_dir is configured + checkpoint_port = None + if config.checkpoint_dir: + checkpoint_port = JsonCheckpointPersistence(checkpoint_dir=config.checkpoint_dir) + + # Build crossover/mutation adapters for population-based evolution + crossover_adapter = None + mutation_adapter = None + if config.population_size > 1: + # Reuse proposer LM for crossover and mutation (same model, same role) + crossover_adapter = DSPyCrossoverAdapter( + lm=proposer_lm, + max_retries=config.max_retries, + retry_delay_base=config.retry_delay_base, + ) + mutation_adapter = DSPyMutationAdapter( + lm=proposer_lm, + max_retries=config.max_retries, + retry_delay_base=config.retry_delay_base, + ) + + use_case = OptimizePromptUseCase( + evaluator=evaluator, + proposer=proposer_adapter, + bootstrap=bootstrap, + checkpoint_port=checkpoint_port, + crossover_port=crossover_adapter, + mutation_port=mutation_adapter, + ) + + # 4. Execute + with console.status("[bold green]Evolving prompt..."): + result = await use_case.execute(config) + + # 5. Display results + _display_result(result) + + # 6. Optional ground-truth evaluation on the optimized prompt + if config.eval_dataset_path: + dataset = FileDatasetLoader().load(config.eval_dataset_path) + if config.eval_metric == "llm_judge": + # llm_judge reuses the existing PromptEvaluator with the LLM judge + from prometheus.domain.entities import SyntheticExample + synth_dataset = [ + SyntheticExample(input_text=ex.input_text, id=ex.id) for ex in dataset + ] + gt_eval = PromptEvaluator( + executor=llm_adapter, + judge=judge_adapter, + max_concurrency=config.max_concurrency, + ) + with console.status("[bold green]Running ground-truth evaluation (llm_judge)..."): + gt_result = await gt_eval.evaluate( + prompt=Prompt(text=result.optimized_prompt), + minibatch=synth_dataset, + task_description=config.task_description, + ) + else: + gt_evaluator = GroundTruthEvaluator( + executor=llm_adapter, + similarity=create_similarity_adapter(config.eval_metric), + max_concurrency=config.max_concurrency, + ) + with console.status("[bold green]Running ground-truth evaluation..."): + gt_result = await gt_evaluator.evaluate( + prompt=Prompt(text=result.optimized_prompt), + dataset=dataset, + ) + _display_ground_truth(gt_result, config.eval_metric, len(dataset)) + + # 7. Save + _save_result(persistence, output, result) + console.print(f"\n[green]Results saved to {output}[/green]") + + +def _display_result(result: OptimizationResult) -> None: + """Display a Rich summary in the terminal.""" + console.print() + console.print( + Panel( + f"[bold green]Optimized Prompt[/bold green]\n\n{result.optimized_prompt}", + title="Result", + ) + ) + table = Table(title="Metrics") + table.add_column("Metric", style="cyan") + table.add_column("Value", style="bold") + table.add_row("Initial Score", f"{result.initial_score:.2f}") + table.add_row("Final Score", f"{result.final_score:.2f}") + table.add_row("Improvement", f"{result.improvement:+.2f}") + if result.best_validation_score is not None: + table.add_row("Best Validation Score", f"{result.best_validation_score:.4f}") + if result.early_stopped: + table.add_row("Early Stopped", "[yellow]Yes[/yellow]") + table.add_row("Iterations", str(result.iterations_used)) + table.add_row("LLM Calls", str(result.total_llm_calls)) + console.print(table) + + +def _save_result( + persistence: YamlPersistence, + path: str, + result: OptimizationResult, +) -> None: + """Save the result as YAML.""" + persistence.write_result(path, asdict(result)) + + +def _display_ground_truth( + result: EvalResult, metric: str, dataset_size: int +) -> None: + """Display ground-truth evaluation results.""" + console.print() + table = Table(title=f"Ground-Truth Evaluation (metric: {metric})") + table.add_column("Metric", style="cyan") + table.add_column("Value", style="bold") + table.add_row("Dataset Size", str(dataset_size)) + table.add_row("Mean Score", f"{result.mean_score:.4f}") + table.add_row("Total Score", f"{result.total_score:.4f}") + exact_matches = sum(1 for s in result.scores if s >= 0.99) + table.add_row("Exact Matches", f"{exact_matches}/{dataset_size}") + table.add_row("Accuracy", f"{exact_matches / dataset_size:.2%}") + console.print(table) diff --git a/src/prometheus/domain/scoring.py b/src/prometheus/domain/scoring.py index 9a28ec8..26df501 100644 --- a/src/prometheus/domain/scoring.py +++ b/src/prometheus/domain/scoring.py @@ -19,3 +19,26 @@ def should_accept( def normalize_score(raw: float, min_val: float = 0.0, max_val: float = 1.0) -> float: """Clamp a score within [min_val, max_val].""" return max(min_val, min(max_val, raw)) + + +def weighted_aggregate( + scores: dict[str, float], + weights: dict[str, float], +) -> float: + """Compute a weighted average of per-dimension scores. + + Args: + scores: Mapping of dimension name → score (0.0–1.0). + weights: Mapping of dimension name → weight (0.0–1.0). + + Returns: + Weighted average in [0.0, 1.0]. Returns 0.0 if inputs are empty. + """ + if not scores or not weights: + return 0.0 + total_weight = sum(weights.get(name, 0.0) for name in scores) + if total_weight == 0.0: + return sum(scores.values()) / len(scores) + return sum( + scores.get(name, 0.0) * weights.get(name, 0.0) for name in scores + ) / total_weight diff --git a/src/prometheus/infrastructure/dspy_modules.py b/src/prometheus/infrastructure/dspy_modules.py index aa0b880..dfe9be4 100644 --- a/src/prometheus/infrastructure/dspy_modules.py +++ b/src/prometheus/infrastructure/dspy_modules.py @@ -11,8 +11,10 @@ import re import dspy from prometheus.infrastructure.dspy_signatures import ( + CrossoverInstructions, GenerateSyntheticInputs, JudgeOutput, + MutateInstruction, ProposeInstruction, ) @@ -53,19 +55,30 @@ class OutputJudge(dspy.Module): self.judge = dspy.ChainOfThought(JudgeOutput) def forward( - self, task_description: str, input_text: str, output_text: str + self, + task_description: str, + input_text: str, + output_text: str, + judge_criteria: str = "", + dimension_names: str = "", ) -> dspy.Prediction: result = self.judge( task_description=task_description, input_text=input_text, output_text=output_text, + judge_criteria=judge_criteria, + dimension_names=dimension_names, ) try: score = float(result.score) except (ValueError, TypeError): score = 0.5 # neutral fallback score = max(0.0, min(1.0, score)) - return dspy.Prediction(score=score, feedback=result.feedback) + return dspy.Prediction( + score=score, + feedback=result.feedback, + dimension_scores=getattr(result, "dimension_scores", "{}"), + ) class InstructionProposer(dspy.Module): @@ -90,3 +103,45 @@ class InstructionProposer(dspy.Module): failure_examples=failure_examples, ) return dspy.Prediction(new_instruction=result.new_instruction) + + +class InstructionCrossover(dspy.Module): + """Crossover: combines two parent instructions into a child.""" + + def __init__(self) -> None: + super().__init__() + self.crossover = dspy.ChainOfThought(CrossoverInstructions) + + def forward( + self, + parent_a: str, + parent_b: str, + task_description: str, + ) -> dspy.Prediction: + result = self.crossover( + parent_a=parent_a, + parent_b=parent_b, + task_description=task_description, + ) + return dspy.Prediction(child_instruction=result.child_instruction) + + +class InstructionMutator(dspy.Module): + """Mutator: applies a typed mutation to an instruction.""" + + def __init__(self) -> None: + super().__init__() + self.mutate = dspy.ChainOfThought(MutateInstruction) + + def forward( + self, + current_instruction: str, + task_description: str, + mutation_type: str, + ) -> dspy.Prediction: + result = self.mutate( + current_instruction=current_instruction, + task_description=task_description, + mutation_type=mutation_type, + ) + return dspy.Prediction(mutated_instruction=result.mutated_instruction) diff --git a/src/prometheus/infrastructure/dspy_signatures.py b/src/prometheus/infrastructure/dspy_signatures.py index 1d28b7b..17cbb71 100644 --- a/src/prometheus/infrastructure/dspy_signatures.py +++ b/src/prometheus/infrastructure/dspy_signatures.py @@ -44,6 +44,12 @@ class JudgeOutput(dspy.Signature): output_text: str = dspy.InputField( desc="The assistant's response to evaluate." ) + judge_criteria: str = dspy.InputField( + desc="Custom evaluation rubric or criteria. Empty string = use default judging criteria." + ) + dimension_names: str = dspy.InputField( + desc="Comma-separated dimension names for multi-dimensional scoring. Empty string = single overall score." + ) score: float = dspy.OutputField( desc="Quality score from 0.0 (wrong) to 1.0 (perfect)." ) @@ -53,6 +59,12 @@ class JudgeOutput(dspy.Signature): "with the output and how to improve it. Be critical." ), ) + dimension_scores: str = dspy.OutputField( + desc=( + "JSON object mapping dimension names to scores (0.0-1.0). " + 'Empty object {} if no dimensions specified.' + ), + ) class ProposeInstruction(dspy.Signature): @@ -77,3 +89,52 @@ class ProposeInstruction(dspy.Signature): new_instruction: str = dspy.OutputField( desc="An improved version of the instruction." ) + + +class CrossoverInstructions(dspy.Signature): + """Combine two instruction prompts into a single improved instruction. + + Take the strongest elements from each parent — structure, phrasing, + constraints, examples — and merge them into a coherent child instruction + that is strictly better than either parent alone. + """ + + parent_a: str = dspy.InputField( + desc="First parent instruction." + ) + parent_b: str = dspy.InputField( + desc="Second parent instruction." + ) + task_description: str = dspy.InputField( + desc="Description of the task." + ) + child_instruction: str = dspy.OutputField( + desc=( + "A combined instruction that takes the best elements from " + "both parents into a single, coherent instruction." + ), + ) + + +class MutateInstruction(dspy.Signature): + """Apply a specific mutation to an instruction prompt. + + The mutation_type determines the transformation: + - paraphrase: restate the instruction in different words + - constrain: add specificity, constraints, or guard-rails + - generalize: broaden the instruction to cover more cases + - specialize: narrow the instruction for better focus on the task + """ + + current_instruction: str = dspy.InputField( + desc="The instruction to mutate." + ) + task_description: str = dspy.InputField( + desc="Description of the task." + ) + mutation_type: str = dspy.InputField( + desc="Type of mutation: paraphrase, constrain, generalize, or specialize." + ) + mutated_instruction: str = dspy.OutputField( + desc="The mutated instruction, preserving core intent but altered per the mutation type." + ) diff --git a/src/prometheus/infrastructure/judge_adapter.py b/src/prometheus/infrastructure/judge_adapter.py index 5c8e4bd..220ac74 100644 --- a/src/prometheus/infrastructure/judge_adapter.py +++ b/src/prometheus/infrastructure/judge_adapter.py @@ -6,12 +6,15 @@ Implements the JudgePort via the DSPy OutputJudge module. from __future__ import annotations import asyncio +import json import logging -from typing import Self +from typing import Any import dspy +from prometheus.application.dto import JudgeDimension from prometheus.domain.ports import JudgePort +from prometheus.domain.scoring import weighted_aggregate from prometheus.infrastructure.dspy_modules import OutputJudge from prometheus.infrastructure.retry import async_retry_with_backoff @@ -25,6 +28,9 @@ class DSPyJudgeAdapter(JudgePort): instead of crashing the whole batch. Judge calls run in parallel (bounded by *max_concurrency*). + + When *judge_criteria* or *judge_dimensions* are provided, the judge applies + custom rubrics and/or multi-dimensional scoring with weighted aggregation. """ def __init__( @@ -33,12 +39,26 @@ class DSPyJudgeAdapter(JudgePort): max_retries: int = 3, retry_delay_base: float = 1.0, max_concurrency: int = 5, + judge_criteria: str | None = None, + judge_dimensions: list[JudgeDimension] | None = None, ) -> None: self._lm = lm self._judge = OutputJudge() self._max_retries = max_retries self._retry_delay_base = retry_delay_base self._semaphore = asyncio.Semaphore(max_concurrency) + self._judge_criteria = judge_criteria or "" + self._judge_dimensions = judge_dimensions or [] + self._dimension_names = ( + ",".join(d.name for d in self._judge_dimensions) + if self._judge_dimensions + else "" + ) + self._weights: dict[str, float] = ( + {d.name: d.weight for d in self._judge_dimensions} + if self._judge_dimensions + else {} + ) async def judge_batch( self, @@ -74,7 +94,7 @@ class DSPyJudgeAdapter(JudgePort): pred = await asyncio.to_thread( self._sync_judge, task_description, input_text, output_text, ) - return (pred.score, pred.feedback) + return self._aggregate_result(pred) return await async_retry_with_backoff( _call, @@ -88,4 +108,35 @@ class DSPyJudgeAdapter(JudgePort): task_description=task_description, input_text=input_text, output_text=output_text, + judge_criteria=self._judge_criteria, + dimension_names=self._dimension_names, ) + + def _aggregate_result(self, pred: Any) -> tuple[float, str]: + """Compute weighted aggregate score from dimension scores if available.""" + if not self._judge_dimensions: + return (pred.score, pred.feedback) + + # Parse per-dimension scores from LLM output + dim_scores: dict[str, float] = {} + try: + raw = json.loads(pred.dimension_scores) + if isinstance(raw, dict): + for name in self._weights: + val = raw.get(name) + if val is not None: + dim_scores[name] = max(0.0, min(1.0, float(val))) + except (json.JSONDecodeError, ValueError, TypeError): + logger.debug("Failed to parse dimension_scores, falling back to overall score") + + if not dim_scores: + return (pred.score, pred.feedback) + + aggregate = weighted_aggregate(dim_scores, self._weights) + # Enrich feedback with per-dimension breakdown + dim_breakdown = ", ".join( + f"{name}={dim_scores.get(name, 0.0):.2f}" + for name in self._weights + ) + feedback = f"{pred.feedback} [{dim_breakdown}]" + return (aggregate, feedback) diff --git a/tests/unit/test_adapter_config.py b/tests/unit/test_adapter_config.py index c3ead7c..cfc1dbd 100644 --- a/tests/unit/test_adapter_config.py +++ b/tests/unit/test_adapter_config.py @@ -29,7 +29,7 @@ def judge_lm() -> dspy.LM: """Dummy LM for judging (ChainOfThought requires reasoning field).""" return dspy.utils.DummyLM( [ - {"reasoning": "Evaluating output.", "score": "0.8", "feedback": "Good response."}, + {"reasoning": "Evaluating output.", "score": "0.8", "feedback": "Good response.", "dimension_scores": "{}"}, ] ) @@ -99,9 +99,9 @@ class TestDSPyJudgeAdapterOwnLM: @pytest.mark.asyncio async def test_does_not_use_global_lm(self) -> None: judge_lm = dspy.utils.DummyLM( - [{"reasoning": "ok", "score": "0.9", "feedback": "Judge-specific response"}] + [{"reasoning": "ok", "score": "0.9", "feedback": "Judge-specific response", "dimension_scores": "{}"}] ) - global_lm = dspy.utils.DummyLM([{"reasoning": "no", "score": "0.1", "feedback": "Wrong LM!"}]) + global_lm = dspy.utils.DummyLM([{"reasoning": "no", "score": "0.1", "feedback": "Wrong LM!", "dimension_scores": "{}"}]) dspy.configure(lm=global_lm) adapter = DSPyJudgeAdapter(lm=judge_lm) @@ -176,7 +176,7 @@ class TestDSPySyntheticAdapterOwnLM: class TestPerModelOverrides: """Verify that per-model api_base/api_key_env are passed through to dspy.LM.""" - @patch("prometheus.cli.app.dspy.LM") + @patch("prometheus.cli.commands.optimize.dspy.LM") def test_per_model_api_base_override(self, mock_lm_cls: MagicMock) -> None: """Per-model api_base should be used instead of global.""" mock_lm_cls.return_value = MagicMock() diff --git a/tests/unit/test_error_handling.py b/tests/unit/test_error_handling.py index d24c310..748f762 100644 --- a/tests/unit/test_error_handling.py +++ b/tests/unit/test_error_handling.py @@ -124,12 +124,11 @@ class TestCircuitBreaker: circuit_breaker_threshold=3, error_strategy="skip", ) - with patch.object(loop, "_log"): - state = await loop.run( - Prompt("test"), - [SyntheticExample("in", id=0), SyntheticExample("in2", id=1)], - "task", - ) + state = await loop.run( + Prompt("test"), + [SyntheticExample("in", id=0), SyntheticExample("in2", id=1)], + "task", + ) error_events = [h for h in state.history if h.get("event") == "error"] cb_events = [h for h in state.history if h.get("event") == "circuit_breaker"] @@ -165,13 +164,12 @@ class TestCircuitBreaker: circuit_breaker_threshold=3, error_strategy="abort", ) - with patch.object(loop, "_log"): - with pytest.raises(RuntimeError, match="LLM down"): - await loop.run( - Prompt("test"), - [SyntheticExample("in", id=0), SyntheticExample("in2", id=1)], - "task", - ) + with pytest.raises(RuntimeError, match="LLM down"): + await loop.run( + Prompt("test"), + [SyntheticExample("in", id=0), SyntheticExample("in2", id=1)], + "task", + ) @pytest.mark.asyncio async def test_resets_on_success(self): @@ -216,12 +214,11 @@ class TestCircuitBreaker: circuit_breaker_threshold=3, error_strategy="skip", ) - with patch.object(loop, "_log"): - state = await loop.run( - Prompt("test"), - [SyntheticExample("in", id=0), SyntheticExample("in2", id=1)], - "task", - ) + state = await loop.run( + Prompt("test"), + [SyntheticExample("in", id=0), SyntheticExample("in2", id=1)], + "task", + ) # Should NOT have tripped — 2 fails, then success reset the counter cb_events = [h for h in state.history if h.get("event") == "circuit_breaker"] @@ -277,6 +274,10 @@ class TestPerCallIsolation: adapter._max_retries = 1 adapter._retry_delay_base = 0 adapter._semaphore = __import__("asyncio").Semaphore(5) + adapter._judge_criteria = "" + adapter._judge_dimensions = [] + adapter._dimension_names = "" + adapter._weights = {} # Mock _judge to fail on first call, succeed on second call_count = 0