Initial commit: PROMETHEUS v0.1.0 - Prompt optimizer

- Clean architecture (domain/application/infrastructure)
- DSPy-based evolution engine with scoring
- CLI via pyproject.toml entry point
- Unit + integration tests (~300 tests)
- Configs for glm-5.1 and glm-4.5-air models
- Z.AI endpoint integration
This commit is contained in:
2026-03-29 11:44:03 +00:00
commit 837a44970f
49 changed files with 6599 additions and 0 deletions

0
tests/__init__.py Normal file
View File

93
tests/conftest.py Normal file
View File

@@ -0,0 +1,93 @@
"""Shared test fixtures."""
from __future__ import annotations
from unittest.mock import MagicMock
import pytest
from prometheus.domain.entities import (
EvalResult,
Prompt,
SyntheticExample,
Trajectory,
)
@pytest.fixture
def seed_prompt() -> Prompt:
    """Provide the baseline prompt that optimization tests start from."""
    seed_text = "You are a helpful assistant. Answer the question."
    return Prompt(text=seed_text)
@pytest.fixture
def task_description() -> str:
    """Provide the task description string shared by the tests."""
    description = "Answer factual questions accurately and concisely."
    return description
@pytest.fixture
def synthetic_pool() -> list[SyntheticExample]:
    """Provide a pool of 20 synthetic examples with ids 0 through 19."""
    pool: list[SyntheticExample] = []
    for idx in range(20):
        pool.append(SyntheticExample(input_text=f"Test input {idx}", id=idx))
    return pool
@pytest.fixture
def mock_eval_result() -> EvalResult:
    """Provide a five-item EvalResult with scores between 0.2 and 0.6.

    The trajectory at each index carries the score and feedback found at
    the same index of the ``scores`` and ``feedbacks`` lists.
    """
    scores = [0.3, 0.5, 0.4, 0.6, 0.2]
    feedbacks = [
        "Incomplete answer",
        "Missing key detail",
        "Wrong format",
        "Partially correct",
        "Completely off topic",
    ]

    trajectories = []
    for idx, (score, feedback) in enumerate(zip(scores, feedbacks)):
        trajectories.append(
            Trajectory(
                input_text=f"Input {idx}",
                output_text=f"Output {idx}",
                score=score,
                feedback=feedback,
                prompt_used="test prompt",
            )
        )

    return EvalResult(
        scores=scores,
        feedbacks=feedbacks,
        trajectories=trajectories,
    )
@pytest.fixture
def mock_llm_port() -> MagicMock:
    """Provide an LLMPort stand-in whose ``execute`` returns a canned string."""
    # Dotted keys configure attributes of child mocks at construction time.
    return MagicMock(**{"execute.return_value": "This is a mock response."})
@pytest.fixture
def mock_judge_port() -> MagicMock:
    """Provide a JudgePort stand-in returning five identical 0.5 verdicts."""
    verdict = (0.5, "Moderate quality, needs improvement.")
    judge = MagicMock()
    judge.judge_batch.return_value = [verdict for _ in range(5)]
    return judge
@pytest.fixture
def mock_proposer_port() -> MagicMock:
    """Provide a ProposerPort stand-in whose ``propose`` returns a fixed prompt."""
    proposed = Prompt(
        text="You are a very helpful assistant. Answer the question precisely."
    )
    proposer = MagicMock()
    proposer.propose.return_value = proposed
    return proposer

View File

View File

@@ -0,0 +1,29 @@
"""Integration tests for DSPy adapters using DSPy mock LM."""
from __future__ import annotations
import dspy
import pytest
from prometheus.domain.entities import Prompt
from prometheus.infrastructure.llm_adapter import DSPyLLMAdapter
@pytest.fixture
def mock_lm() -> dspy.LM:
    """Build a DummyLM with one canned answer and install it via dspy.configure."""
    canned_answers = [{"output": "Mock output response"}]
    dummy = dspy.utils.DummyLM(canned_answers)
    dspy.configure(lm=dummy)
    return dummy
class TestDSPyLLMAdapter:
    """Integration checks for DSPyLLMAdapter run with the ``mock_lm`` fixture."""

    def test_execute_returns_response(self, mock_lm: dspy.LM) -> None:
        """execute() hands back a non-empty string."""
        adapter = DSPyLLMAdapter(model="openai/gpt-4o-mini")
        response = adapter.execute(
            Prompt(text="Answer the question."),
            "What is 2+2?",
        )

        assert isinstance(response, str)
        assert len(response) > 0

View File

@@ -0,0 +1,74 @@
"""End-to-end pipeline test with mocked LLM calls."""
from __future__ import annotations
from unittest.mock import MagicMock
from prometheus.application.bootstrap import SyntheticBootstrap
from prometheus.application.dto import OptimizationConfig
from prometheus.application.evaluator import PromptEvaluator
from prometheus.application.use_cases import OptimizePromptUseCase
from prometheus.domain.entities import EvalResult, Prompt, SyntheticExample, Trajectory
from prometheus.domain.ports import JudgePort, LLMPort, ProposerPort
def _make_eval(scores: list[float]) -> EvalResult:
    """Build an EvalResult with one "feedback"-labelled trajectory per score."""
    count = len(scores)
    trajectories = []
    for idx, score in enumerate(scores):
        trajectories.append(
            Trajectory(f"in{idx}", f"out{idx}", score, "feedback", "prompt")
        )
    return EvalResult(
        scores=scores,
        feedbacks=["feedback"] * count,
        trajectories=trajectories,
    )
class TestFullPipeline:
    """End-to-end run of OptimizePromptUseCase with every port mocked."""

    def test_pipeline_produces_result(self) -> None:
        """Full pipeline with mocked ports produces an OptimizationResult."""
        llm = MagicMock(spec=LLMPort)
        llm.execute.return_value = "mock response"
        judge = MagicMock(spec=JudgePort)

        # Scripted evaluations: one low seed evaluation, then for each of
        # the 5 iterations a "current" evaluation followed by a
        # higher-scoring "new" evaluation.
        scripted_evals = [_make_eval([0.3] * 5)]
        for _ in range(5):
            scripted_evals += [
                _make_eval([0.4] * 5),
                _make_eval([0.6] * 5),
            ]
        judge.judge_batch.return_value = [(0.5, "ok")] * 5

        proposer = MagicMock(spec=ProposerPort)
        proposer.propose.return_value = Prompt(text="Improved prompt")

        # evaluate() is replaced outright so the scripted results are used.
        evaluator = PromptEvaluator(llm, judge)
        evaluator.evaluate = MagicMock(side_effect=scripted_evals)

        generator = MagicMock()
        generator.generate_inputs.return_value = [
            SyntheticExample(input_text=f"synth input {idx}", id=idx)
            for idx in range(20)
        ]
        bootstrap = SyntheticBootstrap(generator=generator, seed=42)

        use_case = OptimizePromptUseCase(
            evaluator=evaluator,
            proposer=proposer,
            bootstrap=bootstrap,
        )
        settings = OptimizationConfig(
            seed_prompt="Answer questions.",
            task_description="Answer questions accurately.",
            max_iterations=5,
            n_synthetic_inputs=20,
            minibatch_size=5,
            seed=42,
        )

        outcome = use_case.execute(settings)

        assert outcome.initial_prompt == "Answer questions."
        assert outcome.optimized_prompt == "Improved prompt"
        assert outcome.iterations_used == 5
        assert outcome.total_llm_calls > 0
        assert outcome.final_score > outcome.initial_score

0
tests/unit/__init__.py Normal file
View File

View File

@@ -0,0 +1,50 @@
"""Unit tests for the bootstrap module."""
from __future__ import annotations
from unittest.mock import MagicMock
from prometheus.application.bootstrap import SyntheticBootstrap
from prometheus.domain.entities import SyntheticExample
from prometheus.domain.ports import SyntheticGeneratorPort
class TestSyntheticBootstrap:
    """Tests for SyntheticBootstrap.run and SyntheticBootstrap.sample_minibatch."""

    def test_run_returns_shuffled_examples(self) -> None:
        """run() returns all generated examples and queries the generator once."""
        generator = MagicMock(spec=SyntheticGeneratorPort)
        generated = [
            SyntheticExample(input_text=f"input {idx}", id=idx) for idx in range(10)
        ]
        generator.generate_inputs.return_value = generated

        produced = SyntheticBootstrap(generator=generator, seed=42).run(
            "task desc", 10
        )

        assert len(produced) == 10
        generator.generate_inputs.assert_called_once_with("task desc", 10)

    def test_sample_minibatch_returns_correct_size(self) -> None:
        """A minibatch of 5 drawn from 20 has 5 items, all taken from the pool."""
        generator = MagicMock(spec=SyntheticGeneratorPort)
        pool = [
            SyntheticExample(input_text=f"input {idx}", id=idx) for idx in range(20)
        ]
        sampler = SyntheticBootstrap(generator=generator, seed=42)

        minibatch = sampler.sample_minibatch(pool, 5)

        assert len(minibatch) == 5
        # All items should be from the pool
        for member in minibatch:
            assert member in pool

    def test_sample_minibatch_capped_at_pool_size(self) -> None:
        """Requesting more items than the pool holds yields the pool size."""
        generator = MagicMock(spec=SyntheticGeneratorPort)
        pool = [
            SyntheticExample(input_text=f"input {idx}", id=idx) for idx in range(3)
        ]
        sampler = SyntheticBootstrap(generator=generator, seed=42)

        minibatch = sampler.sample_minibatch(pool, 10)

        assert len(minibatch) == 3

    def test_deterministic_with_same_seed(self) -> None:
        """Two bootstraps built with the same seed draw the same minibatch."""
        generator = MagicMock(spec=SyntheticGeneratorPort)
        pool = [
            SyntheticExample(input_text=f"input {idx}", id=idx) for idx in range(20)
        ]
        first = SyntheticBootstrap(generator=generator, seed=42)
        second = SyntheticBootstrap(generator=generator, seed=42)

        first_batch = first.sample_minibatch(pool, 5)
        second_batch = second.sample_minibatch(pool, 5)

        assert first_batch == second_batch

View File

@@ -0,0 +1,198 @@
"""Unit tests for DSPy module parsing logic."""
from __future__ import annotations
import json
from unittest.mock import MagicMock, patch
import dspy
import pytest
from prometheus.infrastructure.dspy_modules import (
InstructionProposer,
OutputJudge,
SyntheticInputGenerator,
)
class TestSyntheticInputGeneratorParseFallback:
    """Tests for _parse_fallback, the recovery path for non-JSON model output."""

    def test_extracts_quoted_strings(self) -> None:
        """Every double-quoted span is returned, in order."""
        raw = 'Here are some: "first example" and "second example" done.'
        expected = ["first example", "second example"]
        assert SyntheticInputGenerator._parse_fallback(raw) == expected

    def test_single_quoted_string(self) -> None:
        """One double-quoted span yields a single-element list."""
        raw = 'Just one: "hello world"'
        assert SyntheticInputGenerator._parse_fallback(raw) == ["hello world"]

    def test_no_quotes_returns_raw_text(self) -> None:
        """Text without quotes comes back whole as the only element."""
        raw = "no quotes at all here"
        assert SyntheticInputGenerator._parse_fallback(raw) == [
            "no quotes at all here"
        ]

    def test_empty_string_returns_itself(self) -> None:
        """An empty input yields a list holding the empty string."""
        parsed = SyntheticInputGenerator._parse_fallback("")
        assert parsed == [""]

    def test_mixed_json_with_extra_text(self) -> None:
        """Quoted items are extracted even when surrounded by other text."""
        raw = 'Results: "alpha", "beta", "gamma" — take your pick.'
        expected = ["alpha", "beta", "gamma"]
        assert SyntheticInputGenerator._parse_fallback(raw) == expected
class TestOutputJudgeForward:
    """Tests for how OutputJudge.forward parses and clamps the score.

    Each test swaps the judge's inner ``judge`` attribute for a MagicMock
    returning a hand-built prediction, so only forward()'s handling of
    that prediction is exercised.
    """

    @pytest.fixture
    def judge(self) -> OutputJudge:
        """Provide a fresh OutputJudge for each test."""
        return OutputJudge()

    def test_valid_numeric_score(self, judge: OutputJudge) -> None:
        """A numeric score string is converted and feedback is passed through."""
        raw = dspy.Prediction(score="0.8", feedback="Good output.")
        judge.judge = MagicMock(return_value=raw)

        verdict = judge.forward("task", "input", "output")

        assert verdict.score == 0.8
        assert verdict.feedback == "Good output."

    def test_non_numeric_score_falls_back_to_half(
        self, judge: OutputJudge
    ) -> None:
        """A score that is not a number yields 0.5."""
        raw = dspy.Prediction(score="not-a-number", feedback="N/A")
        judge.judge = MagicMock(return_value=raw)

        verdict = judge.forward("task", "input", "output")

        assert verdict.score == 0.5

    def test_score_clamped_to_upper_bound(self, judge: OutputJudge) -> None:
        """A score above 1 is reported as 1.0."""
        raw = dspy.Prediction(score="1.5", feedback="Great!")
        judge.judge = MagicMock(return_value=raw)

        verdict = judge.forward("task", "input", "output")

        assert verdict.score == 1.0

    def test_score_clamped_to_lower_bound(self, judge: OutputJudge) -> None:
        """A negative score is reported as 0.0."""
        raw = dspy.Prediction(score="-0.3", feedback="Terrible.")
        judge.judge = MagicMock(return_value=raw)

        verdict = judge.forward("task", "input", "output")

        assert verdict.score == 0.0

    def test_empty_score_string_falls_back(self, judge: OutputJudge) -> None:
        """An empty score string yields 0.5."""
        raw = dspy.Prediction(score="", feedback="No score.")
        judge.judge = MagicMock(return_value=raw)

        verdict = judge.forward("task", "input", "output")

        assert verdict.score == 0.5

    def test_boundary_score_one(self, judge: OutputJudge) -> None:
        """A score of exactly 1.0 is kept."""
        raw = dspy.Prediction(score="1.0", feedback="Perfect.")
        judge.judge = MagicMock(return_value=raw)

        verdict = judge.forward("task", "input", "output")

        assert verdict.score == 1.0

    def test_boundary_score_zero(self, judge: OutputJudge) -> None:
        """A score of exactly 0.0 is kept."""
        raw = dspy.Prediction(score="0.0", feedback="Wrong.")
        judge.judge = MagicMock(return_value=raw)

        verdict = judge.forward("task", "input", "output")

        assert verdict.score == 0.0

    def test_none_score_falls_back(self, judge: OutputJudge) -> None:
        """A missing (None) score yields 0.5."""
        raw = dspy.Prediction(score=None, feedback="Missing.")
        judge.judge = MagicMock(return_value=raw)

        verdict = judge.forward("task", "input", "output")

        assert verdict.score == 0.5
class TestSyntheticInputGeneratorForward:
    """Tests for how SyntheticInputGenerator.forward parses ``examples``.

    Each test replaces the generator's inner ``generate`` attribute with a
    MagicMock returning a hand-built prediction.
    """

    @pytest.fixture
    def generator(self) -> SyntheticInputGenerator:
        """Provide a fresh SyntheticInputGenerator for each test."""
        return SyntheticInputGenerator()

    def test_valid_json_parsed_correctly(
        self, generator: SyntheticInputGenerator
    ) -> None:
        """A JSON array of strings is decoded into a Python list."""
        raw = dspy.Prediction(examples=json.dumps(["q1", "q2", "q3"]))
        generator.generate = MagicMock(return_value=raw)

        outcome = generator.forward("task desc", 3)

        assert outcome.examples == ["q1", "q2", "q3"]

    def test_malformed_json_triggers_fallback(
        self, generator: SyntheticInputGenerator
    ) -> None:
        """Non-JSON text yields the double-quoted spans it contains."""
        raw = dspy.Prediction(examples='Here: "fallback item" and "another one"')
        generator.generate = MagicMock(return_value=raw)

        outcome = generator.forward("task desc", 2)

        assert outcome.examples == ["fallback item", "another one"]

    def test_empty_json_array(self, generator: SyntheticInputGenerator) -> None:
        """An empty JSON array yields an empty list."""
        raw = dspy.Prediction(examples="[]")
        generator.generate = MagicMock(return_value=raw)

        outcome = generator.forward("task desc", 0)

        assert outcome.examples == []
class TestInstructionProposerForward:
    """Tests for InstructionProposer.forward."""

    @pytest.fixture
    def proposer(self) -> InstructionProposer:
        """Provide a fresh InstructionProposer for each test."""
        return InstructionProposer()

    def test_returns_new_instruction(self, proposer: InstructionProposer) -> None:
        """forward() exposes the instruction produced by the inner module."""
        raw = dspy.Prediction(new_instruction="Be concise and accurate.")
        proposer.propose = MagicMock(return_value=raw)

        outcome = proposer.forward(
            "Be helpful.", "Answer questions.", "Failed: too verbose"
        )

        assert outcome.new_instruction == "Be concise and accurate."

    def test_passes_correct_arguments(
        self, proposer: InstructionProposer
    ) -> None:
        """Positional arguments to forward() reach the inner module as keywords."""
        inner = MagicMock(return_value=dspy.Prediction(new_instruction="improved"))
        proposer.propose = inner

        proposer.forward("current", "task desc", "failures")

        inner.assert_called_once_with(
            current_instruction="current",
            task_description="task desc",
            failure_examples="failures",
        )

View File

@@ -0,0 +1,99 @@
"""Unit tests for domain entities."""
from __future__ import annotations
from prometheus.domain.entities import (
Candidate,
EvalResult,
OptimizationState,
Prompt,
SyntheticExample,
Trajectory,
)
class TestPrompt:
    """Tests for the Prompt entity."""

    def test_prompt_text(self) -> None:
        """The text passed at construction is readable back."""
        prompt = Prompt(text="Hello")
        assert prompt.text == "Hello"

    def test_prompt_len(self) -> None:
        """len() of a prompt with text "Hello" is 5."""
        assert len(Prompt(text="Hello")) == 5

    def test_prompt_frozen(self) -> None:
        """Assigning to ``text`` raises AttributeError."""
        prompt = Prompt(text="Hello")
        try:
            prompt.text = "World"  # type: ignore[misc]
        except AttributeError:
            pass
        else:
            raise AssertionError("Should have raised FrozenInstanceError")

    def test_prompt_default_metadata(self) -> None:
        """Metadata defaults to an empty dict."""
        prompt = Prompt(text="Hello")
        assert prompt.metadata == {}

    def test_prompt_custom_metadata(self) -> None:
        """Metadata supplied at construction is kept."""
        prompt = Prompt(text="Hello", metadata={"key": "value"})
        assert prompt.metadata["key"] == "value"
class TestSyntheticExample:
    """Tests for SyntheticExample default field values."""

    def test_default_category(self) -> None:
        """The category defaults to "default"."""
        example = SyntheticExample(input_text="test")
        assert example.category == "default"

    def test_default_id(self) -> None:
        """The id defaults to 0."""
        example = SyntheticExample(input_text="test")
        assert example.id == 0
class TestEvalResult:
    """Tests for the aggregate score properties of EvalResult."""

    def test_total_score(self) -> None:
        """total_score is the sum of the individual scores."""
        result = EvalResult(
            scores=[0.3, 0.5, 0.4],
            feedbacks=["a", "b", "c"],
            trajectories=[],
        )
        # Plain left-to-right float addition of 0.3 + 0.5 + 0.4 gives
        # 1.2000000000000002, so compare with a tolerance (as
        # test_mean_score does) rather than relying on how the sum is
        # accumulated.
        assert abs(result.total_score - 1.2) < 1e-9

    def test_mean_score(self) -> None:
        """mean_score is the arithmetic mean of the individual scores."""
        result = EvalResult(
            scores=[0.3, 0.5, 0.4],
            feedbacks=["a", "b", "c"],
            trajectories=[],
        )
        assert abs(result.mean_score - 0.4) < 1e-9

    def test_mean_score_empty(self) -> None:
        """mean_score of an empty result is 0.0."""
        result = EvalResult(scores=[], feedbacks=[], trajectories=[])
        assert result.mean_score == 0.0
class TestTrajectory:
    """Tests for the Trajectory entity."""

    def test_trajectory_fields(self) -> None:
        """Keyword arguments given at construction are stored on the instance."""
        fields = {
            "input_text": "in",
            "output_text": "out",
            "score": 0.8,
            "feedback": "good",
            "prompt_used": "test",
        }
        trajectory = Trajectory(**fields)

        assert trajectory.input_text == "in"
        assert trajectory.score == 0.8
class TestCandidate:
    """Tests for Candidate default field values."""

    def test_candidate_defaults(self) -> None:
        """A candidate built from only a prompt has zero score, generation 0, no parent."""
        candidate = Candidate(prompt=Prompt(text="test"))

        assert candidate.best_score == 0.0
        assert candidate.generation == 0
        assert candidate.parent_id is None
class TestOptimizationState:
    """Tests for OptimizationState default field values."""

    def test_default_state(self) -> None:
        """A default state has zeroed counters, no candidates and no best candidate."""
        fresh = OptimizationState()

        assert fresh.iteration == 0
        assert fresh.best_candidate is None
        assert fresh.candidates == []
        assert fresh.total_llm_calls == 0

View File

@@ -0,0 +1,121 @@
"""Unit tests for PromptEvaluator.evaluate()."""
from __future__ import annotations
from unittest.mock import MagicMock
import pytest
from prometheus.application.evaluator import PromptEvaluator
from prometheus.domain.entities import EvalResult, Prompt, SyntheticExample, Trajectory
from prometheus.domain.ports import JudgePort, LLMPort
class TestPromptEvaluatorEvaluate:
    """Tests for PromptEvaluator.evaluate() with mocked executor and judge."""

    @pytest.fixture
    def executor(self) -> MagicMock:
        """Provide an LLMPort-spec'd mock used as the executor."""
        return MagicMock(spec=LLMPort)

    @pytest.fixture
    def judge(self) -> MagicMock:
        """Provide a JudgePort-spec'd mock used as the judge."""
        return MagicMock(spec=JudgePort)

    @pytest.fixture
    def evaluator(self, executor: MagicMock, judge: MagicMock) -> PromptEvaluator:
        """Provide a PromptEvaluator wired to the mocked executor and judge."""
        return PromptEvaluator(executor=executor, judge=judge)

    def test_happy_path_builds_correct_trajectories(
        self,
        evaluator: PromptEvaluator,
        executor: MagicMock,
        judge: MagicMock,
    ) -> None:
        """Scores, feedbacks and trajectories reflect executor and judge output."""
        prompt = Prompt(text="Answer the question.")
        minibatch = [
            SyntheticExample(input_text="What is 2+2?", id=0),
            SyntheticExample(input_text="Capital of France?", id=1),
        ]
        executor.execute.side_effect = ["4", "Paris"]
        judge.judge_batch.return_value = [
            (0.9, "Correct."),
            (0.8, "Mostly correct."),
        ]

        outcome = evaluator.evaluate(prompt, minibatch, "math and geography")

        assert isinstance(outcome, EvalResult)
        assert outcome.scores == [0.9, 0.8]
        assert outcome.feedbacks == ["Correct.", "Mostly correct."]
        assert len(outcome.trajectories) == 2

        first, second = outcome.trajectories[0], outcome.trajectories[1]
        assert first.input_text == "What is 2+2?"
        assert first.output_text == "4"
        assert first.score == 0.9
        assert first.feedback == "Correct."
        assert first.prompt_used == "Answer the question."
        assert second.prompt_used == "Answer the question."

    def test_empty_minibatch_returns_empty_result(
        self,
        evaluator: PromptEvaluator,
        executor: MagicMock,
        judge: MagicMock,
    ) -> None:
        """An empty minibatch yields empty lists and never calls the executor."""
        outcome = evaluator.evaluate(Prompt(text="test"), [], "task")

        assert outcome.scores == []
        assert outcome.feedbacks == []
        assert outcome.trajectories == []
        executor.execute.assert_not_called()
        # judge_batch is called with empty pairs list
        judge.judge_batch.assert_called_once_with("task", [])

    def test_executor_called_with_correct_prompt(
        self,
        evaluator: PromptEvaluator,
        executor: MagicMock,
        judge: MagicMock,
    ) -> None:
        """The executor receives the Prompt object and the example's input text."""
        prompt = Prompt(text="Summarize this.")
        minibatch = [SyntheticExample(input_text="Long text here", id=0)]
        executor.execute.return_value = "Summary."
        judge.judge_batch.return_value = [(0.7, "Good summary.")]

        evaluator.evaluate(prompt, minibatch, "summarization")

        executor.execute.assert_called_once_with(prompt, "Long text here")

    def test_trajectories_prompt_used_matches_input_prompt(
        self,
        evaluator: PromptEvaluator,
        executor: MagicMock,
        judge: MagicMock,
    ) -> None:
        """Each trajectory records the text of the prompt that was evaluated."""
        prompt = Prompt(text="Translate to French.")
        minibatch = [SyntheticExample(input_text="Hello", id=0)]
        executor.execute.return_value = "Bonjour"
        judge.judge_batch.return_value = [(1.0, "Perfect.")]

        outcome = evaluator.evaluate(prompt, minibatch, "translation")

        assert outcome.trajectories[0].prompt_used == "Translate to French."

    def test_scores_feedbacks_trajectories_lists_sized_correctly(
        self,
        evaluator: PromptEvaluator,
        executor: MagicMock,
        judge: MagicMock,
    ) -> None:
        """All three result lists have one entry per example."""
        prompt = Prompt(text="test prompt")
        minibatch = [
            SyntheticExample(input_text=f"q{idx}", id=idx) for idx in range(4)
        ]
        executor.execute.side_effect = [f"a{idx}" for idx in range(4)]
        judge.judge_batch.return_value = [
            (0.1 * idx, f"fb{idx}") for idx in range(4)
        ]

        outcome = evaluator.evaluate(prompt, minibatch, "task")

        for produced in (outcome.scores, outcome.feedbacks, outcome.trajectories):
            assert len(produced) == 4

View File

@@ -0,0 +1,147 @@
"""Unit tests for the evolution loop — with full mocking."""
from __future__ import annotations
from unittest.mock import MagicMock, patch
from prometheus.application.bootstrap import SyntheticBootstrap
from prometheus.application.evaluator import PromptEvaluator
from prometheus.application.evolution import EvolutionLoop
from prometheus.domain.entities import EvalResult, Prompt, SyntheticExample, Trajectory
class TestEvolutionLoop:
    """Tests for EvolutionLoop.run with evaluator, proposer and bootstrap mocked.

    ``evaluator.evaluate`` is replaced by a MagicMock in every test, so the
    results the loop sees are exactly the scripted EvalResult objects.
    """

    @staticmethod
    def _scored_eval(scores: list[float], feedback: str) -> EvalResult:
        """Build an EvalResult with one trajectory per score, all sharing *feedback*."""
        return EvalResult(
            scores=list(scores),
            feedbacks=[feedback] * len(scores),
            trajectories=[
                Trajectory(f"input{i}", f"output{i}", s, feedback, "prompt")
                for i, s in enumerate(scores)
            ],
        )

    @staticmethod
    def _fixed_bootstrap(pool: list[SyntheticExample]) -> MagicMock:
        """Build a bootstrap mock that always samples the first five pool items."""
        bootstrap = MagicMock(spec=SyntheticBootstrap)
        bootstrap.sample_minibatch.return_value = pool[:5]
        return bootstrap

    def test_accepts_improvement(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: MagicMock,
        mock_judge_port: MagicMock,
        mock_proposer_port: MagicMock,
    ) -> None:
        """When the new prompt improves the score, the best candidate is updated."""
        baseline_scores = [0.3, 0.4, 0.3, 0.5, 0.2]
        initial_eval = self._scored_eval(baseline_scores, "bad")
        old_eval = self._scored_eval(baseline_scores, "bad")
        new_eval = EvalResult(
            scores=[0.8, 0.9, 0.7, 0.8, 0.9],
            feedbacks=["good"] * 5,
            trajectories=[],
        )

        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
        evaluator.evaluate = MagicMock(side_effect=[initial_eval, old_eval, new_eval])

        loop = EvolutionLoop(
            evaluator=evaluator,
            proposer=mock_proposer_port,
            bootstrap=self._fixed_bootstrap(synthetic_pool),
            max_iterations=1,
            minibatch_size=5,
        )

        with patch.object(loop, "_log"):
            state = loop.run(seed_prompt, synthetic_pool, task_description)

        assert state.best_candidate is not None
        assert state.best_candidate.best_score > 0
        # The score check alone also holds for the seed candidate, so it
        # cannot tell acceptance from rejection; pin the accepted prompt.
        proposed = mock_proposer_port.propose.return_value
        assert state.best_candidate.prompt.text == proposed.text

    def test_rejects_regression(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: MagicMock,
        mock_judge_port: MagicMock,
        mock_proposer_port: MagicMock,
    ) -> None:
        """When the new prompt degrades the score, the best candidate stays unchanged."""
        baseline_scores = [0.7, 0.8, 0.7, 0.8, 0.9]
        initial_eval = self._scored_eval(baseline_scores, "ok")
        old_eval = self._scored_eval(baseline_scores, "ok")
        new_eval = EvalResult(
            scores=[0.2, 0.1, 0.3, 0.2, 0.1],
            feedbacks=["bad"] * 5,
            trajectories=[],
        )

        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
        evaluator.evaluate = MagicMock(side_effect=[initial_eval, old_eval, new_eval])

        loop = EvolutionLoop(
            evaluator=evaluator,
            proposer=mock_proposer_port,
            bootstrap=self._fixed_bootstrap(synthetic_pool),
            max_iterations=1,
            minibatch_size=5,
        )

        with patch.object(loop, "_log"):
            state = loop.run(seed_prompt, synthetic_pool, task_description)

        assert state.best_candidate is not None
        assert state.best_candidate.prompt.text == seed_prompt.text

    def test_skips_perfect_scores(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: MagicMock,
        mock_judge_port: MagicMock,
        mock_proposer_port: MagicMock,
    ) -> None:
        """When all scores are perfect, no proposition is made."""
        perfect_eval = self._scored_eval([1.0] * 5, "perfect")

        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
        # return_value (not side_effect): every evaluation is perfect.
        evaluator.evaluate = MagicMock(return_value=perfect_eval)

        loop = EvolutionLoop(
            evaluator=evaluator,
            proposer=mock_proposer_port,
            bootstrap=self._fixed_bootstrap(synthetic_pool),
            max_iterations=3,
            minibatch_size=5,
        )

        with patch.object(loop, "_log"):
            loop.run(seed_prompt, synthetic_pool, task_description)

        mock_proposer_port.propose.assert_not_called()

View File

@@ -0,0 +1,99 @@
"""Unit tests for YamlPersistence file I/O."""
from __future__ import annotations
from pathlib import Path
import pytest
import yaml
from prometheus.infrastructure.file_io import YamlPersistence
class TestYamlPersistenceReadConfig:
    """Tests for read_config YAML loading."""

    def test_roundtrip_write_and_read(self, tmp_path: Path) -> None:
        """A flat mapping dumped to disk is read back unchanged."""
        persistence = YamlPersistence()
        data = {
            "seed_prompt": "You are helpful.",
            "task_description": "Answer questions.",
            "max_iterations": 30,
            "verbose": True,
        }
        config_file = tmp_path / "config.yaml"
        # Explicit UTF-8 keeps the fixture file independent of the locale's
        # default encoding, matching the other tests in this module.
        config_file.write_text(yaml.dump(data), encoding="utf-8")

        result = persistence.read_config(str(config_file))

        assert result == data

    def test_reads_nested_yaml(self, tmp_path: Path) -> None:
        """Nested mappings and lists are preserved when read."""
        persistence = YamlPersistence()
        data = {
            "model": {"name": "gpt-4o", "temperature": 0.7},
            "params": [1, 2, 3],
        }
        config_file = tmp_path / "nested.yaml"
        config_file.write_text(yaml.dump(data), encoding="utf-8")

        result = persistence.read_config(str(config_file))

        assert result["model"]["name"] == "gpt-4o"
        assert result["params"] == [1, 2, 3]

    def test_missing_file_raises_error(self, tmp_path: Path) -> None:
        """Reading a path that does not exist raises FileNotFoundError."""
        persistence = YamlPersistence()
        missing = tmp_path / "nonexistent.yaml"

        with pytest.raises(FileNotFoundError):
            persistence.read_config(str(missing))

    def test_malformed_yaml_raises_error(self, tmp_path: Path) -> None:
        """Reading syntactically invalid YAML raises yaml.YAMLError."""
        persistence = YamlPersistence()
        bad_file = tmp_path / "bad.yaml"
        bad_file.write_text(": [invalid: {yaml", encoding="utf-8")

        with pytest.raises(yaml.YAMLError):
            persistence.read_config(str(bad_file))
class TestYamlPersistenceWriteResult:
    """Tests for write_result YAML output."""

    def test_roundtrip_write_result(self, tmp_path: Path) -> None:
        """A mapping written by write_result loads back unchanged."""
        persistence = YamlPersistence()
        data = {
            "optimized_prompt": "Improved prompt.",
            "initial_score": 0.4,
            "final_score": 0.85,
        }
        output_file = tmp_path / "result.yaml"

        persistence.write_result(str(output_file), data)

        # Read back with an explicit encoding, as the unicode test does,
        # instead of the locale default.
        loaded = yaml.safe_load(output_file.read_text(encoding="utf-8"))
        assert loaded == data

    def test_write_result_creates_valid_yaml(self, tmp_path: Path) -> None:
        """The written file contains plain ``key: value`` lines."""
        persistence = YamlPersistence()
        data = {"key": "value", "number": 42}
        output_file = tmp_path / "out.yaml"

        persistence.write_result(str(output_file), data)

        content = output_file.read_text(encoding="utf-8")
        assert "key: value" in content
        assert "number: 42" in content

    def test_write_result_handles_unicode(self, tmp_path: Path) -> None:
        """Non-ASCII text survives a write followed by a UTF-8 read."""
        persistence = YamlPersistence()
        data = {"prompt": "Répondez en français. 中文测试"}
        output_file = tmp_path / "unicode.yaml"

        persistence.write_result(str(output_file), data)

        loaded = yaml.safe_load(output_file.read_text(encoding="utf-8"))
        assert loaded["prompt"] == "Répondez en français. 中文测试"

View File

@@ -0,0 +1,54 @@
"""Unit tests for scoring logic."""
from __future__ import annotations
from prometheus.domain.entities import EvalResult, Trajectory
from prometheus.domain.scoring import normalize_score, should_accept
def _make_eval(scores: list[float]) -> EvalResult:
    """Build an EvalResult with empty feedback and one trajectory per score."""
    count = len(scores)
    trajectories = []
    for idx, score in enumerate(scores):
        trajectories.append(Trajectory(f"in{idx}", f"out{idx}", score, "", "p"))
    return EvalResult(
        scores=scores,
        feedbacks=[""] * count,
        trajectories=trajectories,
    )
class TestShouldAccept:
    """Tests for the should_accept acceptance rule."""

    def test_accepts_improvement(self) -> None:
        """Higher new scores are accepted."""
        before, after = _make_eval([0.3, 0.4]), _make_eval([0.8, 0.9])
        assert should_accept(before, after) is True

    def test_rejects_regression(self) -> None:
        """Lower new scores are rejected."""
        before, after = _make_eval([0.8, 0.9]), _make_eval([0.3, 0.4])
        assert should_accept(before, after) is False

    def test_rejects_equal(self) -> None:
        """Identical scores are rejected."""
        before, after = _make_eval([0.5, 0.5]), _make_eval([0.5, 0.5])
        assert should_accept(before, after) is False

    def test_min_improvement_threshold(self) -> None:
        """A 0.5 -> 0.6 change is rejected at threshold 0.2, accepted at 0.05."""
        before = _make_eval([0.5])
        after = _make_eval([0.6])

        rejected = should_accept(before, after, min_improvement=0.2)
        assert rejected is False
        accepted = should_accept(before, after, min_improvement=0.05)
        assert accepted is True
class TestNormalizeScore:
    """Tests for normalize_score clamping."""

    def test_clamps_high(self) -> None:
        """A value above the default upper bound becomes 1.0."""
        clamped = normalize_score(1.5)
        assert clamped == 1.0

    def test_clamps_low(self) -> None:
        """A value below the default lower bound becomes 0.0."""
        clamped = normalize_score(-0.5)
        assert clamped == 0.0

    def test_passes_within_range(self) -> None:
        """A value inside the default range is returned unchanged."""
        unchanged = normalize_score(0.7)
        assert unchanged == 0.7

    def test_custom_range(self) -> None:
        """Explicit min_val/max_val bounds are honoured at both ends."""
        bounds = {"min_val": 0.0, "max_val": 10.0}
        assert normalize_score(15.0, **bounds) == 10.0
        assert normalize_score(-5.0, **bounds) == 0.0