Initial commit: PROMETHEUS v0.1.0 - Prompt optimizer
- Clean architecture (domain/application/infrastructure)
- DSPy-based evolution engine with scoring
- CLI via pyproject.toml entry point
- Unit + integration tests (~300 tests)
- Configs for glm-5.1 and glm-4.5-air models
- Z.AI endpoint integration
This commit is contained in:
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
93
tests/conftest.py
Normal file
93
tests/conftest.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""Shared test fixtures."""
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from prometheus.domain.entities import (
|
||||
EvalResult,
|
||||
Prompt,
|
||||
SyntheticExample,
|
||||
Trajectory,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def seed_prompt() -> Prompt:
    """Provide the baseline prompt that the tests start from."""
    baseline_text = "You are a helpful assistant. Answer the question."
    return Prompt(text=baseline_text)
|
||||
|
||||
|
||||
@pytest.fixture
def task_description() -> str:
    """Provide the task statement shared by the tests."""
    description = "Answer factual questions accurately and concisely."
    return description
|
||||
|
||||
|
||||
@pytest.fixture
def synthetic_pool() -> list[SyntheticExample]:
    """Provide twenty synthetic examples with ids 0 through 19."""
    pool: list[SyntheticExample] = []
    for index in range(20):
        pool.append(SyntheticExample(input_text=f"Test input {index}", id=index))
    return pool
|
||||
|
||||
|
||||
@pytest.fixture
def mock_eval_result() -> EvalResult:
    """Provide a five-item evaluation result.

    Returns:
        An ``EvalResult`` whose trajectory at each position carries the score
        and feedback found at the same position in ``scores`` and
        ``feedbacks``.
    """
    # Defined once so the trajectories cannot drift out of sync with the
    # score and feedback lists they mirror.
    scores = [0.3, 0.5, 0.4, 0.6, 0.2]
    feedbacks = [
        "Incomplete answer",
        "Missing key detail",
        "Wrong format",
        "Partially correct",
        "Completely off topic",
    ]
    trajectories = [
        Trajectory(
            input_text=f"Input {position}",
            output_text=f"Output {position}",
            score=score,
            feedback=feedback,
            prompt_used="test prompt",
        )
        for position, (score, feedback) in enumerate(zip(scores, feedbacks))
    ]
    return EvalResult(
        scores=scores,
        feedbacks=feedbacks,
        trajectories=trajectories,
    )
|
||||
|
||||
|
||||
@pytest.fixture
def mock_llm_port() -> MagicMock:
    """Provide an LLM port double whose ``execute`` returns a canned reply."""
    llm_double = MagicMock()
    llm_double.execute.return_value = "This is a mock response."
    return llm_double
|
||||
|
||||
|
||||
@pytest.fixture
def mock_judge_port() -> MagicMock:
    """Provide a judge port double whose ``judge_batch`` returns five 0.5 verdicts."""
    verdict = (0.5, "Moderate quality, needs improvement.")
    judge_double = MagicMock()
    # The same (score, feedback) tuple is repeated for each of the five slots.
    judge_double.judge_batch.return_value = [verdict for _ in range(5)]
    return judge_double
|
||||
|
||||
|
||||
@pytest.fixture
def mock_proposer_port() -> MagicMock:
    """Provide a proposer port double whose ``propose`` returns a fixed prompt."""
    proposed_prompt = Prompt(
        text="You are a very helpful assistant. Answer the question precisely."
    )
    proposer_double = MagicMock()
    proposer_double.propose.return_value = proposed_prompt
    return proposer_double
|
||||
0
tests/integration/__init__.py
Normal file
0
tests/integration/__init__.py
Normal file
29
tests/integration/test_dspy_adapters.py
Normal file
29
tests/integration/test_dspy_adapters.py
Normal file
@@ -0,0 +1,29 @@
|
||||
"""Integration tests for DSPy adapters using DSPy mock LM."""
|
||||
from __future__ import annotations
|
||||
|
||||
import dspy
|
||||
import pytest
|
||||
|
||||
from prometheus.domain.entities import Prompt
|
||||
from prometheus.infrastructure.llm_adapter import DSPyLLMAdapter
|
||||
|
||||
|
||||
@pytest.fixture
def mock_lm() -> dspy.LM:
    """Build a DSPy dummy LM with one canned answer, configure DSPy with it, return it.

    NOTE(review): nothing in this fixture undoes ``dspy.configure``, so the
    setting presumably outlives the test that requested it — confirm whether
    a teardown is wanted.
    """
    canned_answers = [{"output": "Mock output response"}]
    dummy = dspy.utils.DummyLM(canned_answers)
    dspy.configure(lm=dummy)
    return dummy
|
||||
|
||||
|
||||
class TestDSPyLLMAdapter:
    """Exercise ``DSPyLLMAdapter.execute`` while the ``mock_lm`` fixture is active."""

    def test_execute_returns_response(self, mock_lm: dspy.LM) -> None:
        """``execute`` returns a non-empty string."""
        response = DSPyLLMAdapter(model="openai/gpt-4o-mini").execute(
            Prompt(text="Answer the question."), "What is 2+2?"
        )

        assert isinstance(response, str)
        assert len(response) > 0
|
||||
74
tests/integration/test_full_pipeline.py
Normal file
74
tests/integration/test_full_pipeline.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""End-to-end pipeline test with mocked LLM calls."""
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from prometheus.application.bootstrap import SyntheticBootstrap
|
||||
from prometheus.application.dto import OptimizationConfig
|
||||
from prometheus.application.evaluator import PromptEvaluator
|
||||
from prometheus.application.use_cases import OptimizePromptUseCase
|
||||
from prometheus.domain.entities import EvalResult, Prompt, SyntheticExample, Trajectory
|
||||
from prometheus.domain.ports import JudgePort, LLMPort, ProposerPort
|
||||
|
||||
|
||||
def _make_eval(scores: list[float]) -> EvalResult:
    """Build an ``EvalResult`` with placeholder feedback for every score.

    One trajectory is created per score, with inputs ``in<i>`` and outputs
    ``out<i>``.
    """
    feedbacks = ["feedback" for _ in scores]
    trajectories = []
    for index, score in enumerate(scores):
        trajectories.append(
            Trajectory(f"in{index}", f"out{index}", score, "feedback", "prompt")
        )
    return EvalResult(scores=scores, feedbacks=feedbacks, trajectories=trajectories)
|
||||
|
||||
|
||||
class TestFullPipeline:
    """Drive ``OptimizePromptUseCase`` end to end with every port mocked."""

    def test_pipeline_produces_result(self) -> None:
        """Full pipeline with mocked ports produces an OptimizationResult."""
        llm_double = MagicMock(spec=LLMPort)
        llm_double.execute.return_value = "mock response"

        judge_double = MagicMock(spec=JudgePort)
        judge_double.judge_batch.return_value = [(0.5, "ok")] * 5

        proposer_double = MagicMock(spec=ProposerPort)
        proposer_double.propose.return_value = Prompt(text="Improved prompt")

        # Scripted evaluations handed out in order by the mocked ``evaluate``:
        # one low result for the seed, then a (current, new) pair for each of
        # the five iterations, the new one always scoring higher.
        scripted_evals = [_make_eval([0.3] * 5)]
        for _ in range(5):
            scripted_evals += [_make_eval([0.4] * 5), _make_eval([0.6] * 5)]

        evaluator = PromptEvaluator(llm_double, judge_double)
        evaluator.evaluate = MagicMock(side_effect=scripted_evals)

        generator_double = MagicMock()
        generator_double.generate_inputs.return_value = [
            SyntheticExample(input_text=f"synth input {n}", id=n) for n in range(20)
        ]

        use_case = OptimizePromptUseCase(
            evaluator=evaluator,
            proposer=proposer_double,
            bootstrap=SyntheticBootstrap(generator=generator_double, seed=42),
        )
        result = use_case.execute(
            OptimizationConfig(
                seed_prompt="Answer questions.",
                task_description="Answer questions accurately.",
                max_iterations=5,
                n_synthetic_inputs=20,
                minibatch_size=5,
                seed=42,
            )
        )

        assert result.initial_prompt == "Answer questions."
        assert result.optimized_prompt == "Improved prompt"
        assert result.iterations_used == 5
        assert result.total_llm_calls > 0
        assert result.final_score > result.initial_score
|
||||
0
tests/unit/__init__.py
Normal file
0
tests/unit/__init__.py
Normal file
50
tests/unit/test_bootstrap.py
Normal file
50
tests/unit/test_bootstrap.py
Normal file
@@ -0,0 +1,50 @@
|
||||
"""Unit tests for the bootstrap module."""
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from prometheus.application.bootstrap import SyntheticBootstrap
|
||||
from prometheus.domain.entities import SyntheticExample
|
||||
from prometheus.domain.ports import SyntheticGeneratorPort
|
||||
|
||||
|
||||
class TestSyntheticBootstrap:
    """Tests for ``SyntheticBootstrap.run`` and ``sample_minibatch``."""

    @staticmethod
    def _examples(count: int) -> list[SyntheticExample]:
        """Build ``count`` examples with ids 0..count-1."""
        return [SyntheticExample(input_text=f"input {n}", id=n) for n in range(count)]

    def test_run_returns_shuffled_examples(self) -> None:
        """``run`` asks the generator once and returns all ten examples."""
        generator = MagicMock(spec=SyntheticGeneratorPort)
        generator.generate_inputs.return_value = self._examples(10)

        produced = SyntheticBootstrap(generator=generator, seed=42).run("task desc", 10)

        assert len(produced) == 10
        generator.generate_inputs.assert_called_once_with("task desc", 10)

    def test_sample_minibatch_returns_correct_size(self) -> None:
        """A batch of five drawn from twenty has five items, all from the pool."""
        generator = MagicMock(spec=SyntheticGeneratorPort)
        pool = self._examples(20)

        sampler = SyntheticBootstrap(generator=generator, seed=42)
        batch = sampler.sample_minibatch(pool, 5)

        assert len(batch) == 5
        # Every sampled item must be a member of the pool.
        assert all(item in pool for item in batch)

    def test_sample_minibatch_capped_at_pool_size(self) -> None:
        """Requesting ten from a pool of three yields three items."""
        generator = MagicMock(spec=SyntheticGeneratorPort)
        pool = self._examples(3)

        sampler = SyntheticBootstrap(generator=generator, seed=42)
        batch = sampler.sample_minibatch(pool, 10)

        assert len(batch) == 3

    def test_deterministic_with_same_seed(self) -> None:
        """Two bootstraps built with the same seed sample the same batch."""
        generator = MagicMock(spec=SyntheticGeneratorPort)
        pool = self._examples(20)

        first = SyntheticBootstrap(generator=generator, seed=42)
        second = SyntheticBootstrap(generator=generator, seed=42)

        first_batch = first.sample_minibatch(pool, 5)
        second_batch = second.sample_minibatch(pool, 5)
        assert first_batch == second_batch
|
||||
198
tests/unit/test_dspy_modules.py
Normal file
198
tests/unit/test_dspy_modules.py
Normal file
@@ -0,0 +1,198 @@
|
||||
"""Unit tests for DSPy module parsing logic."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import dspy
|
||||
import pytest
|
||||
|
||||
from prometheus.infrastructure.dspy_modules import (
|
||||
InstructionProposer,
|
||||
OutputJudge,
|
||||
SyntheticInputGenerator,
|
||||
)
|
||||
|
||||
|
||||
class TestSyntheticInputGeneratorParseFallback:
    """Tests for ``SyntheticInputGenerator._parse_fallback`` on non-JSON text."""

    def test_extracts_quoted_strings(self) -> None:
        """Two double-quoted spans are returned in order."""
        parsed = SyntheticInputGenerator._parse_fallback(
            'Here are some: "first example" and "second example" done.'
        )
        assert parsed == ["first example", "second example"]

    def test_single_quoted_string(self) -> None:
        """A lone double-quoted span is returned as a one-item list."""
        parsed = SyntheticInputGenerator._parse_fallback('Just one: "hello world"')
        assert parsed == ["hello world"]

    def test_no_quotes_returns_raw_text(self) -> None:
        """Text without quotes comes back unchanged as the only item."""
        raw = "no quotes at all here"
        assert SyntheticInputGenerator._parse_fallback(raw) == ["no quotes at all here"]

    def test_empty_string_returns_itself(self) -> None:
        """The empty string comes back as a one-item list holding it."""
        assert SyntheticInputGenerator._parse_fallback("") == [""]

    def test_mixed_json_with_extra_text(self) -> None:
        """Quoted items are picked out of surrounding prose."""
        parsed = SyntheticInputGenerator._parse_fallback(
            'Results: "alpha", "beta", "gamma" — take your pick.'
        )
        assert parsed == ["alpha", "beta", "gamma"]
|
||||
|
||||
|
||||
class TestOutputJudgeForward:
    """Tests for how ``OutputJudge.forward`` parses and clamps the score.

    Each test replaces the judge's ``judge`` attribute with a mock returning
    a fixed ``dspy.Prediction``, so only the parsing of that prediction is
    exercised.
    """

    @pytest.fixture
    def judge(self) -> OutputJudge:
        """Provide a fresh ``OutputJudge``."""
        return OutputJudge()

    @staticmethod
    def _forward_with(judge: OutputJudge, raw_score: object, feedback: str):
        """Stub ``judge.judge`` with the given prediction and return ``forward``'s result."""
        judge.judge = MagicMock(
            return_value=dspy.Prediction(score=raw_score, feedback=feedback)
        )
        return judge.forward("task", "input", "output")

    def test_valid_numeric_score(self, judge: OutputJudge) -> None:
        """A numeric string is converted and the feedback passed through."""
        outcome = self._forward_with(judge, "0.8", "Good output.")

        assert outcome.score == 0.8
        assert outcome.feedback == "Good output."

    def test_non_numeric_score_falls_back_to_half(
        self, judge: OutputJudge
    ) -> None:
        """A non-numeric string yields 0.5."""
        outcome = self._forward_with(judge, "not-a-number", "N/A")

        assert outcome.score == 0.5

    def test_score_clamped_to_upper_bound(self, judge: OutputJudge) -> None:
        """A score above one is reported as 1.0."""
        outcome = self._forward_with(judge, "1.5", "Great!")

        assert outcome.score == 1.0

    def test_score_clamped_to_lower_bound(self, judge: OutputJudge) -> None:
        """A negative score is reported as 0.0."""
        outcome = self._forward_with(judge, "-0.3", "Terrible.")

        assert outcome.score == 0.0

    def test_empty_score_string_falls_back(self, judge: OutputJudge) -> None:
        """An empty score string yields 0.5."""
        outcome = self._forward_with(judge, "", "No score.")

        assert outcome.score == 0.5

    def test_boundary_score_one(self, judge: OutputJudge) -> None:
        """Exactly 1.0 is kept."""
        outcome = self._forward_with(judge, "1.0", "Perfect.")

        assert outcome.score == 1.0

    def test_boundary_score_zero(self, judge: OutputJudge) -> None:
        """Exactly 0.0 is kept."""
        outcome = self._forward_with(judge, "0.0", "Wrong.")

        assert outcome.score == 0.0

    def test_none_score_falls_back(self, judge: OutputJudge) -> None:
        """A ``None`` score yields 0.5."""
        outcome = self._forward_with(judge, None, "Missing.")

        assert outcome.score == 0.5
|
||||
|
||||
|
||||
class TestSyntheticInputGeneratorForward:
    """Tests for how ``SyntheticInputGenerator.forward`` parses the ``examples`` text.

    Each test replaces the generator's ``generate`` attribute with a mock
    returning a fixed ``dspy.Prediction``.
    """

    @pytest.fixture
    def generator(self) -> SyntheticInputGenerator:
        """Provide a fresh ``SyntheticInputGenerator``."""
        return SyntheticInputGenerator()

    def test_valid_json_parsed_correctly(
        self, generator: SyntheticInputGenerator
    ) -> None:
        """A JSON array of strings is returned as a list of those strings."""
        payload = json.dumps(["q1", "q2", "q3"])
        stubbed = dspy.Prediction(examples=payload)
        generator.generate = MagicMock(return_value=stubbed)

        outcome = generator.forward("task desc", 3)

        assert outcome.examples == ["q1", "q2", "q3"]

    def test_malformed_json_triggers_fallback(
        self, generator: SyntheticInputGenerator
    ) -> None:
        """Non-JSON text yields the double-quoted spans it contains."""
        stubbed = dspy.Prediction(examples='Here: "fallback item" and "another one"')
        generator.generate = MagicMock(return_value=stubbed)

        outcome = generator.forward("task desc", 2)

        assert outcome.examples == ["fallback item", "another one"]

    def test_empty_json_array(self, generator: SyntheticInputGenerator) -> None:
        """An empty JSON array yields an empty list."""
        stubbed = dspy.Prediction(examples="[]")
        generator.generate = MagicMock(return_value=stubbed)

        outcome = generator.forward("task desc", 0)

        assert outcome.examples == []
|
||||
|
||||
|
||||
class TestInstructionProposerForward:
    """Tests for ``InstructionProposer.forward`` with ``propose`` mocked."""

    @pytest.fixture
    def proposer(self) -> InstructionProposer:
        """Provide a fresh ``InstructionProposer``."""
        return InstructionProposer()

    def test_returns_new_instruction(self, proposer: InstructionProposer) -> None:
        """The mocked prediction's ``new_instruction`` is visible on the result."""
        stubbed = dspy.Prediction(new_instruction="Be concise and accurate.")
        proposer.propose = MagicMock(return_value=stubbed)

        outcome = proposer.forward(
            "Be helpful.", "Answer questions.", "Failed: too verbose"
        )

        assert outcome.new_instruction == "Be concise and accurate."

    def test_passes_correct_arguments(
        self, proposer: InstructionProposer
    ) -> None:
        """Positional arguments to ``forward`` reach ``propose`` as keywords."""
        stubbed = dspy.Prediction(new_instruction="improved")
        proposer.propose = MagicMock(return_value=stubbed)

        proposer.forward("current", "task desc", "failures")

        expected_kwargs = {
            "current_instruction": "current",
            "task_description": "task desc",
            "failure_examples": "failures",
        }
        proposer.propose.assert_called_once_with(**expected_kwargs)
|
||||
99
tests/unit/test_entities.py
Normal file
99
tests/unit/test_entities.py
Normal file
@@ -0,0 +1,99 @@
|
||||
"""Unit tests for domain entities."""
|
||||
from __future__ import annotations
|
||||
|
||||
from prometheus.domain.entities import (
|
||||
Candidate,
|
||||
EvalResult,
|
||||
OptimizationState,
|
||||
Prompt,
|
||||
SyntheticExample,
|
||||
Trajectory,
|
||||
)
|
||||
|
||||
|
||||
class TestPrompt:
    """Tests for the ``Prompt`` entity."""

    def test_prompt_text(self) -> None:
        """``text`` holds the value passed to the constructor."""
        assert Prompt(text="Hello").text == "Hello"

    def test_prompt_len(self) -> None:
        """``len`` of a prompt with text "Hello" is 5."""
        assert len(Prompt(text="Hello")) == 5

    def test_prompt_frozen(self) -> None:
        """Assigning to ``text`` raises ``AttributeError``."""
        frozen = Prompt(text="Hello")
        try:
            frozen.text = "World"  # type: ignore[misc]
        except AttributeError:
            return
        # Reached only when the assignment went through without an error.
        raise AssertionError("Should have raised FrozenInstanceError")

    def test_prompt_default_metadata(self) -> None:
        """``metadata`` defaults to an empty dict."""
        assert Prompt(text="Hello").metadata == {}

    def test_prompt_custom_metadata(self) -> None:
        """Metadata passed to the constructor is readable by key."""
        tagged = Prompt(text="Hello", metadata={"key": "value"})
        assert tagged.metadata["key"] == "value"
|
||||
|
||||
|
||||
class TestSyntheticExample:
    """Tests for the defaults of ``SyntheticExample``."""

    def test_default_category(self) -> None:
        """``category`` defaults to "default"."""
        assert SyntheticExample(input_text="test").category == "default"

    def test_default_id(self) -> None:
        """``id`` defaults to 0."""
        assert SyntheticExample(input_text="test").id == 0
|
||||
|
||||
|
||||
class TestEvalResult:
    """Tests for the aggregate score properties of ``EvalResult``."""

    def test_total_score(self) -> None:
        """``total_score`` of 0.3, 0.5 and 0.4 is 1.2 within tolerance."""
        result = EvalResult(
            scores=[0.3, 0.5, 0.4],
            feedbacks=["a", "b", "c"],
            trajectories=[],
        )
        # Compared with a tolerance rather than ``==``: adding these floats
        # left to right gives 1.2000000000000002, so exact equality would
        # depend on how the total is summed.
        assert abs(result.total_score - 1.2) < 1e-9

    def test_mean_score(self) -> None:
        """``mean_score`` of 0.3, 0.5 and 0.4 is 0.4 within tolerance."""
        result = EvalResult(
            scores=[0.3, 0.5, 0.4],
            feedbacks=["a", "b", "c"],
            trajectories=[],
        )
        assert abs(result.mean_score - 0.4) < 1e-9

    def test_mean_score_empty(self) -> None:
        """``mean_score`` of an empty result is 0.0."""
        result = EvalResult(scores=[], feedbacks=[], trajectories=[])
        assert result.mean_score == 0.0
|
||||
|
||||
|
||||
class TestTrajectory:
    """Tests for the ``Trajectory`` entity."""

    def test_trajectory_fields(self) -> None:
        """Keyword arguments are readable back as attributes."""
        fields = {
            "input_text": "in",
            "output_text": "out",
            "score": 0.8,
            "feedback": "good",
            "prompt_used": "test",
        }
        record = Trajectory(**fields)

        assert record.input_text == "in"
        assert record.score == 0.8
|
||||
|
||||
|
||||
class TestCandidate:
    """Tests for the defaults of ``Candidate``."""

    def test_candidate_defaults(self) -> None:
        """A candidate built from a prompt alone has zero score, generation 0, no parent."""
        fresh = Candidate(prompt=Prompt(text="test"))

        assert fresh.best_score == 0.0
        assert fresh.generation == 0
        assert fresh.parent_id is None
|
||||
|
||||
|
||||
class TestOptimizationState:
    """Tests for the defaults of ``OptimizationState``."""

    def test_default_state(self) -> None:
        """A default state has no iterations, candidates, best candidate or LLM calls."""
        fresh = OptimizationState()

        assert fresh.iteration == 0
        assert fresh.best_candidate is None
        assert fresh.candidates == []
        assert fresh.total_llm_calls == 0
|
||||
121
tests/unit/test_evaluator.py
Normal file
121
tests/unit/test_evaluator.py
Normal file
@@ -0,0 +1,121 @@
|
||||
"""Unit tests for PromptEvaluator.evaluate()."""
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from prometheus.application.evaluator import PromptEvaluator
|
||||
from prometheus.domain.entities import EvalResult, Prompt, SyntheticExample, Trajectory
|
||||
from prometheus.domain.ports import JudgePort, LLMPort
|
||||
|
||||
|
||||
class TestPromptEvaluatorEvaluate:
    """Tests for ``PromptEvaluator.evaluate`` with both ports mocked."""

    @pytest.fixture
    def executor(self) -> MagicMock:
        """Provide a mock constrained to the ``LLMPort`` interface."""
        return MagicMock(spec=LLMPort)

    @pytest.fixture
    def judge(self) -> MagicMock:
        """Provide a mock constrained to the ``JudgePort`` interface."""
        return MagicMock(spec=JudgePort)

    @pytest.fixture
    def evaluator(self, executor: MagicMock, judge: MagicMock) -> PromptEvaluator:
        """Provide an evaluator wired to the mocked executor and judge."""
        return PromptEvaluator(executor=executor, judge=judge)

    def test_happy_path_builds_correct_trajectories(
        self,
        evaluator: PromptEvaluator,
        executor: MagicMock,
        judge: MagicMock,
    ) -> None:
        """Scores, feedbacks and trajectories line up with the inputs."""
        prompt = Prompt(text="Answer the question.")
        examples = [
            SyntheticExample(input_text="What is 2+2?", id=0),
            SyntheticExample(input_text="Capital of France?", id=1),
        ]
        executor.execute.side_effect = ["4", "Paris"]
        judge.judge_batch.return_value = [
            (0.9, "Correct."),
            (0.8, "Mostly correct."),
        ]

        outcome = evaluator.evaluate(prompt, examples, "math and geography")

        assert isinstance(outcome, EvalResult)
        assert outcome.scores == [0.9, 0.8]
        assert outcome.feedbacks == ["Correct.", "Mostly correct."]
        assert len(outcome.trajectories) == 2

        first, second = outcome.trajectories[0], outcome.trajectories[1]
        assert first.input_text == "What is 2+2?"
        assert first.output_text == "4"
        assert first.score == 0.9
        assert first.feedback == "Correct."
        assert first.prompt_used == "Answer the question."
        assert second.prompt_used == "Answer the question."

    def test_empty_minibatch_returns_empty_result(
        self,
        evaluator: PromptEvaluator,
        executor: MagicMock,
        judge: MagicMock,
    ) -> None:
        """An empty minibatch gives empty lists and never calls the executor."""
        outcome = evaluator.evaluate(Prompt(text="test"), [], "task")

        assert outcome.scores == []
        assert outcome.feedbacks == []
        assert outcome.trajectories == []
        executor.execute.assert_not_called()
        # The judge is still invoked, with an empty list of pairs.
        judge.judge_batch.assert_called_once_with("task", [])

    def test_executor_called_with_correct_prompt(
        self,
        evaluator: PromptEvaluator,
        executor: MagicMock,
        judge: MagicMock,
    ) -> None:
        """The executor receives the prompt object and the example's input text."""
        prompt = Prompt(text="Summarize this.")
        executor.execute.return_value = "Summary."
        judge.judge_batch.return_value = [(0.7, "Good summary.")]

        evaluator.evaluate(
            prompt,
            [SyntheticExample(input_text="Long text here", id=0)],
            "summarization",
        )

        executor.execute.assert_called_once_with(prompt, "Long text here")

    def test_trajectories_prompt_used_matches_input_prompt(
        self,
        evaluator: PromptEvaluator,
        executor: MagicMock,
        judge: MagicMock,
    ) -> None:
        """``prompt_used`` on a trajectory is the evaluated prompt's text."""
        executor.execute.return_value = "Bonjour"
        judge.judge_batch.return_value = [(1.0, "Perfect.")]

        outcome = evaluator.evaluate(
            Prompt(text="Translate to French."),
            [SyntheticExample(input_text="Hello", id=0)],
            "translation",
        )

        assert outcome.trajectories[0].prompt_used == "Translate to French."

    def test_scores_feedbacks_trajectories_lists_sized_correctly(
        self,
        evaluator: PromptEvaluator,
        executor: MagicMock,
        judge: MagicMock,
    ) -> None:
        """Four examples produce four scores, feedbacks and trajectories."""
        examples = [SyntheticExample(input_text=f"q{n}", id=n) for n in range(4)]
        executor.execute.side_effect = [f"a{n}" for n in range(4)]
        judge.judge_batch.return_value = [(0.1 * n, f"fb{n}") for n in range(4)]

        outcome = evaluator.evaluate(Prompt(text="test prompt"), examples, "task")

        assert len(outcome.scores) == 4
        assert len(outcome.feedbacks) == 4
        assert len(outcome.trajectories) == 4
|
||||
147
tests/unit/test_evolution.py
Normal file
147
tests/unit/test_evolution.py
Normal file
@@ -0,0 +1,147 @@
|
||||
"""Unit tests for the evolution loop — with full mocking."""
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from prometheus.application.bootstrap import SyntheticBootstrap
|
||||
from prometheus.application.evaluator import PromptEvaluator
|
||||
from prometheus.application.evolution import EvolutionLoop
|
||||
from prometheus.domain.entities import EvalResult, Prompt, SyntheticExample, Trajectory
|
||||
|
||||
|
||||
class TestEvolutionLoop:
    """Tests for ``EvolutionLoop.run`` driven by scripted evaluation results."""

    @staticmethod
    def _uniform_eval(scores: list[float], feedback: str) -> EvalResult:
        """Build an ``EvalResult`` with one trajectory per score and shared feedback."""
        return EvalResult(
            scores=scores,
            feedbacks=[feedback] * len(scores),
            trajectories=[
                Trajectory(f"input{idx}", f"output{idx}", value, feedback, "prompt")
                for idx, value in enumerate(scores)
            ],
        )

    @staticmethod
    def _fixed_batch_bootstrap(pool: list[SyntheticExample]) -> MagicMock:
        """Build a bootstrap double that always samples the first five pool items."""
        stub = MagicMock(spec=SyntheticBootstrap)
        stub.sample_minibatch.return_value = pool[:5]
        return stub

    def test_accepts_improvement(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: MagicMock,
        mock_judge_port: MagicMock,
        mock_proposer_port: MagicMock,
    ) -> None:
        """When the new prompt improves the score, the best candidate is updated."""
        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
        bootstrap = self._fixed_batch_bootstrap(synthetic_pool)

        # Handed out in this order by the mocked ``evaluate``.
        scripted = [
            self._uniform_eval([0.3, 0.4, 0.3, 0.5, 0.2], "bad"),
            self._uniform_eval([0.3, 0.4, 0.3, 0.5, 0.2], "bad"),
            EvalResult(
                scores=[0.8, 0.9, 0.7, 0.8, 0.9],
                feedbacks=["good"] * 5,
                trajectories=[],
            ),
        ]
        evaluator.evaluate = MagicMock(side_effect=scripted)

        loop = EvolutionLoop(
            evaluator=evaluator,
            proposer=mock_proposer_port,
            bootstrap=bootstrap,
            max_iterations=1,
            minibatch_size=5,
        )
        with patch.object(loop, "_log"):
            state = loop.run(seed_prompt, synthetic_pool, task_description)

        assert state.best_candidate is not None
        assert state.best_candidate.best_score > 0

    def test_rejects_regression(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: MagicMock,
        mock_judge_port: MagicMock,
        mock_proposer_port: MagicMock,
    ) -> None:
        """When the new prompt degrades the score, the best candidate stays unchanged."""
        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
        bootstrap = self._fixed_batch_bootstrap(synthetic_pool)

        # Handed out in this order by the mocked ``evaluate``.
        scripted = [
            self._uniform_eval([0.7, 0.8, 0.7, 0.8, 0.9], "ok"),
            self._uniform_eval([0.7, 0.8, 0.7, 0.8, 0.9], "ok"),
            EvalResult(
                scores=[0.2, 0.1, 0.3, 0.2, 0.1],
                feedbacks=["bad"] * 5,
                trajectories=[],
            ),
        ]
        evaluator.evaluate = MagicMock(side_effect=scripted)

        loop = EvolutionLoop(
            evaluator=evaluator,
            proposer=mock_proposer_port,
            bootstrap=bootstrap,
            max_iterations=1,
            minibatch_size=5,
        )
        with patch.object(loop, "_log"):
            state = loop.run(seed_prompt, synthetic_pool, task_description)

        assert state.best_candidate is not None
        assert state.best_candidate.prompt.text == seed_prompt.text

    def test_skips_perfect_scores(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: MagicMock,
        mock_judge_port: MagicMock,
        mock_proposer_port: MagicMock,
    ) -> None:
        """When all scores are perfect, no proposition is made."""
        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
        bootstrap = self._fixed_batch_bootstrap(synthetic_pool)

        # Every call to the mocked ``evaluate`` returns this same result.
        flawless = self._uniform_eval([1.0, 1.0, 1.0, 1.0, 1.0], "perfect")
        evaluator.evaluate = MagicMock(return_value=flawless)

        loop = EvolutionLoop(
            evaluator=evaluator,
            proposer=mock_proposer_port,
            bootstrap=bootstrap,
            max_iterations=3,
            minibatch_size=5,
        )
        with patch.object(loop, "_log"):
            loop.run(seed_prompt, synthetic_pool, task_description)

        mock_proposer_port.propose.assert_not_called()
|
||||
99
tests/unit/test_file_io.py
Normal file
99
tests/unit/test_file_io.py
Normal file
@@ -0,0 +1,99 @@
|
||||
"""Unit tests for YamlPersistence file I/O."""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import yaml
|
||||
|
||||
from prometheus.infrastructure.file_io import YamlPersistence
|
||||
|
||||
|
||||
class TestYamlPersistenceReadConfig:
    """Tests for ``YamlPersistence.read_config``."""

    @staticmethod
    def _dump_yaml(target: Path, payload: dict) -> None:
        """Write ``payload`` to ``target`` with ``yaml.dump``."""
        with open(target, "w") as handle:
            yaml.dump(payload, handle)

    def test_roundtrip_write_and_read(self, tmp_path: Path) -> None:
        """A flat mapping dumped to disk is read back unchanged."""
        expected = {
            "seed_prompt": "You are helpful.",
            "task_description": "Answer questions.",
            "max_iterations": 30,
            "verbose": True,
        }
        config_file = tmp_path / "config.yaml"
        self._dump_yaml(config_file, expected)

        loaded = YamlPersistence().read_config(str(config_file))

        assert loaded == expected

    def test_reads_nested_yaml(self, tmp_path: Path) -> None:
        """Nested mappings and lists are preserved."""
        config_file = tmp_path / "nested.yaml"
        self._dump_yaml(
            config_file,
            {
                "model": {"name": "gpt-4o", "temperature": 0.7},
                "params": [1, 2, 3],
            },
        )

        loaded = YamlPersistence().read_config(str(config_file))

        assert loaded["model"]["name"] == "gpt-4o"
        assert loaded["params"] == [1, 2, 3]

    def test_missing_file_raises_error(self, tmp_path: Path) -> None:
        """Reading a path that does not exist raises ``FileNotFoundError``."""
        reader = YamlPersistence()
        absent = str(tmp_path / "nonexistent.yaml")

        with pytest.raises(FileNotFoundError):
            reader.read_config(absent)

    def test_malformed_yaml_raises_error(self, tmp_path: Path) -> None:
        """Reading syntactically invalid YAML raises ``yaml.YAMLError``."""
        reader = YamlPersistence()
        broken = tmp_path / "bad.yaml"
        broken.write_text(": [invalid: {yaml", encoding="utf-8")

        with pytest.raises(yaml.YAMLError):
            reader.read_config(str(broken))
|
||||
|
||||
|
||||
class TestYamlPersistenceWriteResult:
    """Tests for ``YamlPersistence.write_result``."""

    def test_roundtrip_write_result(self, tmp_path: Path) -> None:
        """A written mapping loads back equal via ``yaml.safe_load``."""
        expected = {
            "optimized_prompt": "Improved prompt.",
            "initial_score": 0.4,
            "final_score": 0.85,
        }
        output_file = tmp_path / "result.yaml"
        YamlPersistence().write_result(str(output_file), expected)

        with output_file.open() as handle:
            reloaded = yaml.safe_load(handle)

        assert reloaded == expected

    def test_write_result_creates_valid_yaml(self, tmp_path: Path) -> None:
        """The written text contains the expected ``key: value`` lines."""
        output_file = tmp_path / "out.yaml"
        YamlPersistence().write_result(str(output_file), {"key": "value", "number": 42})

        written_text = output_file.read_text()

        assert "key: value" in written_text
        assert "number: 42" in written_text

    def test_write_result_handles_unicode(self, tmp_path: Path) -> None:
        """Non-ASCII text survives a write followed by a UTF-8 read."""
        output_file = tmp_path / "unicode.yaml"
        YamlPersistence().write_result(
            str(output_file), {"prompt": "Répondez en français. 中文测试"}
        )

        with output_file.open(encoding="utf-8") as handle:
            reloaded = yaml.safe_load(handle)

        assert reloaded["prompt"] == "Répondez en français. 中文测试"
|
||||
54
tests/unit/test_scoring.py
Normal file
54
tests/unit/test_scoring.py
Normal file
@@ -0,0 +1,54 @@
|
||||
"""Unit tests for scoring logic."""
|
||||
from __future__ import annotations
|
||||
|
||||
from prometheus.domain.entities import EvalResult, Trajectory
|
||||
from prometheus.domain.scoring import normalize_score, should_accept
|
||||
|
||||
|
||||
def _make_eval(scores: list[float]) -> EvalResult:
    """Build an ``EvalResult`` with empty feedback for every score.

    One trajectory is created per score, with inputs ``in<i>`` and outputs
    ``out<i>``.
    """
    blank_feedbacks = ["" for _ in scores]
    trajectories = []
    for index, score in enumerate(scores):
        trajectories.append(Trajectory(f"in{index}", f"out{index}", score, "", "p"))
    return EvalResult(
        scores=scores,
        feedbacks=blank_feedbacks,
        trajectories=trajectories,
    )
|
||||
|
||||
|
||||
class TestShouldAccept:
    """Tests for ``should_accept`` on old versus new evaluation results."""

    def test_accepts_improvement(self) -> None:
        """Higher new scores are accepted."""
        verdict = should_accept(_make_eval([0.3, 0.4]), _make_eval([0.8, 0.9]))
        assert verdict is True

    def test_rejects_regression(self) -> None:
        """Lower new scores are rejected."""
        verdict = should_accept(_make_eval([0.8, 0.9]), _make_eval([0.3, 0.4]))
        assert verdict is False

    def test_rejects_equal(self) -> None:
        """Identical scores are rejected."""
        verdict = should_accept(_make_eval([0.5, 0.5]), _make_eval([0.5, 0.5]))
        assert verdict is False

    def test_min_improvement_threshold(self) -> None:
        """A 0.5 to 0.6 gain is rejected at threshold 0.2 and accepted at 0.05."""
        before = _make_eval([0.5])
        after = _make_eval([0.6])

        strict = should_accept(before, after, min_improvement=0.2)
        assert strict is False
        lenient = should_accept(before, after, min_improvement=0.05)
        assert lenient is True
|
||||
|
||||
|
||||
class TestNormalizeScore:
    """Tests for ``normalize_score`` clamping."""

    def test_clamps_high(self) -> None:
        """1.5 is reported as 1.0 with the default range."""
        clamped = normalize_score(1.5)
        assert clamped == 1.0

    def test_clamps_low(self) -> None:
        """-0.5 is reported as 0.0 with the default range."""
        clamped = normalize_score(-0.5)
        assert clamped == 0.0

    def test_passes_within_range(self) -> None:
        """0.7 is returned unchanged."""
        unchanged = normalize_score(0.7)
        assert unchanged == 0.7

    def test_custom_range(self) -> None:
        """With bounds 0.0 and 10.0, 15.0 becomes 10.0 and -5.0 becomes 0.0."""
        above = normalize_score(15.0, min_val=0.0, max_val=10.0)
        assert above == 10.0
        below = normalize_score(-5.0, min_val=0.0, max_val=10.0)
        assert below == 0.0
|
||||
Reference in New Issue
Block a user