Initial commit: PROMETHEUS v0.1.0 - Prompt optimizer
- Clean architecture (domain/application/infrastructure)
- DSPy-based evolution engine with scoring
- CLI via pyproject.toml entry point
- Unit + integration tests (~300 tests)
- Configs for glm-5.1 and glm-4.5-air models
- Z.AI endpoint integration
This commit is contained in:
147
tests/unit/test_evolution.py
Normal file
147
tests/unit/test_evolution.py
Normal file
@@ -0,0 +1,147 @@
|
||||
"""Unit tests for the evolution loop — with full mocking."""
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from prometheus.application.bootstrap import SyntheticBootstrap
|
||||
from prometheus.application.evaluator import PromptEvaluator
|
||||
from prometheus.application.evolution import EvolutionLoop
|
||||
from prometheus.domain.entities import EvalResult, Prompt, SyntheticExample, Trajectory
|
||||
|
||||
|
||||
class TestEvolutionLoop:
    """Drive ``EvolutionLoop.run`` with a stubbed evaluator, proposer and bootstrap.

    Every test replaces ``PromptEvaluator.evaluate`` with a ``MagicMock`` so the
    scores seen by the loop are fully scripted, and patches ``loop._log`` so no
    logging side effects occur while the loop runs.
    """

    # Number of examples handed to the loop per minibatch in every test.
    _MINIBATCH_SIZE = 5

    @staticmethod
    def _make_eval(
        scores: list[float],
        feedback: str,
        *,
        with_trajectories: bool = True,
    ) -> EvalResult:
        """Build an ``EvalResult`` where every example carries the same feedback.

        Args:
            scores: One score per example.
            feedback: Feedback string repeated for every example.
            with_trajectories: When true, attach one ``Trajectory`` per score
                (``input{i}`` / ``output{i}``); when false, leave the
                trajectory list empty.

        Returns:
            A freshly constructed ``EvalResult`` (never shared between calls).
        """
        trajectories = []
        if with_trajectories:
            trajectories = [
                Trajectory(f"input{idx}", f"output{idx}", score, feedback, "prompt")
                for idx, score in enumerate(scores)
            ]
        return EvalResult(
            scores=list(scores),
            feedbacks=[feedback] * len(scores),
            trajectories=trajectories,
        )

    @classmethod
    def _make_collaborators(
        cls,
        mock_llm_port: MagicMock,
        mock_judge_port: MagicMock,
        synthetic_pool: list[SyntheticExample],
    ) -> tuple[PromptEvaluator, MagicMock]:
        """Return a real ``PromptEvaluator`` and a spec'd ``SyntheticBootstrap`` mock.

        The bootstrap mock always samples the first ``_MINIBATCH_SIZE`` examples
        of ``synthetic_pool``, which keeps the minibatch deterministic.
        """
        evaluator = PromptEvaluator(mock_llm_port, mock_judge_port)
        bootstrap = MagicMock(spec=SyntheticBootstrap)
        bootstrap.sample_minibatch.return_value = synthetic_pool[: cls._MINIBATCH_SIZE]
        return evaluator, bootstrap

    @classmethod
    def _make_loop(
        cls,
        evaluator: PromptEvaluator,
        proposer: MagicMock,
        bootstrap: MagicMock,
        max_iterations: int,
    ) -> EvolutionLoop:
        """Assemble an ``EvolutionLoop`` with the shared minibatch size."""
        return EvolutionLoop(
            evaluator=evaluator,
            proposer=proposer,
            bootstrap=bootstrap,
            max_iterations=max_iterations,
            minibatch_size=cls._MINIBATCH_SIZE,
        )

    def test_accepts_improvement(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: MagicMock,
        mock_judge_port: MagicMock,
        mock_proposer_port: MagicMock,
    ) -> None:
        """When the new prompt improves the score, the best candidate is updated."""
        evaluator, bootstrap = self._make_collaborators(
            mock_llm_port, mock_judge_port, synthetic_pool
        )

        low_scores = [0.3, 0.4, 0.3, 0.5, 0.2]
        # Consumed in order by successive ``evaluate`` calls. Presumably: seed
        # baseline, current prompt on the minibatch, proposed prompt on the
        # minibatch -- confirm against EvolutionLoop.run.
        evaluator.evaluate = MagicMock(
            side_effect=[
                self._make_eval(low_scores, "bad"),
                self._make_eval(low_scores, "bad"),
                self._make_eval(
                    [0.8, 0.9, 0.7, 0.8, 0.9], "good", with_trajectories=False
                ),
            ]
        )

        loop = self._make_loop(evaluator, mock_proposer_port, bootstrap, max_iterations=1)
        with patch.object(loop, "_log"):
            state = loop.run(seed_prompt, synthetic_pool, task_description)

        assert state.best_candidate is not None
        # NOTE(review): ``> 0`` likely also holds for the unimproved seed scores;
        # consider asserting against the baseline once the scoring rule is confirmed.
        assert state.best_candidate.best_score > 0

    def test_rejects_regression(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: MagicMock,
        mock_judge_port: MagicMock,
        mock_proposer_port: MagicMock,
    ) -> None:
        """When the new prompt degrades the score, the best candidate stays unchanged."""
        evaluator, bootstrap = self._make_collaborators(
            mock_llm_port, mock_judge_port, synthetic_pool
        )

        high_scores = [0.7, 0.8, 0.7, 0.8, 0.9]
        # Two identical high-scoring results followed by a much lower one.
        evaluator.evaluate = MagicMock(
            side_effect=[
                self._make_eval(high_scores, "ok"),
                self._make_eval(high_scores, "ok"),
                self._make_eval(
                    [0.2, 0.1, 0.3, 0.2, 0.1], "bad", with_trajectories=False
                ),
            ]
        )

        loop = self._make_loop(evaluator, mock_proposer_port, bootstrap, max_iterations=1)
        with patch.object(loop, "_log"):
            state = loop.run(seed_prompt, synthetic_pool, task_description)

        assert state.best_candidate is not None
        assert state.best_candidate.prompt.text == seed_prompt.text

    def test_skips_perfect_scores(
        self,
        seed_prompt: Prompt,
        synthetic_pool: list[SyntheticExample],
        task_description: str,
        mock_llm_port: MagicMock,
        mock_judge_port: MagicMock,
        mock_proposer_port: MagicMock,
    ) -> None:
        """When all scores are perfect, no proposition is made."""
        evaluator, bootstrap = self._make_collaborators(
            mock_llm_port, mock_judge_port, synthetic_pool
        )

        # ``return_value`` (not ``side_effect``): every evaluation, however many
        # the loop performs over its three iterations, reports a perfect score.
        evaluator.evaluate = MagicMock(
            return_value=self._make_eval([1.0, 1.0, 1.0, 1.0, 1.0], "perfect")
        )

        loop = self._make_loop(evaluator, mock_proposer_port, bootstrap, max_iterations=3)
        with patch.object(loop, "_log"):
            loop.run(seed_prompt, synthetic_pool, task_description)

        mock_proposer_port.propose.assert_not_called()
|
||||
Reference in New Issue
Block a user