"""End-to-end pipeline test with mocked LLM calls."""

from __future__ import annotations

from unittest.mock import MagicMock

from prometheus.application.bootstrap import SyntheticBootstrap
from prometheus.application.dto import OptimizationConfig
from prometheus.application.evaluator import PromptEvaluator
from prometheus.application.use_cases import OptimizePromptUseCase
from prometheus.domain.entities import EvalResult, Prompt, SyntheticExample, Trajectory
from prometheus.domain.ports import JudgePort, LLMPort, ProposerPort


def _make_eval(scores: list[float]) -> EvalResult:
    """Build a canned ``EvalResult`` holding one entry per score.

    Each score gets the literal feedback ``"feedback"`` and a ``Trajectory``
    built positionally from ``"in{i}"``, ``"out{i}"``, the score,
    ``"feedback"`` and ``"prompt"``, where ``i`` is the score's index.
    """
    canned_trajectories = []
    for index, score in enumerate(scores):
        canned_trajectories.append(
            Trajectory(f"in{index}", f"out{index}", score, "feedback", "prompt")
        )
    return EvalResult(
        scores=scores,
        feedbacks=["feedback" for _ in scores],
        trajectories=canned_trajectories,
    )


class TestFullPipeline:
    """Drive ``OptimizePromptUseCase`` end to end with every port mocked."""

    def test_pipeline_produces_result(self) -> None:
        """Full pipeline with mocked ports produces an OptimizationResult."""
        iterations = 5
        batch_size = 5
        n_inputs = 20
        rng_seed = 42

        # --- Mocked ports -------------------------------------------------
        llm = MagicMock(spec=LLMPort)
        llm.execute.return_value = "mock response"

        judge = MagicMock(spec=JudgePort)
        judge.judge_batch.return_value = [(0.5, "ok")] * batch_size

        proposer = MagicMock(spec=ProposerPort)
        proposer.propose.return_value = Prompt(text="Improved prompt")

        # --- Scripted evaluations -----------------------------------------
        # First the (low) seed evaluation, then one "current" / "new" pair
        # per iteration; the new score is higher so each proposal is meant
        # to be accepted.
        scripted_evals = [_make_eval([0.3] * batch_size)]
        for _ in range(iterations):
            scripted_evals.extend(
                (
                    _make_eval([0.4] * batch_size),  # current eval
                    _make_eval([0.6] * batch_size),  # new eval (accepted)
                )
            )

        # Replace evaluate() on a real evaluator so the use case consumes
        # the scripted results in order instead of calling the mocked ports.
        evaluator = PromptEvaluator(llm, judge)
        evaluator.evaluate = MagicMock(side_effect=scripted_evals)

        # --- Synthetic-input bootstrap ------------------------------------
        generator = MagicMock()
        generator.generate_inputs.return_value = [
            SyntheticExample(input_text=f"synth input {idx}", id=idx)
            for idx in range(n_inputs)
        ]
        bootstrap = SyntheticBootstrap(generator=generator, seed=rng_seed)

        # --- Run ------------------------------------------------------------
        config = OptimizationConfig(
            seed_prompt="Answer questions.",
            task_description="Answer questions accurately.",
            max_iterations=iterations,
            n_synthetic_inputs=n_inputs,
            minibatch_size=batch_size,
            seed=rng_seed,
        )
        use_case = OptimizePromptUseCase(
            evaluator=evaluator,
            proposer=proposer,
            bootstrap=bootstrap,
        )
        result = use_case.execute(config)

        # --- Verify ---------------------------------------------------------
        assert result.initial_prompt == "Answer questions."
        assert result.optimized_prompt == "Improved prompt"
        assert result.iterations_used == 5
        assert result.total_llm_calls > 0
        assert result.final_score > result.initial_score