feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage

Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes
2 integration tests that broke when the codebase went async (DSPyLLMAdapter
and full pipeline tests now properly await coroutines).

277 tests pass (260 unit + 17 integration).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
FullStackDev
2026-03-29 19:13:50 +00:00
parent b9745566c8
commit a5bf2ad59c
43 changed files with 5007 additions and 358 deletions

278
tests/unit/test_cli.py Normal file
View File

@@ -0,0 +1,278 @@
"""Tests for the CLI interface — prometheus optimize, version, etc.
Uses Typer's CliRunner for isolated command testing.
"""
from __future__ import annotations

from contextlib import ExitStack
from pathlib import Path
from typing import Any
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
import yaml
from typer.testing import CliRunner

from prometheus.application.dto import OptimizationResult
from prometheus.cli.app import app

# Module-level runner shared by every test class below; each test drives
# the Typer ``app`` through ``runner.invoke``.
runner = CliRunner()
class TestCLIOptimize:
    """Tests for the `prometheus optimize` command.

    Every test that runs the command successfully goes through
    ``_invoke_optimize``, which replaces the use case with an ``AsyncMock``
    returning a canned ``OptimizationResult`` and patches the DSPy adapters
    and the ``dspy`` name in the optimize command module.
    """

    # Dotted path of the module whose attributes the tests patch.
    _OPTIMIZE_MODULE = "prometheus.cli.commands.optimize"

    def _write_config(self, tmp_path: Path, **overrides: object) -> Path:
        """Write a minimal valid config YAML and return its path.

        Args:
            tmp_path: Directory in which ``config.yaml`` is created.
            **overrides: Extra or replacement top-level config keys.

        Returns:
            Path of the written ``config.yaml``.
        """
        data: dict[str, object] = {
            "seed_prompt": "You are a helpful assistant.",
            "task_description": "Answer factual questions accurately.",
        }
        data.update(overrides)
        config_file = tmp_path / "config.yaml"
        # Explicit encoding so the fixture does not depend on the platform
        # default.
        with open(config_file, "w", encoding="utf-8") as f:
            yaml.dump(data, f)
        return config_file

    def _invoke_optimize(
        self,
        tmp_path: Path,
        mock_result: OptimizationResult,
        *extra_args: str,
    ) -> Any:
        """Run ``optimize`` through the CLI runner with its collaborators patched.

        Writes the default config into *tmp_path*, patches the use case, the
        DSPy adapters and ``dspy`` in the optimize command module, then
        invokes ``optimize -i <config> -o <tmp_path>/output.yaml`` followed
        by *extra_args*.

        Args:
            tmp_path: Scratch directory for the config and output paths.
            mock_result: Value returned by the mocked use case's ``execute``.
            *extra_args: Additional CLI arguments appended after ``-i``/``-o``.

        Returns:
            Whatever ``runner.invoke`` returns (typed ``Any`` because the
            concrete result class is not imported by this module).
        """
        config_file = self._write_config(tmp_path)
        output_file = tmp_path / "output.yaml"

        mock_uc = AsyncMock()
        mock_uc.execute.return_value = mock_result

        module = self._OPTIMIZE_MODULE
        # ExitStack keeps the patches flat while entering them in the same
        # order as the nested ``with`` statements it replaces.
        with ExitStack() as stack:
            stack.enter_context(
                patch(f"{module}.OptimizePromptUseCase", return_value=mock_uc)
            )
            stack.enter_context(patch(f"{module}.DSPySyntheticAdapter"))
            for adapter_name in (
                "DSPyLLMAdapter",
                "DSPyJudgeAdapter",
                "DSPyProposerAdapter",
            ):
                stack.enter_context(
                    patch(f"{module}.{adapter_name}", return_value=MagicMock())
                )
            stack.enter_context(patch(f"{module}.dspy"))
            return runner.invoke(
                app,
                [
                    "optimize",
                    "-i",
                    str(config_file),
                    "-o",
                    str(output_file),
                    *extra_args,
                ],
            )

    def test_optimize_with_valid_config(self, tmp_path: Path) -> None:
        """A valid config exits with 0 and prints the optimized-prompt heading."""
        mock_result = OptimizationResult(
            optimized_prompt="Improved prompt",
            initial_prompt="You are a helpful assistant.",
            iterations_used=5,
            total_llm_calls=50,
            initial_score=0.3,
            final_score=0.9,
            improvement=0.6,
            history=[],
        )
        result = self._invoke_optimize(tmp_path, mock_result)
        assert result.exit_code == 0
        assert "Optimized Prompt" in result.output

    def test_optimize_missing_input_file(self) -> None:
        """A non-existent input path makes the command exit non-zero."""
        result = runner.invoke(
            app,
            ["optimize", "-i", "/nonexistent/config.yaml"],
        )
        assert result.exit_code != 0

    def test_optimize_with_verbose_flag(self, tmp_path: Path) -> None:
        """The ``-v`` flag is accepted and the command exits with 0."""
        mock_result = OptimizationResult(
            optimized_prompt="Improved",
            initial_prompt="test",
            iterations_used=1,
            total_llm_calls=10,
            initial_score=0.3,
            final_score=0.8,
            improvement=0.5,
            history=[],
        )
        result = self._invoke_optimize(tmp_path, mock_result, "-v")
        assert result.exit_code == 0

    def test_optimize_displays_metrics(self, tmp_path: Path) -> None:
        """Initial score, final score and signed improvement appear in the output."""
        mock_result = OptimizationResult(
            optimized_prompt="Better prompt",
            initial_prompt="test",
            iterations_used=3,
            total_llm_calls=30,
            initial_score=0.40,
            final_score=0.85,
            improvement=0.45,
            history=[],
        )
        result = self._invoke_optimize(tmp_path, mock_result)
        assert result.exit_code == 0
        assert "0.40" in result.output
        assert "0.85" in result.output
        assert "+0.45" in result.output

    def test_optimize_with_max_concurrency_flag(self, tmp_path: Path) -> None:
        """``--max-concurrency 10`` is accepted and the command exits with 0."""
        mock_result = OptimizationResult(
            optimized_prompt="Better prompt",
            initial_prompt="test",
            iterations_used=1,
            total_llm_calls=10,
            initial_score=0.3,
            final_score=0.8,
            improvement=0.5,
            history=[],
        )
        result = self._invoke_optimize(
            tmp_path, mock_result, "--max-concurrency", "10"
        )
        assert result.exit_code == 0
class TestCLIHelp:
    """Tests for CLI help and no-args behavior."""

    def test_no_args_shows_help(self) -> None:
        """Invoking the app with no arguments prints the banner or usage text."""
        result = runner.invoke(app, [])
        # Typer uses exit code 2 when no_args_is_help=True
        assert result.exit_code in (0, 2), result.output
        assert "PROMETHEUS" in result.output or "Usage" in result.output

    def test_optimize_help(self) -> None:
        """``optimize --help`` exits with 0 and mentions the input option."""
        result = runner.invoke(app, ["optimize", "--help"])
        assert result.exit_code == 0, result.output
        # The case-insensitive check already matches an upper-case "INPUT",
        # so no separate upper-case test is needed.
        assert "input" in result.output.lower()

    def test_version_help(self) -> None:
        """``version --help`` exits with 0."""
        result = runner.invoke(app, ["version", "--help"])
        assert result.exit_code == 0, result.output

    def test_init_help(self) -> None:
        """``init --help`` exits with 0."""
        result = runner.invoke(app, ["init", "--help"])
        assert result.exit_code == 0, result.output

    def test_list_help(self) -> None:
        """``list --help`` exits with 0."""
        result = runner.invoke(app, ["list", "--help"])
        assert result.exit_code == 0, result.output
class TestCLIVersion:
    """Covers the `prometheus version` command."""

    def test_version_prints_version(self) -> None:
        """The command exits with 0 and prints the product name and version."""
        outcome = runner.invoke(app, ["version"])
        assert outcome.exit_code == 0

        printed = outcome.output
        # NOTE(review): both expected strings are hard-coded literals; the
        # version presumably has to be updated whenever the package version
        # is bumped -- confirm it matches the current release.
        for expected in ("PROMETHEUS", "0.1.0"):
            assert expected in printed
class TestCLIList:
    """Tests for the `prometheus list` command."""

    def test_list_no_runs(self, tmp_path: Path) -> None:
        """An empty directory exits with 0 and prints the 'no runs' message."""
        result = runner.invoke(app, ["list", "-d", str(tmp_path)])
        assert result.exit_code == 0
        assert "No optimization runs found" in result.output

    def test_list_with_result(self, tmp_path: Path) -> None:
        """A result YAML in the directory has its scores shown in the output."""
        result_data = {
            "optimized_prompt": "Better prompt for testing",
            "initial_prompt": "test",
            "iterations_used": 5,
            "total_llm_calls": 50,
            "initial_score": 0.30,
            "final_score": 0.90,
            "improvement": 0.60,
            "history": [],
        }
        result_file = tmp_path / "output.yaml"
        # Uses the module-level ``yaml`` import (no function-local alias) and
        # an explicit encoding so the fixture is platform-independent.
        with open(result_file, "w", encoding="utf-8") as f:
            yaml.dump(result_data, f)

        result = runner.invoke(app, ["list", "-d", str(tmp_path)])
        assert result.exit_code == 0
        assert "0.30" in result.output
        assert "0.90" in result.output

    def test_list_nonexistent_directory(self) -> None:
        """A directory that does not exist makes the command exit with code 1."""
        result = runner.invoke(app, ["list", "-d", "/nonexistent/dir"])
        assert result.exit_code == 1