feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage
Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes 2 integration tests that broke when the codebase went async (DSPyLLMAdapter and full pipeline tests now properly await coroutines). 277 tests pass (260 unit + 17 integration). Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
294
tests/unit/test_adapters.py
Normal file
294
tests/unit/test_adapters.py
Normal file
@@ -0,0 +1,294 @@
|
||||
"""Unit tests for infrastructure adapters — LLM, Judge, Proposer, Synthetic.
|
||||
|
||||
Uses mocked DSPy modules to isolate adapter logic from LLM calls.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import dspy
|
||||
import pytest
|
||||
|
||||
from prometheus.domain.entities import Prompt, SyntheticExample, Trajectory
|
||||
from prometheus.infrastructure.judge_adapter import DSPyJudgeAdapter
|
||||
from prometheus.infrastructure.llm_adapter import DSPyLLMAdapter
|
||||
from prometheus.infrastructure.proposer_adapter import DSPyProposerAdapter
|
||||
from prometheus.infrastructure.synth_adapter import DSPySyntheticAdapter
|
||||
|
||||
|
||||
# --- LLM Adapter ---
|
||||
|
||||
|
||||
class TestDSPyLLMAdapter:
    """Exercise DSPyLLMAdapter.execute() against a stubbed predictor."""

    @pytest.fixture
    def mock_lm(self) -> MagicMock:
        """Provide a stand-in language model specced against ``dspy.LM``."""
        return MagicMock(spec=dspy.LM)

    @pytest.fixture
    def adapter(self, mock_lm: MagicMock) -> DSPyLLMAdapter:
        """Provide an adapter wired to the mocked language model."""
        return DSPyLLMAdapter(lm=mock_lm)

    @staticmethod
    def _stub_predictor(adapter: DSPyLLMAdapter, output: object) -> MagicMock:
        """Swap the adapter's ``_predictor`` for a stub whose result exposes *output*."""
        predictor_stub = MagicMock(return_value=MagicMock(output=output))
        adapter._predictor = predictor_stub
        return predictor_stub

    @pytest.mark.asyncio
    async def test_execute_returns_output_string(
        self, adapter: DSPyLLMAdapter, mock_lm: MagicMock
    ) -> None:
        """execute() hands back the ``output`` attribute of the predictor result."""
        self._stub_predictor(adapter, "Hello response")

        answer = await adapter.execute(Prompt(text="Say hello."), "Hi there")

        assert answer == "Hello response"

    @pytest.mark.asyncio
    async def test_execute_passes_prompt_text_and_input(
        self, adapter: DSPyLLMAdapter, mock_lm: MagicMock
    ) -> None:
        """The predictor is called once with the prompt text and the raw input."""
        predictor_stub = self._stub_predictor(adapter, "response")

        await adapter.execute(Prompt(text="Translate this."), "Hello world")

        expected_kwargs = {
            "instruction": "Translate this.",
            "input_text": "Hello world",
        }
        predictor_stub.assert_called_once_with(**expected_kwargs)

    @pytest.mark.asyncio
    async def test_execute_uses_dspy_context(
        self, adapter: DSPyLLMAdapter, mock_lm: MagicMock
    ) -> None:
        """execute() opens ``dspy.context`` exactly once with the adapter's LM."""
        self._stub_predictor(adapter, "ok")

        context_target = "prometheus.infrastructure.llm_adapter.dspy.context"
        with patch(context_target) as mock_ctx:
            await adapter.execute(Prompt(text="test"), "input")
            mock_ctx.assert_called_once_with(lm=mock_lm)

    @pytest.mark.asyncio
    async def test_execute_converts_output_to_str(
        self, adapter: DSPyLLMAdapter, mock_lm: MagicMock
    ) -> None:
        """A non-string predictor output comes back as its ``str`` form."""
        self._stub_predictor(adapter, 42)

        answer = await adapter.execute(Prompt(text="test"), "input")

        assert isinstance(answer, str)
        assert answer == "42"
|
||||
|
||||
|
||||
# --- Judge Adapter ---
|
||||
|
||||
|
||||
class TestDSPyJudgeAdapter:
    """Exercise DSPyJudgeAdapter.judge_batch() with the judge module stubbed."""

    @pytest.fixture
    def mock_lm(self) -> MagicMock:
        """Provide a stand-in language model specced against ``dspy.LM``."""
        return MagicMock(spec=dspy.LM)

    @pytest.fixture
    def adapter(self, mock_lm: MagicMock) -> DSPyJudgeAdapter:
        """Provide an adapter wired to the mocked language model."""
        return DSPyJudgeAdapter(lm=mock_lm)

    @pytest.mark.asyncio
    async def test_judge_batch_returns_scores_and_feedback(
        self, adapter: DSPyJudgeAdapter, mock_lm: MagicMock
    ) -> None:
        """Each (input, output) pair maps to a ``(score, feedback)`` tuple."""
        # NOTE(review): the stub hands out verdicts in call order, so the
        # positional assertions below assume the judge is invoked in pair
        # order — confirm against the adapter if calls are dispatched in parallel.
        verdicts = [
            MagicMock(score=0.9, feedback="Excellent."),
            MagicMock(score=0.4, feedback="Incomplete."),
        ]
        adapter._judge = MagicMock(side_effect=verdicts)

        qa_pairs = [("What is 2+2?", "4"), ("Capital of France?", "London")]
        result = await adapter.judge_batch("math and geography", qa_pairs)

        assert len(result) == 2
        assert result[0] == (0.9, "Excellent.")
        assert result[1] == (0.4, "Incomplete.")

    @pytest.mark.asyncio
    async def test_judge_batch_empty_pairs(
        self, adapter: DSPyJudgeAdapter, mock_lm: MagicMock
    ) -> None:
        """An empty batch yields an empty result list."""
        outcome = await adapter.judge_batch("task", [])

        assert outcome == []

    @pytest.mark.asyncio
    async def test_judge_batch_uses_dspy_context(
        self, adapter: DSPyJudgeAdapter, mock_lm: MagicMock
    ) -> None:
        """judge_batch() opens ``dspy.context`` exactly once with the adapter's LM."""
        adapter._judge = MagicMock(return_value=MagicMock(score=0.5, feedback="ok"))

        context_target = "prometheus.infrastructure.judge_adapter.dspy.context"
        with patch(context_target) as mock_ctx:
            await adapter.judge_batch("task", [("in", "out")])
            mock_ctx.assert_called_once_with(lm=mock_lm)

    @pytest.mark.asyncio
    async def test_judge_batch_returns_all_results(
        self, adapter: DSPyJudgeAdapter, mock_lm: MagicMock
    ) -> None:
        """Judge calls run in parallel but all results are returned."""
        verdicts = [
            MagicMock(score=0.5, feedback="ok"),
            MagicMock(score=0.7, feedback="better"),
            MagicMock(score=0.3, feedback="worse"),
        ]
        adapter._judge = MagicMock(side_effect=verdicts)

        qa_pairs = [("first", "out1"), ("second", "out2"), ("third", "out3")]
        results = await adapter.judge_batch("task", qa_pairs)

        assert len(results) == 3
        # Membership rather than position: only presence of every score is checked.
        returned_scores = [entry[0] for entry in results]
        for expected_score in (0.5, 0.7, 0.3):
            assert expected_score in returned_scores
|
||||
|
||||
|
||||
# --- Proposer Adapter ---
|
||||
|
||||
|
||||
class TestDSPyProposerAdapter:
    """Exercise DSPyProposerAdapter.propose() and its failure formatter."""

    @pytest.fixture
    def mock_lm(self) -> MagicMock:
        """Provide a stand-in language model specced against ``dspy.LM``."""
        return MagicMock(spec=dspy.LM)

    @pytest.fixture
    def adapter(self, mock_lm: MagicMock) -> DSPyProposerAdapter:
        """Provide an adapter wired to the mocked language model."""
        return DSPyProposerAdapter(lm=mock_lm)

    @pytest.mark.asyncio
    async def test_propose_returns_new_prompt(
        self, adapter: DSPyProposerAdapter, mock_lm: MagicMock
    ) -> None:
        """propose() wraps the proposer's ``new_instruction`` in a Prompt."""
        proposal = MagicMock(new_instruction="Be concise and accurate.")
        adapter._proposer = MagicMock(return_value=proposal)

        failures = [Trajectory("in", "out", 0.3, "too verbose", "Answer questions.")]
        result = await adapter.propose(Prompt(text="Answer questions."), failures, "Q&A task")

        assert isinstance(result, Prompt)
        assert result.text == "Be concise and accurate."

    @pytest.mark.asyncio
    async def test_propose_uses_dspy_context(
        self, adapter: DSPyProposerAdapter, mock_lm: MagicMock
    ) -> None:
        """propose() opens ``dspy.context`` exactly once with the adapter's LM."""
        adapter._proposer = MagicMock(return_value=MagicMock(new_instruction="improved"))

        context_target = "prometheus.infrastructure.proposer_adapter.dspy.context"
        with patch(context_target) as mock_ctx:
            await adapter.propose(Prompt(text="t"), [], "task")
            mock_ctx.assert_called_once_with(lm=mock_lm)

    def test_format_failures_single_trajectory(self) -> None:
        """A single trajectory's fields and a numbered header appear in the text."""
        failures = [
            Trajectory("What is AI?", "A type of robot.", 0.3, "Incomplete definition.", "prompt")
        ]

        rendered = DSPyProposerAdapter._format_failures(failures)

        expected_fragments = (
            "What is AI?",
            "A type of robot.",
            "0.30",
            "Incomplete definition.",
            "# Example 1",
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_format_failures_multiple_trajectories(self) -> None:
        """Several trajectories are numbered, separated, and all included."""
        failures = [
            Trajectory("input1", "output1", 0.4, "bad", "prompt"),
            Trajectory("input2", "output2", 0.2, "worse", "prompt"),
        ]

        rendered = DSPyProposerAdapter._format_failures(failures)

        for fragment in ("# Example 1", "# Example 2", "---", "input1", "input2"):
            assert fragment in rendered

    def test_format_failures_empty_list(self) -> None:
        """No trajectories renders to the empty string."""
        rendered = DSPyProposerAdapter._format_failures([])

        assert rendered == ""
|
||||
|
||||
|
||||
# --- Synthetic Adapter ---
|
||||
|
||||
|
||||
class TestDSPySyntheticAdapter:
    """Exercise DSPySyntheticAdapter.generate_inputs() with the generator stubbed."""

    @pytest.fixture
    def mock_lm(self) -> MagicMock:
        """Provide a stand-in language model specced against ``dspy.LM``."""
        return MagicMock(spec=dspy.LM)

    @pytest.fixture
    def adapter(self, mock_lm: MagicMock) -> DSPySyntheticAdapter:
        """Provide an adapter wired to the mocked language model."""
        return DSPySyntheticAdapter(lm=mock_lm)

    def test_generate_inputs_returns_examples(
        self, adapter: DSPySyntheticAdapter, mock_lm: MagicMock
    ) -> None:
        """Generated strings come back as SyntheticExample objects; the first two have ids 0 and 1."""
        generated = MagicMock(examples=["What is AI?", "Explain ML.", "What is NLP?"])
        adapter._generator = MagicMock(return_value=generated)

        examples = adapter.generate_inputs("AI task", 3)

        assert len(examples) == 3
        assert all(isinstance(item, SyntheticExample) for item in examples)
        first, second = examples[0], examples[1]
        assert first.input_text == "What is AI?"
        assert first.id == 0
        assert second.id == 1

    def test_generate_inputs_truncates_to_n(
        self, adapter: DSPySyntheticAdapter, mock_lm: MagicMock
    ) -> None:
        """When the generator over-produces, only the requested count is returned."""
        generated = MagicMock(examples=["q1", "q2", "q3", "q4", "q5"])
        adapter._generator = MagicMock(return_value=generated)

        examples = adapter.generate_inputs("task", 3)

        assert len(examples) == 3

    def test_generate_inputs_passes_correct_args(
        self, adapter: DSPySyntheticAdapter, mock_lm: MagicMock
    ) -> None:
        """The generator is called once with the task description and count."""
        generator_stub = MagicMock(return_value=MagicMock(examples=["q1"]))
        adapter._generator = generator_stub

        adapter.generate_inputs("my task", 5)

        expected_kwargs = {"task_description": "my task", "n_examples": 5}
        adapter._generator.assert_called_once_with(**expected_kwargs)

    def test_generate_inputs_empty_list(
        self, adapter: DSPySyntheticAdapter, mock_lm: MagicMock
    ) -> None:
        """An empty generator result with n=0 yields an empty list."""
        adapter._generator = MagicMock(return_value=MagicMock(examples=[]))

        examples = adapter.generate_inputs("task", 0)

        assert examples == []
|
||||
Reference in New Issue
Block a user