Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes 2 integration tests that broke when the codebase went async (DSPyLLMAdapter and full pipeline tests now properly await coroutines). 277 tests pass (260 unit + 17 integration). Co-Authored-By: Paperclip <noreply@paperclip.ing>
295 lines
9.7 KiB
Python
295 lines
9.7 KiB
Python
"""Unit tests for infrastructure adapters — LLM, Judge, Proposer, Synthetic.

Uses mocked DSPy modules to isolate adapter logic from LLM calls.
"""
|
|
from __future__ import annotations
|
|
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
import dspy
|
|
import pytest
|
|
|
|
from prometheus.domain.entities import Prompt, SyntheticExample, Trajectory
|
|
from prometheus.infrastructure.judge_adapter import DSPyJudgeAdapter
|
|
from prometheus.infrastructure.llm_adapter import DSPyLLMAdapter
|
|
from prometheus.infrastructure.proposer_adapter import DSPyProposerAdapter
|
|
from prometheus.infrastructure.synth_adapter import DSPySyntheticAdapter
|
|
|
|
|
|
# --- LLM Adapter ---
|
|
|
|
|
|
class TestDSPyLLMAdapter:
    """Exercise DSPyLLMAdapter.execute() with the predictor swapped for a mock."""

    @pytest.fixture
    def mock_lm(self) -> MagicMock:
        """Provide a stand-in language model constrained to the dspy.LM spec."""
        return MagicMock(spec=dspy.LM)

    @pytest.fixture
    def adapter(self, mock_lm: MagicMock) -> DSPyLLMAdapter:
        """Build the adapter under test around the mocked language model."""
        return DSPyLLMAdapter(lm=mock_lm)

    @pytest.mark.asyncio
    async def test_execute_returns_output_string(
        self, adapter: DSPyLLMAdapter, mock_lm: MagicMock
    ) -> None:
        """The predictor's ``output`` value is what execute() hands back."""
        predictor_stub = MagicMock(return_value=MagicMock(output="Hello response"))
        adapter._predictor = predictor_stub

        say_hello = Prompt(text="Say hello.")
        answer = await adapter.execute(say_hello, "Hi there")

        assert answer == "Hello response"

    @pytest.mark.asyncio
    async def test_execute_passes_prompt_text_and_input(
        self, adapter: DSPyLLMAdapter, mock_lm: MagicMock
    ) -> None:
        """The predictor is called once with the prompt text and the raw input."""
        predictor_stub = MagicMock(return_value=MagicMock(output="response"))
        adapter._predictor = predictor_stub

        translate = Prompt(text="Translate this.")
        await adapter.execute(translate, "Hello world")

        expected_kwargs = {
            "instruction": "Translate this.",
            "input_text": "Hello world",
        }
        predictor_stub.assert_called_once_with(**expected_kwargs)

    @pytest.mark.asyncio
    async def test_execute_uses_dspy_context(
        self, adapter: DSPyLLMAdapter, mock_lm: MagicMock
    ) -> None:
        """execute() enters dspy.context exactly once, passing the adapter's LM."""
        predictor_stub = MagicMock(return_value=MagicMock(output="ok"))
        adapter._predictor = predictor_stub

        # Patch the name as looked up from inside the adapter module.
        with patch("prometheus.infrastructure.llm_adapter.dspy.context") as mock_ctx:
            await adapter.execute(Prompt(text="test"), "input")
            mock_ctx.assert_called_once_with(lm=mock_lm)

    @pytest.mark.asyncio
    async def test_execute_converts_output_to_str(
        self, adapter: DSPyLLMAdapter, mock_lm: MagicMock
    ) -> None:
        """A non-string predictor output (here an int) is returned as ``str``."""
        predictor_stub = MagicMock(return_value=MagicMock(output=42))
        adapter._predictor = predictor_stub

        answer = await adapter.execute(Prompt(text="test"), "input")

        assert isinstance(answer, str)
        assert answer == "42"
|
|
|
|
|
|
# --- Judge Adapter ---
|
|
|
|
|
|
class TestDSPyJudgeAdapter:
    """Exercise DSPyJudgeAdapter.judge_batch() with the judge module mocked out."""

    @pytest.fixture
    def mock_lm(self) -> MagicMock:
        """Provide a stand-in language model constrained to the dspy.LM spec."""
        return MagicMock(spec=dspy.LM)

    @pytest.fixture
    def adapter(self, mock_lm: MagicMock) -> DSPyJudgeAdapter:
        """Build the adapter under test around the mocked language model."""
        return DSPyJudgeAdapter(lm=mock_lm)

    @pytest.mark.asyncio
    async def test_judge_batch_returns_scores_and_feedback(
        self, adapter: DSPyJudgeAdapter, mock_lm: MagicMock
    ) -> None:
        """Each pair yields a ``(score, feedback)`` tuple, checked by position."""
        verdicts = [
            MagicMock(score=0.9, feedback="Excellent."),
            MagicMock(score=0.4, feedback="Incomplete."),
        ]
        adapter._judge = MagicMock(side_effect=verdicts)

        qa_pairs = [("What is 2+2?", "4"), ("Capital of France?", "London")]
        graded = await adapter.judge_batch("math and geography", qa_pairs)

        assert len(graded) == 2
        assert graded[0] == (0.9, "Excellent.")
        assert graded[1] == (0.4, "Incomplete.")

    @pytest.mark.asyncio
    async def test_judge_batch_empty_pairs(
        self, adapter: DSPyJudgeAdapter, mock_lm: MagicMock
    ) -> None:
        """With no pairs to grade, judge_batch() returns an empty list."""
        graded = await adapter.judge_batch("task", [])
        assert graded == []

    @pytest.mark.asyncio
    async def test_judge_batch_uses_dspy_context(
        self, adapter: DSPyJudgeAdapter, mock_lm: MagicMock
    ) -> None:
        """judge_batch() enters dspy.context exactly once, passing the adapter's LM."""
        adapter._judge = MagicMock(return_value=MagicMock(score=0.5, feedback="ok"))

        # Patch the name as looked up from inside the adapter module.
        with patch("prometheus.infrastructure.judge_adapter.dspy.context") as mock_ctx:
            await adapter.judge_batch("task", [("in", "out")])
            mock_ctx.assert_called_once_with(lm=mock_lm)

    @pytest.mark.asyncio
    async def test_judge_batch_returns_all_results(
        self, adapter: DSPyJudgeAdapter, mock_lm: MagicMock
    ) -> None:
        """Judge calls run in parallel but all results are returned."""
        verdicts = [
            MagicMock(score=0.5, feedback="ok"),
            MagicMock(score=0.7, feedback="better"),
            MagicMock(score=0.3, feedback="worse"),
        ]
        adapter._judge = MagicMock(side_effect=verdicts)

        io_pairs = [("first", "out1"), ("second", "out2"), ("third", "out3")]
        results = await adapter.judge_batch("task", io_pairs)

        assert len(results) == 3
        # Only membership is checked here, not ordering.
        scores = [entry[0] for entry in results]
        for expected_score in (0.5, 0.7, 0.3):
            assert expected_score in scores
|
|
|
|
|
|
# --- Proposer Adapter ---
|
|
|
|
|
|
class TestDSPyProposerAdapter:
    """Exercise DSPyProposerAdapter.propose() and its _format_failures() helper."""

    @pytest.fixture
    def mock_lm(self) -> MagicMock:
        """Provide a stand-in language model constrained to the dspy.LM spec."""
        return MagicMock(spec=dspy.LM)

    @pytest.fixture
    def adapter(self, mock_lm: MagicMock) -> DSPyProposerAdapter:
        """Build the adapter under test around the mocked language model."""
        return DSPyProposerAdapter(lm=mock_lm)

    @pytest.mark.asyncio
    async def test_propose_returns_new_prompt(
        self, adapter: DSPyProposerAdapter, mock_lm: MagicMock
    ) -> None:
        """propose() wraps the proposer's ``new_instruction`` in a Prompt."""
        proposal = MagicMock(new_instruction="Be concise and accurate.")
        adapter._proposer = MagicMock(return_value=proposal)

        baseline = Prompt(text="Answer questions.")
        failures = [Trajectory("in", "out", 0.3, "too verbose", "Answer questions.")]
        improved = await adapter.propose(baseline, failures, "Q&A task")

        assert isinstance(improved, Prompt)
        assert improved.text == "Be concise and accurate."

    @pytest.mark.asyncio
    async def test_propose_uses_dspy_context(
        self, adapter: DSPyProposerAdapter, mock_lm: MagicMock
    ) -> None:
        """propose() enters dspy.context exactly once, passing the adapter's LM."""
        adapter._proposer = MagicMock(
            return_value=MagicMock(new_instruction="improved")
        )

        # Patch the name as looked up from inside the adapter module.
        with patch("prometheus.infrastructure.proposer_adapter.dspy.context") as mock_ctx:
            await adapter.propose(Prompt(text="t"), [], "task")
            mock_ctx.assert_called_once_with(lm=mock_lm)

    def test_format_failures_single_trajectory(self) -> None:
        """One trajectory renders its fields, a 2-decimal score, and a header."""
        failures = [
            Trajectory("What is AI?", "A type of robot.", 0.3, "Incomplete definition.", "prompt")
        ]
        rendered = DSPyProposerAdapter._format_failures(failures)

        expected_fragments = (
            "What is AI?",
            "A type of robot.",
            "0.30",
            "Incomplete definition.",
            "# Example 1",
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_format_failures_multiple_trajectories(self) -> None:
        """Several trajectories get numbered headers and a ``---`` in the output."""
        failures = [
            Trajectory("input1", "output1", 0.4, "bad", "prompt"),
            Trajectory("input2", "output2", 0.2, "worse", "prompt"),
        ]
        rendered = DSPyProposerAdapter._format_failures(failures)

        expected_fragments = ("# Example 1", "# Example 2", "---", "input1", "input2")
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_format_failures_empty_list(self) -> None:
        """No trajectories renders as the empty string."""
        rendered = DSPyProposerAdapter._format_failures([])
        assert rendered == ""
|
|
|
|
|
|
# --- Synthetic Adapter ---
|
|
|
|
|
|
class TestDSPySyntheticAdapter:
    """Exercise DSPySyntheticAdapter.generate_inputs() with the generator mocked out."""

    @pytest.fixture
    def mock_lm(self) -> MagicMock:
        """Provide a stand-in language model constrained to the dspy.LM spec."""
        return MagicMock(spec=dspy.LM)

    @pytest.fixture
    def adapter(self, mock_lm: MagicMock) -> DSPySyntheticAdapter:
        """Build the adapter under test around the mocked language model."""
        return DSPySyntheticAdapter(lm=mock_lm)

    def test_generate_inputs_returns_examples(
        self, adapter: DSPySyntheticAdapter, mock_lm: MagicMock
    ) -> None:
        """Generated strings come back as SyntheticExample objects with 0-based ids."""
        generated = MagicMock(examples=["What is AI?", "Explain ML.", "What is NLP?"])
        adapter._generator = MagicMock(return_value=generated)

        examples = adapter.generate_inputs("AI task", 3)

        assert len(examples) == 3
        assert all(isinstance(item, SyntheticExample) for item in examples)
        first, second = examples[0], examples[1]
        assert first.input_text == "What is AI?"
        assert first.id == 0
        assert second.id == 1

    def test_generate_inputs_truncates_to_n(
        self, adapter: DSPySyntheticAdapter, mock_lm: MagicMock
    ) -> None:
        """When the generator yields five items but three are requested, three return."""
        generated = MagicMock(examples=["q1", "q2", "q3", "q4", "q5"])
        adapter._generator = MagicMock(return_value=generated)

        examples = adapter.generate_inputs("task", 3)

        assert len(examples) == 3

    def test_generate_inputs_passes_correct_args(
        self, adapter: DSPySyntheticAdapter, mock_lm: MagicMock
    ) -> None:
        """The generator is called once with the task description and the count."""
        adapter._generator = MagicMock(return_value=MagicMock(examples=["q1"]))

        adapter.generate_inputs("my task", 5)

        expected_kwargs = {"task_description": "my task", "n_examples": 5}
        adapter._generator.assert_called_once_with(**expected_kwargs)

    def test_generate_inputs_empty_list(
        self, adapter: DSPySyntheticAdapter, mock_lm: MagicMock
    ) -> None:
        """An empty generator result with a zero request yields an empty list."""
        adapter._generator = MagicMock(return_value=MagicMock(examples=[]))

        examples = adapter.generate_inputs("task", 0)

        assert examples == []
|