Add configurable judge rubrics and multi-dimensional scoring with
weighted aggregation. New config fields: judge_criteria (free text)
and judge_dimensions (list of {name, weight, description}). CLI
--judge-criteria flag provides quick overrides. The judge adapter
computes weighted aggregate scores and enriches feedback with
per-dimension breakdowns.
Co-Authored-By: Paperclip <noreply@paperclip.ing>
214 lines
7.5 KiB
Python
214 lines
7.5 KiB
Python
"""Unit tests for multi-model adapter configuration.

Verifies that each adapter uses its own dspy.LM instance and
that per-model api_base/api_key_env overrides are wired correctly.
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import dspy
|
|
import pytest
|
|
|
|
from prometheus.domain.entities import Prompt, SyntheticExample, Trajectory
|
|
from prometheus.infrastructure.judge_adapter import DSPyJudgeAdapter
|
|
from prometheus.infrastructure.llm_adapter import DSPyLLMAdapter
|
|
from prometheus.infrastructure.proposer_adapter import DSPyProposerAdapter
|
|
from prometheus.infrastructure.synth_adapter import DSPySyntheticAdapter
|
|
|
|
|
|
@pytest.fixture
def task_lm() -> dspy.LM:
    """Provide a DummyLM scripted with a single task-execution answer."""
    scripted_answers = [{"output": "task model output"}]
    return dspy.utils.DummyLM(scripted_answers)
|
|
|
|
|
|
@pytest.fixture
def judge_lm() -> dspy.LM:
    """Provide a DummyLM scripted with one judge verdict.

    The verdict carries a ``reasoning`` entry because ChainOfThought
    requires that field.
    """
    verdict = {
        "reasoning": "Evaluating output.",
        "score": "0.8",
        "feedback": "Good response.",
        "dimension_scores": "{}",
    }
    return dspy.utils.DummyLM([verdict])
|
|
|
|
|
|
@pytest.fixture
def proposer_lm() -> dspy.LM:
    """Provide a DummyLM scripted with one proposer answer.

    The answer carries a ``reasoning`` entry because ChainOfThought
    requires that field.
    """
    proposal = {
        "reasoning": "Analyzing failures.",
        "new_instruction": "Improved prompt: be more specific.",
    }
    return dspy.utils.DummyLM([proposal])
|
|
|
|
|
|
@pytest.fixture
def synth_lm() -> dspy.LM:
    """Provide a DummyLM scripted with one synthetic-generation answer.

    The ``examples`` entry is a JSON-encoded list of three inputs; the
    ``reasoning`` entry is there because ChainOfThought requires it.
    """
    encoded_inputs = json.dumps(["input 1", "input 2", "input 3"])
    generation = {"reasoning": "Generating examples.", "examples": encoded_inputs}
    return dspy.utils.DummyLM([generation])
|
|
|
|
|
|
class TestDSPyLLMAdapterOwnLM:
    """Regression tests for bug #2: DSPyLLMAdapter must use the LM it receives, not the global one.

    NOTE(review): both tests call ``dspy.configure`` and nothing visible here
    resets it afterwards -- confirm that isolation between tests is handled
    elsewhere (e.g. a conftest fixture).
    """

    @pytest.mark.asyncio
    async def test_uses_provided_lm_not_global(self) -> None:
        """The reply is the adapter LM's answer even though another LM is configured globally."""
        adapter_lm = dspy.utils.DummyLM([{"output": "local response"}])
        ambient_lm = dspy.utils.DummyLM([{"output": "global response"}])
        dspy.configure(lm=ambient_lm)

        llm = DSPyLLMAdapter(lm=adapter_lm)
        prompt = Prompt(text="test")
        reply = await llm.execute(prompt, "input")

        assert reply == "local response"

    @pytest.mark.asyncio
    async def test_does_not_affect_global_lm(self) -> None:
        """Executing through the adapter leaves the globally configured LM in place."""
        adapter_lm = dspy.utils.DummyLM([{"output": "local response"}])
        ambient_lm = dspy.utils.DummyLM([{"output": "global response"}])
        dspy.configure(lm=ambient_lm)

        llm = DSPyLLMAdapter(lm=adapter_lm)
        prompt = Prompt(text="test")
        await llm.execute(prompt, "input")

        # Identity check: the very same object must still be configured.
        configured = dspy.settings.lm
        assert configured is ambient_lm
|
|
|
|
|
|
class TestDSPyJudgeAdapterOwnLM:
    """DSPyJudgeAdapter must use its own LM instance."""

    @pytest.mark.asyncio
    async def test_uses_provided_lm(self, judge_lm: dspy.LM) -> None:
        """Judging one pair returns the score and feedback scripted in ``judge_lm``."""
        adapter = DSPyJudgeAdapter(lm=judge_lm)
        results = await adapter.judge_batch(
            task_description="Test task",
            pairs=[("input 1", "output 1")],
        )

        assert len(results) == 1
        score, feedback = results[0]
        # The scripted score is text ("0.8") that the adapter turns into a
        # number (and may aggregate), so compare with a tolerance instead of
        # exact float equality.
        assert score == pytest.approx(0.8)
        assert feedback == "Good response."

    @pytest.mark.asyncio
    async def test_does_not_use_global_lm(self) -> None:
        """The score comes from the adapter's LM, not from the globally configured one."""
        judge_lm = dspy.utils.DummyLM(
            [{"reasoning": "ok", "score": "0.9", "feedback": "Judge-specific response", "dimension_scores": "{}"}]
        )
        global_lm = dspy.utils.DummyLM(
            [{"reasoning": "no", "score": "0.1", "feedback": "Wrong LM!", "dimension_scores": "{}"}]
        )
        dspy.configure(lm=global_lm)

        adapter = DSPyJudgeAdapter(lm=judge_lm)
        results = await adapter.judge_batch("task", [("in", "out")])

        # 0.9 is the judge LM's score; the global LM would have produced 0.1.
        # Tolerant comparison for the same reason as in test_uses_provided_lm.
        assert results[0][0] == pytest.approx(0.9)
|
|
|
|
|
|
class TestDSPyProposerAdapterOwnLM:
    """The proposer adapter has to run on the LM handed to it."""

    @pytest.mark.asyncio
    async def test_uses_provided_lm(self, proposer_lm: dspy.LM) -> None:
        """Proposing from one low-scoring trajectory returns the instruction scripted in ``proposer_lm``."""
        proposer = DSPyProposerAdapter(lm=proposer_lm)
        failing_run = Trajectory(
            input_text="test input",
            output_text="test output",
            score=0.3,
            feedback="bad",
            prompt_used="old prompt",
        )
        seed = Prompt(text="old prompt")

        proposal = await proposer.propose(
            current_prompt=seed,
            trajectories=[failing_run],
            task_description="Test task",
        )

        assert "Improved prompt" in proposal.text

    @pytest.mark.asyncio
    async def test_does_not_use_global_lm(self) -> None:
        """The proposed text is the adapter LM's answer, not the globally configured LM's."""
        own_script = [{"reasoning": "ok", "new_instruction": "proposer-specific"}]
        ambient_script = [{"reasoning": "no", "new_instruction": "wrong-global"}]
        own_lm = dspy.utils.DummyLM(own_script)
        ambient_lm = dspy.utils.DummyLM(ambient_script)
        dspy.configure(lm=ambient_lm)

        proposer = DSPyProposerAdapter(lm=own_lm)
        seed = Prompt(text="test")
        proposal = await proposer.propose(
            current_prompt=seed,
            trajectories=[],
            task_description="task",
        )

        assert proposal.text == "proposer-specific"
|
|
|
|
|
|
class TestDSPySyntheticAdapterOwnLM:
    """The synthetic-data adapter has to run on the LM handed to it."""

    def test_uses_provided_lm(self, synth_lm: dspy.LM) -> None:
        """Requesting three inputs yields three ``SyntheticExample`` objects."""
        generator = DSPySyntheticAdapter(lm=synth_lm)
        generated = generator.generate_inputs("Test task", 3)

        assert len(generated) == 3
        assert all(isinstance(item, SyntheticExample) for item in generated)

    def test_does_not_use_global_lm(self) -> None:
        """The generated input is the adapter LM's answer, not the globally configured LM's."""
        own_script = [{"reasoning": "ok", "examples": json.dumps(["synth-specific"])}]
        ambient_script = [{"reasoning": "no", "examples": json.dumps(["wrong-global"])}]
        own_lm = dspy.utils.DummyLM(own_script)
        ambient_lm = dspy.utils.DummyLM(ambient_script)
        dspy.configure(lm=ambient_lm)

        generator = DSPySyntheticAdapter(lm=own_lm)
        generated = generator.generate_inputs("task", 1)

        first = generated[0]
        assert first.input_text == "synth-specific"
|
|
|
|
|
|
class TestPerModelOverrides:
    """Per-model ``api_base`` / ``api_key_env`` override fields on ``OptimizationConfig``."""

    @patch("prometheus.cli.commands.optimize.dspy.LM")
    def test_per_model_api_base_override(self, mock_lm_cls: MagicMock) -> None:
        """Judge-specific overrides are kept on the config; the unset task override stays ``None``."""
        # NOTE(review): the patched dspy.LM class is never asserted on in this
        # test -- only the config fields are checked. Confirm whether a call
        # assertion on the mock was intended.
        mock_lm_cls.return_value = MagicMock()

        from prometheus.application.dto import OptimizationConfig

        settings = {
            "seed_prompt": "test",
            "task_description": "test",
            "task_model": "openai/gpt-4o-mini",
            "judge_model": "openai/gpt-4o",
            "proposer_model": "openai/gpt-4o",
            "synth_model": "openai/gpt-4o",
            "judge_api_base": "https://judge.example.com/v1",
            "judge_api_key_env": "JUDGE_API_KEY",
        }
        cfg = OptimizationConfig(**settings)

        # The config must carry the judge overrides and leave the task one unset.
        assert cfg.judge_api_base == "https://judge.example.com/v1"
        assert cfg.judge_api_key_env == "JUDGE_API_KEY"
        assert cfg.task_api_base is None

    def test_config_defaults_to_none(self) -> None:
        """With only the required fields given, every per-model override is ``None``."""
        from prometheus.application.dto import OptimizationConfig

        cfg = OptimizationConfig(seed_prompt="test", task_description="test")

        for role in ("task", "judge", "proposer", "synth"):
            for suffix in ("api_base", "api_key_env"):
                assert getattr(cfg, f"{role}_{suffix}") is None
|