- Clean architecture (domain/application/infrastructure) - DSPy-based evolution engine with scoring - CLI via pyproject.toml entry point - Unit + integration tests (~300 tests) - Configs for glm-5.1 and glm-4.5-air models - Z.AI endpoint integration
199 lines
6.4 KiB
Python
199 lines
6.4 KiB
Python
"""Unit tests for DSPy module parsing logic."""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import dspy
|
|
import pytest
|
|
|
|
from prometheus.infrastructure.dspy_modules import (
|
|
InstructionProposer,
|
|
OutputJudge,
|
|
SyntheticInputGenerator,
|
|
)
|
|
|
|
|
|
class TestSyntheticInputGeneratorParseFallback:
    """Exercise ``SyntheticInputGenerator._parse_fallback`` on raw text.

    Every case calls the helper directly on the class (no instance is
    created) and compares the returned list against a literal expectation.
    """

    def test_extracts_quoted_strings(self) -> None:
        """Two double-quoted fragments are expected back, in order."""
        recovered = SyntheticInputGenerator._parse_fallback(
            'Here are some: "first example" and "second example" done.'
        )
        expected = ["first example", "second example"]
        assert recovered == expected

    def test_single_quoted_string(self) -> None:
        """A lone double-quoted fragment is expected as a one-item list."""
        recovered = SyntheticInputGenerator._parse_fallback(
            'Just one: "hello world"'
        )
        expected = ["hello world"]
        assert recovered == expected

    def test_no_quotes_returns_raw_text(self) -> None:
        """Text without quotes is expected back whole, wrapped in a list."""
        unquoted = "no quotes at all here"
        recovered = SyntheticInputGenerator._parse_fallback(unquoted)
        assert recovered == ["no quotes at all here"]

    def test_empty_string_returns_itself(self) -> None:
        """The empty string is expected back as a list holding ``""``."""
        recovered = SyntheticInputGenerator._parse_fallback("")
        assert recovered == [""]

    def test_mixed_json_with_extra_text(self) -> None:
        """Quoted items surrounded by other prose are all expected back."""
        noisy = 'Results: "alpha", "beta", "gamma" — take your pick.'
        recovered = SyntheticInputGenerator._parse_fallback(noisy)
        expected = ["alpha", "beta", "gamma"]
        assert recovered == expected
|
|
|
|
|
|
class TestOutputJudgeForward:
    """Check the ``score`` that ``OutputJudge.forward`` reports.

    The ``judge`` attribute of the module under test is replaced by a
    ``MagicMock`` that returns a canned ``dspy.Prediction``, so each case
    only exercises how the raw score value is parsed and clamped.
    """

    @pytest.fixture
    def judge(self) -> OutputJudge:
        """Provide a new ``OutputJudge`` built with no arguments."""
        return OutputJudge()

    @staticmethod
    def _forward_with(judge: OutputJudge, raw_score: str | None, feedback: str):
        """Stub the inner judge with a canned prediction, then run ``forward``."""
        canned = dspy.Prediction(score=raw_score, feedback=feedback)
        judge.judge = MagicMock(return_value=canned)
        return judge.forward("task", "input", "output")

    def test_valid_numeric_score(self, judge: OutputJudge) -> None:
        """``"0.8"`` is expected to yield 0.8 with the feedback untouched."""
        outcome = self._forward_with(judge, "0.8", "Good output.")

        assert outcome.score == 0.8
        assert outcome.feedback == "Good output."

    def test_non_numeric_score_falls_back_to_half(
        self, judge: OutputJudge
    ) -> None:
        """A score string that is not a number is expected to give 0.5."""
        outcome = self._forward_with(judge, "not-a-number", "N/A")

        assert outcome.score == 0.5

    def test_score_clamped_to_upper_bound(self, judge: OutputJudge) -> None:
        """``"1.5"`` is expected to be reported as 1.0."""
        outcome = self._forward_with(judge, "1.5", "Great!")

        assert outcome.score == 1.0

    def test_score_clamped_to_lower_bound(self, judge: OutputJudge) -> None:
        """``"-0.3"`` is expected to be reported as 0.0."""
        outcome = self._forward_with(judge, "-0.3", "Terrible.")

        assert outcome.score == 0.0

    def test_empty_score_string_falls_back(self, judge: OutputJudge) -> None:
        """An empty score string is expected to give 0.5."""
        outcome = self._forward_with(judge, "", "No score.")

        assert outcome.score == 0.5

    def test_boundary_score_one(self, judge: OutputJudge) -> None:
        """``"1.0"`` sits on the upper boundary and is expected unchanged."""
        outcome = self._forward_with(judge, "1.0", "Perfect.")

        assert outcome.score == 1.0

    def test_boundary_score_zero(self, judge: OutputJudge) -> None:
        """``"0.0"`` sits on the lower boundary and is expected unchanged."""
        outcome = self._forward_with(judge, "0.0", "Wrong.")

        assert outcome.score == 0.0

    def test_none_score_falls_back(self, judge: OutputJudge) -> None:
        """A ``None`` score is expected to give 0.5."""
        outcome = self._forward_with(judge, None, "Missing.")

        assert outcome.score == 0.5
|
|
|
|
|
|
class TestSyntheticInputGeneratorForward:
    """Check the ``examples`` list produced by ``SyntheticInputGenerator.forward``.

    The ``generate`` attribute of the module under test is replaced by a
    ``MagicMock`` that returns a canned ``dspy.Prediction``, so each case
    only exercises how the returned ``examples`` text is parsed.
    """

    @pytest.fixture
    def generator(self) -> SyntheticInputGenerator:
        """Provide a new ``SyntheticInputGenerator`` built with no arguments."""
        return SyntheticInputGenerator()

    def test_valid_json_parsed_correctly(
        self, generator: SyntheticInputGenerator
    ) -> None:
        """A JSON array of strings is expected back as the same list."""
        payload = json.dumps(["q1", "q2", "q3"])
        canned = dspy.Prediction(examples=payload)
        generator.generate = MagicMock(return_value=canned)

        produced = generator.forward("task desc", 3)

        assert produced.examples == ["q1", "q2", "q3"]

    def test_malformed_json_triggers_fallback(
        self, generator: SyntheticInputGenerator
    ) -> None:
        """Non-JSON text is expected to yield its double-quoted fragments."""
        canned = dspy.Prediction(
            examples='Here: "fallback item" and "another one"'
        )
        generator.generate = MagicMock(return_value=canned)

        produced = generator.forward("task desc", 2)

        assert produced.examples == ["fallback item", "another one"]

    def test_empty_json_array(self, generator: SyntheticInputGenerator) -> None:
        """The text ``"[]"`` is expected to yield an empty list."""
        canned = dspy.Prediction(examples="[]")
        generator.generate = MagicMock(return_value=canned)

        produced = generator.forward("task desc", 0)

        assert produced.examples == []
|
|
|
|
|
|
class TestInstructionProposerForward:
    """Check what ``InstructionProposer.forward`` returns and how it delegates.

    The ``propose`` attribute of the module under test is replaced by a
    ``MagicMock`` that returns a canned ``dspy.Prediction``.
    """

    @pytest.fixture
    def proposer(self) -> InstructionProposer:
        """Provide a new ``InstructionProposer`` built with no arguments."""
        return InstructionProposer()

    def test_returns_new_instruction(self, proposer: InstructionProposer) -> None:
        """The stubbed ``new_instruction`` is expected on the result."""
        canned = dspy.Prediction(new_instruction="Be concise and accurate.")
        proposer.propose = MagicMock(return_value=canned)

        outcome = proposer.forward(
            "Be helpful.",
            "Answer questions.",
            "Failed: too verbose",
        )

        assert outcome.new_instruction == "Be concise and accurate."

    def test_passes_correct_arguments(
        self, proposer: InstructionProposer
    ) -> None:
        """The three positional inputs are expected to be forwarded by keyword."""
        stub = MagicMock(
            return_value=dspy.Prediction(new_instruction="improved")
        )
        proposer.propose = stub

        proposer.forward("current", "task desc", "failures")

        # ``stub`` is the very object installed as ``proposer.propose`` above.
        stub.assert_called_once_with(
            current_instruction="current",
            task_description="task desc",
            failure_examples="failures",
        )
|