feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage
Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes two integration tests that broke when the codebase went async (the DSPyLLMAdapter and full-pipeline tests now properly await coroutines). 277 tests pass (260 unit + 17 integration).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
96
tests/unit/test_scoring_extended.py
Normal file
96
tests/unit/test_scoring_extended.py
Normal file
@@ -0,0 +1,96 @@
|
||||
"""Additional unit tests for scoring edge cases."""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from prometheus.domain.entities import EvalResult, Trajectory
|
||||
from prometheus.domain.scoring import normalize_score, should_accept
|
||||
|
||||
|
||||
def _make_eval(scores: list[float]) -> EvalResult:
    """Build an ``EvalResult`` fixture from a plain list of scores.

    Each score gets an empty feedback string and one ``Trajectory`` whose
    positional arguments are ``"in<i>"``, ``"out<i>"``, the score itself,
    an empty string and the literal ``"p"``.
    """
    empty_feedbacks = ["" for _ in scores]

    trajectories = []
    for idx, score in enumerate(scores):
        trajectories.append(Trajectory(f"in{idx}", f"out{idx}", score, "", "p"))

    return EvalResult(
        scores=scores,
        feedbacks=empty_feedbacks,
        trajectories=trajectories,
    )
|
||||
|
||||
|
||||
class TestShouldAcceptEdgeCases:
    """Extended edge-case tests for should_accept."""

    def test_tiny_improvement_accepted(self) -> None:
        """A very small gain is accepted when min_improvement is not passed."""
        old = _make_eval([0.5])
        new = _make_eval([0.5001])
        assert should_accept(old, new) is True

    def test_tiny_improvement_below_threshold(self) -> None:
        """The same small gain is rejected once min_improvement exceeds it."""
        old = _make_eval([0.5])
        new = _make_eval([0.5001])
        assert should_accept(old, new, min_improvement=0.01) is False

    def test_zero_scores_equal(self) -> None:
        """Identical all-zero results are not an improvement."""
        old = _make_eval([0.0, 0.0])
        new = _make_eval([0.0, 0.0])
        assert should_accept(old, new) is False

    def test_negative_to_zero_accepted(self) -> None:
        """Scores should be [0,1] but test should_accept with edge values.

        Moving from a negative score up to zero is still an improvement,
        so the candidate is accepted.
        """
        old = _make_eval([-0.1])
        new = _make_eval([0.0])
        assert should_accept(old, new) is True

    def test_large_improvement(self) -> None:
        """Going from all zeros to all ones is accepted."""
        old = _make_eval([0.0, 0.0, 0.0])
        new = _make_eval([1.0, 1.0, 1.0])
        assert should_accept(old, new) is True

    def test_single_score_improvement(self) -> None:
        """A single-element result that improves is accepted."""
        old = _make_eval([0.4])
        new = _make_eval([0.5])
        assert should_accept(old, new) is True

    def test_min_improvement_exactly_met(self) -> None:
        """When improvement exactly equals min_improvement, still rejected (strict >)."""
        # 0.5, 0.75 and 0.25 are exactly representable in binary floating
        # point, so the gain equals the threshold exactly.  With 0.5 -> 0.7
        # and a threshold of 0.2 the difference is 0.19999999999999996,
        # which never reaches the boundary this test is meant to pin down.
        old = _make_eval([0.5])
        new = _make_eval([0.75])
        assert should_accept(old, new, min_improvement=0.25) is False

    def test_min_improvement_just_over(self) -> None:
        """A gain slightly above min_improvement is accepted."""
        old = _make_eval([0.5])
        new = _make_eval([0.7001])
        assert should_accept(old, new, min_improvement=0.2) is True
|
||||
|
||||
|
||||
class TestNormalizeScoreEdgeCases:
    """Boundary and out-of-range checks for normalize_score."""

    def test_exact_bounds(self) -> None:
        """Inputs of exactly 0.0 and 1.0 come back unchanged."""
        lower = normalize_score(0.0)
        assert lower == 0.0
        upper = normalize_score(1.0)
        assert upper == 1.0

    def test_very_large_value(self) -> None:
        """A huge positive input yields 1.0."""
        result = normalize_score(1e10)
        assert result == 1.0

    def test_very_negative_value(self) -> None:
        """A huge negative input yields 0.0."""
        result = normalize_score(-1e10)
        assert result == 0.0

    def test_custom_bounds_at_edges(self) -> None:
        """Values inside or on a custom [0, 10] range are returned as-is."""
        for value in (5.0, 0.0, 10.0):
            assert normalize_score(value, min_val=0.0, max_val=10.0) == value

    def test_negative_custom_range(self) -> None:
        """A [-5, 5] range keeps in-range values and floors anything below -5."""
        cases = [
            (0.0, 0.0),
            (-3.0, -3.0),
            (-10.0, -5.0),
        ]
        for raw, expected in cases:
            assert normalize_score(raw, min_val=-5.0, max_val=5.0) == expected

    def test_zero_span_range(self) -> None:
        """When min == max, clamps to min."""
        bound = 5.0
        for raw in (5.0, 0.0):
            assert normalize_score(raw, min_val=bound, max_val=bound) == 5.0

    def test_fractional_score(self) -> None:
        """A fractional in-range score is approximately preserved."""
        score = 0.3333
        assert normalize_score(score) == pytest.approx(score)
|
||||
Reference in New Issue
Block a user