feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage

Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes
two integration tests that broke when the codebase went async: the
DSPyLLMAdapter and full-pipeline tests now properly await coroutines.

277 tests pass (260 unit + 17 integration).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
FullStackDev
2026-03-29 19:13:50 +00:00
parent b9745566c8
commit a5bf2ad59c
43 changed files with 5007 additions and 358 deletions

View File

@@ -300,3 +300,33 @@ class TestConfigValidation:
)
assert config.max_iterations == 1
assert config.perfect_score == 0.0
class TestEvalConfigValidation:
    """Checks for the ``eval_dataset_path`` and ``eval_metric`` config fields."""

    def test_eval_defaults(self) -> None:
        """Without eval options, the path is None and the metric is ``bleu``."""
        cfg = OptimizationConfig(seed_prompt="a", task_description="b")
        assert cfg.eval_dataset_path is None
        assert cfg.eval_metric == "bleu"

    def test_eval_dataset_path_set(self) -> None:
        """An explicitly supplied dataset path is readable back unchanged."""
        cfg = OptimizationConfig(
            seed_prompt="a",
            task_description="b",
            eval_dataset_path="data.csv",
        )
        assert cfg.eval_dataset_path == "data.csv"

    def test_valid_eval_metrics(self) -> None:
        """Each metric name in the list below is accepted and stored as given."""
        accepted_metrics = ("exact", "bleu", "rouge_l", "cosine", "llm_judge")
        for metric_name in accepted_metrics:
            cfg = OptimizationConfig(
                seed_prompt="a",
                task_description="b",
                eval_metric=metric_name,
            )
            assert cfg.eval_metric == metric_name

    def test_invalid_eval_metric_raises(self) -> None:
        """An unrecognised metric name makes construction raise ValidationError."""
        rejected_kwargs = {
            "seed_prompt": "a",
            "task_description": "b",
            "eval_metric": "invalid_metric",
        }
        # The match pattern pins the error text, not just the exception type.
        with pytest.raises(ValidationError, match="eval_metric must be one of"):
            OptimizationConfig(**rejected_kwargs)