feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage

Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes two integration tests that broke when the codebase went async: the DSPyLLMAdapter and full-pipeline tests now properly await their coroutines. 277 tests pass (260 unit + 17 integration).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-03-29 19:13:50 +00:00
parent b9745566c8
commit a5bf2ad59c
43 changed files with 5007 additions and 358 deletions
--- a/tests/unit/test_config.py
+++ b/tests/unit/test_config.py
@@ -300,3 +300,33 @@ class TestConfigValidation:
        )
        assert config.max_iterations == 1
        assert config.perfect_score == 0.0
+
+
+class TestEvalConfigValidation:
+    """Tests for ground-truth evaluation config fields."""
+
+    def test_eval_defaults(self) -> None:
+        config = OptimizationConfig(seed_prompt="a", task_description="b")
+        assert config.eval_dataset_path is None
+        assert config.eval_metric == "bleu"
+
+    def test_eval_dataset_path_set(self) -> None:
+        config = OptimizationConfig(
+            seed_prompt="a", task_description="b",
+            eval_dataset_path="data.csv",
+        )
+        assert config.eval_dataset_path == "data.csv"
+
+    def test_valid_eval_metrics(self) -> None:
+        for metric in ("exact", "bleu", "rouge_l", "cosine", "llm_judge"):
+            config = OptimizationConfig(
+                seed_prompt="a", task_description="b", eval_metric=metric,
+            )
+            assert config.eval_metric == metric
+
+    def test_invalid_eval_metric_raises(self) -> None:
+        with pytest.raises(ValidationError, match="eval_metric must be one of"):
+            OptimizationConfig(
+                seed_prompt="a", task_description="b",
+                eval_metric="invalid_metric",
+            )