feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage
Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes 2 integration tests that broke when the codebase went async (DSPyLLMAdapter and full pipeline tests now properly await coroutines). 277 tests pass (260 unit + 17 integration). Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -300,3 +300,33 @@ class TestConfigValidation:
|
||||
)
|
||||
assert config.max_iterations == 1
|
||||
assert config.perfect_score == 0.0
|
||||
|
||||
|
||||
class TestEvalConfigValidation:
    """Checks on the ground-truth evaluation fields of ``OptimizationConfig``."""

    def test_eval_defaults(self) -> None:
        """Given only a seed prompt and task description, no dataset path is set
        and the evaluation metric is ``"bleu"``.
        """
        cfg = OptimizationConfig(seed_prompt="a", task_description="b")
        assert cfg.eval_dataset_path is None
        assert cfg.eval_metric == "bleu"

    def test_eval_dataset_path_set(self) -> None:
        """An explicitly supplied ``eval_dataset_path`` is stored unchanged."""
        cfg = OptimizationConfig(
            seed_prompt="a",
            task_description="b",
            eval_dataset_path="data.csv",
        )
        assert cfg.eval_dataset_path == "data.csv"

    def test_valid_eval_metrics(self) -> None:
        """Each listed metric name is accepted and read back as given."""
        accepted_names = ("exact", "bleu", "rouge_l", "cosine", "llm_judge")
        for name in accepted_names:
            cfg = OptimizationConfig(
                seed_prompt="a",
                task_description="b",
                eval_metric=name,
            )
            assert cfg.eval_metric == name

    def test_invalid_eval_metric_raises(self) -> None:
        """An unrecognised metric name raises ``ValidationError`` whose message
        matches ``eval_metric must be one of``.
        """
        expected_message = "eval_metric must be one of"
        with pytest.raises(ValidationError, match=expected_message):
            OptimizationConfig(
                seed_prompt="a",
                task_description="b",
                eval_metric="invalid_metric",
            )
|
||||
|
||||
Reference in New Issue
Block a user