feat: v0.2.0 sprint — ground truth eval, crossover/mutation, checkpointing, similarity guards, dataset loader, CLI commands, extended test coverage
Aggregates all v0.2.0 sprint work (GARAA-30 through GARAA-40) and fixes two integration tests that broke when the codebase went async (the DSPyLLMAdapter and full-pipeline tests now properly await coroutines). 277 tests pass (260 unit + 17 integration).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -59,6 +59,7 @@ class DSPyJudgeAdapter(JudgePort):
|
||||
if self._judge_dimensions
|
||||
else {}
|
||||
)
|
||||
self.call_count: int = 0
|
||||
|
||||
async def judge_batch(
|
||||
self,
|
||||
@@ -104,13 +105,15 @@ class DSPyJudgeAdapter(JudgePort):
|
||||
|
||||
def _sync_judge(self, task_description: str, input_text: str, output_text: str):
|
||||
with dspy.context(lm=self._lm):
|
||||
return self._judge(
|
||||
result = self._judge(
|
||||
task_description=task_description,
|
||||
input_text=input_text,
|
||||
output_text=output_text,
|
||||
judge_criteria=self._judge_criteria,
|
||||
dimension_names=self._dimension_names,
|
||||
)
|
||||
self.call_count += 1
|
||||
return result
|
||||
|
||||
def _aggregate_result(self, pred: Any) -> tuple[float, str]:
|
||||
"""Compute weighted aggregate score from dimension scores if available."""
|
||||
|
||||
Reference in New Issue
Block a user