- Clean architecture (domain/application/infrastructure) - DSPy-based evolution engine with scoring - CLI via pyproject.toml entry point - Unit + integration tests (~300 tests) - Configs for glm-5.1 and glm-4.5-air models - Z.AI endpoint integration
80 lines
2.4 KiB
Python
80 lines
2.4 KiB
Python
"""
|
|
DSPy Signatures — declarative LLM contracts.

Defines WHAT each LLM call does, not HOW.
DSPy Signature = input_fields → output_fields + instruction.
DSPy handles prompting, parsing, and structuring.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import dspy
|
|
|
|
|
|
class GenerateSyntheticInputs(dspy.Signature):
    """Generate diverse, realistic input examples for a given task."""

    # NOTE(review): the class docstring and every `desc=` string below are
    # presumably consumed by DSPy as prompt text (instruction / field hints),
    # so they are runtime data — keep them verbatim when refactoring.
    # Field declaration order is preserved on purpose.

    # --- inputs: what the task is, and how many samples are requested ---
    task_description: str = dspy.InputField(
        desc="Description of the task the prompt should accomplish.",
    )
    n_examples: int = dspy.InputField(desc="Number of examples to generate.")

    # --- output: annotated as a plain string; the description asks the
    # model to format it as a JSON array of strings ---
    examples: str = dspy.OutputField(
        desc="A JSON array of strings, each being a realistic input "
        "for the task. Cover: normal cases, edge cases, long inputs, "
        "short inputs, ambiguous cases, and tricky scenarios.",
    )
|
|
|
|
|
|
class JudgeOutput(dspy.Signature):
    """Evaluate the quality of an LLM output for a given task and input.

    Score: 0.0 (completely wrong) to 1.0 (perfect).
    Feedback: specific, actionable criticism.
    """

    # NOTE(review): the class docstring and every `desc=` string below are
    # presumably consumed by DSPy as prompt text (instruction / field hints),
    # so they are runtime data — keep them verbatim when refactoring.
    # Field declaration order is preserved on purpose.

    # --- inputs: the task, the input given, and the response under review ---
    task_description: str = dspy.InputField(
        desc="What the assistant is supposed to do.",
    )
    input_text: str = dspy.InputField(desc="The input provided to the assistant.")
    output_text: str = dspy.InputField(desc="The assistant's response to evaluate.")

    # --- outputs: a float score plus free-text criticism ---
    # The 0.0-1.0 range is stated only in the description text; nothing in
    # this declaration enforces it.
    score: float = dspy.OutputField(
        desc="Quality score from 0.0 (wrong) to 1.0 (perfect).",
    )
    feedback: str = dspy.OutputField(
        desc="Specific, actionable feedback explaining what's wrong "
        "with the output and how to improve it. Be critical.",
    )
|
|
|
|
|
|
class ProposeInstruction(dspy.Signature):
    """Given a current prompt and examples of where it fails with feedback,
    propose an improved version of the prompt.

    The new prompt should address all the issues identified in the feedback.
    """

    # NOTE(review): the class docstring and every `desc=` string below are
    # presumably consumed by DSPy as prompt text (instruction / field hints),
    # so they are runtime data — keep them verbatim when refactoring.
    # Field declaration order is preserved on purpose.

    # --- inputs: the prompt being revised, the task, and observed failures ---
    current_instruction: str = dspy.InputField(
        desc="The current prompt/instruction to improve.",
    )
    task_description: str = dspy.InputField(desc="Description of the task.")
    # Failure cases arrive as one string; its exact layout is not defined
    # in this block.
    failure_examples: str = dspy.InputField(
        desc="Examples of inputs, outputs, scores, and feedback "
        "showing where the current instruction fails.",
    )

    # --- output: the rewritten prompt ---
    new_instruction: str = dspy.OutputField(
        desc="An improved version of the instruction.",
    )
|