Custom Metrics

DSPy.rb’s evaluation framework lets you define custom metrics for domain-specific scenarios. While the framework ships with basic built-in metrics, you can create sophisticated evaluation logic tailored to your use cases and business requirements.

Overview

Custom metrics in DSPy.rb:

  • Proc-based Implementation: Define metrics as Ruby procedures
  • Domain-specific Logic: Create evaluation criteria specific to your use case
  • Flexible Scoring: Support for boolean, numeric, and composite scoring (see the sketch below)
  • Integration: Work seamlessly with DSPy’s evaluation and optimization systems
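
Both boolean and numeric return values are supported. A minimal sketch of the metric contract, assuming (as the weighted example below notes) that boolean returns are treated as 1.0/0.0 and per-example scores are averaged into result.score:

# Boolean metric: pass/fail per example
exact_match = ->(example, prediction) do
  prediction.answer == example.expected_answer
end

# Numeric metric: graded score between 0.0 and 1.0
keyword_overlap = ->(example, prediction) do
  expected = example.expected_answer.downcase.split
  return 0.0 if expected.empty?
  predicted = prediction.answer.downcase.split
  (expected & predicted).size.to_f / expected.size
end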

Basic Custom Metrics

Simple Custom Metric

# Define a custom accuracy metric
accuracy_metric = ->(example, prediction) do
  return false unless prediction && prediction.respond_to?(:answer)
  prediction.answer.downcase.strip == example.expected_answer.downcase.strip
end

# Use with evaluator
evaluator = DSPy::Evaluate.new(metric: accuracy_metric)

result = evaluator.evaluate(examples: test_examples) do |example|
  predictor.call(input: example.input)
end

puts "Custom accuracy: #{result.score}"

Weighted Accuracy Metric

# Metric that considers example difficulty/importance
weighted_accuracy = ->(example, prediction) do
  return false unless prediction && prediction.respond_to?(:answer)
  
  # Base correctness
  correct = prediction.answer.downcase.strip == example.expected_answer.downcase.strip
  return false unless correct
  
  # Apply weight based on example metadata
  weight = example.metadata[:difficulty] || 1.0
  
  # Return the example's weight for a correct answer; the false returned above
  # is treated as 0.0 by the evaluator, and weights above 1.0 will push the
  # aggregate score above 1.0
  weight
end

# Use in evaluation
evaluator = DSPy::Evaluate.new(metric: weighted_accuracy)

Confidence-Aware Metric

# Metric that considers prediction confidence
confidence_metric = ->(example, prediction) do
  return 0.0 unless prediction
  
  # Check if prediction has required fields
  return 0.0 unless prediction.respond_to?(:answer) && prediction.respond_to?(:confidence)
  
  # Base accuracy
  correct = prediction.answer.downcase == example.expected_answer.downcase
  return 0.0 unless correct
  
  # Bonus for high confidence on correct answers
  base_score = 1.0
  confidence_bonus = prediction.confidence > 0.8 ? 0.2 : 0.0
  
  base_score + confidence_bonus
end
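
Note that the confidence bonus can push an individual score to 1.2. If you prefer to keep scores on a 0-1 scale (see Consistent Return Values below), a simple wrapper can clamp the result:

# Clamp the combined score to the 0.0-1.0 range
capped_confidence_metric = ->(example, prediction) do
  [confidence_metric.call(example, prediction), 1.0].min
end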

Domain-Specific Metrics

Customer Service Quality Metric

customer_service_metric = ->(example, prediction) do
  return 0.0 unless prediction && prediction.respond_to?(:answer)
  
  score = 0.0
  total_criteria = 5
  
  # Criterion 1: Answers the question (mentions at least one expected keyword)
  if example.expected_keywords.any? { |keyword| prediction.answer.downcase.include?(keyword.downcase) }
    score += 1.0
  end
  
  # Criterion 2: Professional tone
  unprofessional_words = ['stupid', 'dumb', 'whatever', 'i don\'t know']
  unless unprofessional_words.any? { |word| prediction.answer.downcase.include?(word) }
    score += 1.0
  end
  
  # Criterion 3: Helpful length (not too short, not too long)
  answer_length = prediction.answer.length
  if answer_length >= 50 && answer_length <= 500
    score += 1.0
  end
  
  # Criterion 4: Empathy/politeness
  polite_words = ['please', 'thank you', 'sorry', 'understand', 'apologize']
  if polite_words.any? { |word| prediction.answer.downcase.include?(word) }
    score += 1.0
  end
  
  # Criterion 5: Actionable advice
  action_words = ['try', 'can', 'will', 'should', 'recommend', 'suggest']
  if action_words.any? { |word| prediction.answer.downcase.include?(word) }
    score += 1.0
  end
  
  # Return normalized score
  score / total_criteria
end

# Use in evaluation
evaluator = DSPy::Evaluate.new(metric: customer_service_metric)

Medical Information Accuracy Metric

medical_accuracy_metric = ->(example, prediction) do
  return 0.0 unless prediction && prediction.respond_to?(:diagnosis)
  
  score = 0.0
  
  # Primary diagnosis match
  if prediction.diagnosis.downcase == example.expected_diagnosis.downcase
    score += 0.5
  end
  
  # Symptom coverage
  predicted_symptoms = prediction.symptoms || []
  expected_symptoms = example.expected_symptoms || []
  
  if expected_symptoms.any?
    covered_symptoms = predicted_symptoms & expected_symptoms
    symptom_coverage = covered_symptoms.size.to_f / expected_symptoms.size
    score += symptom_coverage * 0.3
  end
  
  # Safety check - fail completely for dangerous advice
  dangerous_phrases = ['ignore symptoms', 'don\'t see doctor', 'definitely not serious']
  if prediction.respond_to?(:answer) &&
     dangerous_phrases.any? { |phrase| prediction.answer.downcase.include?(phrase) }
    score = 0.0
  end
  
  # Confidence appropriateness
  if prediction.respond_to?(:confidence)
    # Penalize overconfidence on uncertain cases
    if example.metadata[:uncertainty] == 'high' && prediction.confidence > 0.9
      score *= 0.7
    end
  end
  
  score
end

Financial Risk Assessment Metric

financial_risk_metric = ->(example, prediction) do
  return 0.0 unless prediction && prediction.respond_to?(:risk_level) && prediction.respond_to?(:reasoning)
  
  total_score = 0.0
  weights = {
    risk_level: 0.4,
    reasoning: 0.3,
    recommendations: 0.2,
    compliance: 0.1
  }
  
  # Risk level accuracy
  if prediction.risk_level == example.expected_risk_level
    total_score += weights[:risk_level]
  elsif [prediction.risk_level, example.expected_risk_level].include?('medium')
    total_score += weights[:risk_level] * 0.5  # Partial credit for adjacent levels
  end
  
  # Reasoning quality
  reasoning_keywords = example.expected_reasoning_keywords || []
  if reasoning_keywords.any?
    mentioned_keywords = reasoning_keywords.select do |keyword|
      prediction.reasoning.downcase.include?(keyword.downcase)
    end
    keyword_coverage = mentioned_keywords.size.to_f / reasoning_keywords.size
    total_score += weights[:reasoning] * keyword_coverage
  end
  
  # Recommendation appropriateness
  if prediction.respond_to?(:recommendations)
    appropriate_recommendations = example.expected_recommendations || []
    if appropriate_recommendations.any?
      rec_match = (prediction.recommendations & appropriate_recommendations).size.to_f / appropriate_recommendations.size
      total_score += weights[:recommendations] * rec_match
    end
  end
  
  # Compliance check
  compliant = true
  prohibited_advice = ['guaranteed returns', 'no risk', 'insider information']
  prohibited_advice.each do |phrase|
    if prediction.reasoning.downcase.include?(phrase)
      compliant = false
      break
    end
  end
  
  total_score += weights[:compliance] if compliant
  
  total_score
end

Multi-Objective Metrics

Composite Quality Metric

composite_quality_metric = ->(example, prediction) do
  return 0.0 unless prediction && prediction.respond_to?(:answer)
  
  scores = {}
  
  # Accuracy component
  accuracy = prediction.answer.downcase == example.expected_answer.downcase ? 1.0 : 0.0
  scores[:accuracy] = accuracy
  
  # Completeness component
  required_points = example.required_points || []
  covered_points = required_points.select do |point|
    prediction.answer.downcase.include?(point.downcase)
  end
  completeness = required_points.empty? ? 1.0 : covered_points.size.to_f / required_points.size
  scores[:completeness] = completeness
  
  # Conciseness component (penalize excessive length)
  ideal_length = example.ideal_length || 200
  actual_length = prediction.answer.length
  length_ratio = actual_length.to_f / ideal_length
  conciseness = length_ratio <= 1.0 ? 1.0 : (1.0 / length_ratio)
  scores[:conciseness] = conciseness
  
  # Clarity component (based on readability heuristics)
  sentences = prediction.answer.split(/[.!?]+/).reject { |s| s.strip.empty? }
  if sentences.empty?
    clarity = 0.0
  else
    avg_sentence_length = sentences.map(&:split).map(&:size).sum.to_f / sentences.size
    clarity = avg_sentence_length <= 20 ? 1.0 : (20.0 / avg_sentence_length)
  end
  scores[:clarity] = clarity
  
  # Weighted combination
  weights = {
    accuracy: 0.4,
    completeness: 0.3,
    conciseness: 0.15,
    clarity: 0.15
  }
  
  final_score = weights.map { |component, weight| scores[component] * weight }.sum
  
  # Store component scores on the prediction for later analysis
  prediction.instance_variable_set(:@component_scores, scores)
  
  final_score
end
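
Because the component scores are attached to each prediction, they can be pulled back out after evaluation for per-dimension analysis. A minimal sketch, assuming you collect the predictions returned from the evaluation block:

# Collect predictions so their component scores can be inspected afterwards
predictions = []

evaluator = DSPy::Evaluate.new(metric: composite_quality_metric)
result = evaluator.evaluate(examples: test_examples) do |example|
  prediction = predictor.call(input: example.input)
  predictions << prediction
  prediction
end

puts "Overall quality: #{(result.score * 100).round(1)}%"

component_scores = predictions.map { |p| p.instance_variable_get(:@component_scores) }.compact

unless component_scores.empty?
  %i[accuracy completeness conciseness clarity].each do |component|
    avg = component_scores.sum { |s| s[component] } / component_scores.size.to_f
    puts "  #{component}: #{(avg * 100).round(1)}%"
  end
end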

Business ROI Metric

business_roi_metric = ->(example, prediction) do
  return 0.0 unless prediction && prediction.respond_to?(:answer)
  
  # Calculate business value based on prediction quality
  base_value = 100.0  # Base value per correct prediction
  
  # Accuracy bonus
  accuracy_bonus = prediction.answer == example.expected_answer ? base_value : 0.0
  
  # Speed bonus (if prediction includes timing)
  speed_bonus = 0.0
  if prediction.respond_to?(:processing_time) && prediction.processing_time
    # Bonus for fast responses (under 2 seconds)
    speed_bonus = prediction.processing_time < 2.0 ? 25.0 : 0.0
  end
  
  # Confidence penalty for wrong answers
  confidence_penalty = 0.0
  if prediction.respond_to?(:confidence) && accuracy_bonus == 0.0
    # Penalty for being confidently wrong
    confidence_penalty = prediction.confidence > 0.8 ? -50.0 : 0.0
  end
  
  # Cost consideration: value delivered per unit of estimated cost
  estimated_cost = example.metadata[:estimated_cost] || 0.01
  total_value = accuracy_bonus + speed_bonus + confidence_penalty
  roi = total_value > 0 ? total_value / estimated_cost : 0.0
  
  # Normalize to 0-1 scale for evaluation framework
  [roi / 1000.0, 1.0].min
end

Evaluation with Custom Metrics

Using Multiple Metrics

# Define multiple metrics for comprehensive evaluation
metrics = {
  accuracy: ->(example, prediction) { 
    prediction.answer == example.expected_answer ? 1.0 : 0.0 
  },
  
  completeness: ->(example, prediction) {
    required_elements = example.required_elements || []
    return 1.0 if required_elements.empty?
    
    found_elements = required_elements.select do |element|
      prediction.answer.include?(element)
    end
    found_elements.size.to_f / required_elements.size
  },
  
  safety: ->(example, prediction) {
    unsafe_content = ['violence', 'harm', 'illegal']
    unsafe_found = unsafe_content.any? { |term| prediction.answer.downcase.include?(term) }
    unsafe_found ? 0.0 : 1.0
  }
}

# Evaluate with each metric
results = {}
metrics.each do |metric_name, metric_proc|
  evaluator = DSPy::Evaluate.new(metric: metric_proc)
  result = evaluator.evaluate(examples: test_examples) do |example|
    predictor.call(input: example.input)
  end
  results[metric_name] = result.score
end

puts "Evaluation Results:"
results.each do |metric, score|
  puts "  #{metric}: #{(score * 100).round(1)}%"
end

Detailed Result Analysis

def detailed_evaluation(predictor, test_examples)
  detailed_metric = ->(example, prediction) do
    result = {
      correct: prediction.answer == example.expected_answer,
      answer_length: prediction.answer.length,
      response_time: prediction.metadata[:response_time] || 0,
      confidence: prediction.confidence || 0
    }
    
    # Return 1.0 or 0.0 for the evaluator, but store details
    example.instance_variable_set(:@detailed_result, result)
    result[:correct] ? 1.0 : 0.0
  end
  
  evaluator = DSPy::Evaluate.new(metric: detailed_metric)
  evaluation_result = evaluator.evaluate(examples: test_examples) do |example|
    predictor.call(input: example.input)
  end
  
  # Extract detailed results
  detailed_results = test_examples.map do |example|
    example.instance_variable_get(:@detailed_result)
  end
  
  # Analyze patterns
  correct_results = detailed_results.select { |r| r[:correct] }
  incorrect_results = detailed_results.reject { |r| r[:correct] }
  
  analysis = {
    overall_accuracy: evaluation_result.score,
    avg_response_time: detailed_results.sum { |r| r[:response_time] }.to_f / detailed_results.size,
    avg_confidence_correct: correct_results.empty? ? 0 : correct_results.sum { |r| r[:confidence] }.to_f / correct_results.size,
    avg_confidence_incorrect: incorrect_results.empty? ? 0 : incorrect_results.sum { |r| r[:confidence] }.to_f / incorrect_results.size,
    avg_length_correct: correct_results.empty? ? 0 : correct_results.sum { |r| r[:answer_length] }.to_f / correct_results.size,
    avg_length_incorrect: incorrect_results.empty? ? 0 : incorrect_results.sum { |r| r[:answer_length] }.to_f / incorrect_results.size
  }
  
  analysis
end

# Usage
analysis = detailed_evaluation(predictor, test_examples)
puts "Detailed Analysis:"
puts "  Overall Accuracy: #{(analysis[:overall_accuracy] * 100).round(1)}%"
puts "  Avg Response Time: #{analysis[:avg_response_time].round(2)}s"
puts "  Confidence (Correct): #{analysis[:avg_confidence_correct].round(2)}"
puts "  Confidence (Incorrect): #{analysis[:avg_confidence_incorrect].round(2)}"

Integration with Optimization

Custom Metric in MIPROv2

# Use custom metric in optimization
domain_specific_metric = ->(example, prediction) do
  # Your domain-specific evaluation logic
  score = evaluate_domain_quality(example, prediction)
  score
end

optimizer = DSPy::MIPROv2.new(signature: YourSignature)

result = optimizer.optimize(examples: training_examples) do |predictor, val_examples|
  evaluator = DSPy::Evaluate.new(metric: domain_specific_metric)
  evaluation_result = evaluator.evaluate(examples: val_examples) do |example|
    predictor.call(input: example.input)
  end
  evaluation_result.score
end

puts "Optimized for domain-specific quality: #{result.best_score_value}"

Best Practices

1. Clear Scoring Logic

# Good: Clear, documented scoring
sentiment_accuracy = ->(example, prediction) do
  return 0.0 unless prediction && prediction.respond_to?(:sentiment)
  
  # Exact match gets full score
  return 1.0 if prediction.sentiment == example.expected_sentiment
  
  # Partial credit for related sentiments
  sentiment_similarity = {
    ['positive', 'very_positive'] => 0.8,
    ['negative', 'very_negative'] => 0.8,
    ['mixed', 'neutral'] => 0.6  # keys must be sorted pairs to match the lookup below
  }
  
  pair = [prediction.sentiment, example.expected_sentiment].sort
  sentiment_similarity[pair] || 0.0
end

2. Handle Edge Cases

robust_metric = ->(example, prediction) do
  # Handle nil prediction
  return 0.0 unless prediction
  
  # Handle missing fields
  return 0.0 unless prediction.respond_to?(:answer)
  
  # Handle empty responses
  return 0.0 if prediction.answer.strip.empty?
  
  # Your actual evaluation logic
  prediction.answer.downcase == example.expected_answer.downcase ? 1.0 : 0.0
end

3. Consistent Return Values

# Always return numeric values between 0 and 1
normalized_metric = ->(example, prediction) do
  raw_score = calculate_raw_score(example, prediction)
  
  # Normalize to 0-1 range
  max_possible_score = 10.0
  normalized = [raw_score / max_possible_score, 1.0].min
  [normalized, 0.0].max  # Ensure non-negative
end
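
Here calculate_raw_score stands in for whatever produces your unnormalized score. A hypothetical sketch that counts matched rubric keywords against the assumed maximum of 10 (example.metadata[:rubric_keywords] is an assumed field):

# Hypothetical raw scorer: one point per matched rubric keyword, capped at 10
def calculate_raw_score(example, prediction)
  return 0.0 unless prediction && prediction.respond_to?(:answer)

  keywords = example.metadata[:rubric_keywords] || []  # assumed metadata field
  matches = keywords.count { |kw| prediction.answer.downcase.include?(kw.downcase) }
  [matches, 10].min.to_f
end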

4. Meaningful Metrics

# Good: Metrics aligned with business goals
customer_satisfaction_metric = ->(example, prediction) do
  # Factors that actually matter for customer satisfaction
  factors = {
    solved_problem: example.solution_keywords.all? { |kw| prediction.answer.downcase.include?(kw.downcase) },
    polite_tone: !prediction.answer.match?(/\b(stupid|dumb|obviously)\b/i),
    reasonable_length: prediction.answer.length.between?(50, 300),
    actionable: prediction.answer.match?(/\b(try|can|will|contact)\b/i)
  }
  
  # Weight based on customer feedback data
  weights = { solved_problem: 0.5, polite_tone: 0.2, reasonable_length: 0.15, actionable: 0.15 }
  
  factors.map { |factor, present| present ? weights[factor] : 0 }.sum
end