Agent Skills: Evaluation Metrics

LLM evaluation frameworks, benchmarks, and quality metrics for production systems.

ID: pluginagentmarketplace/custom-plugin-ai-engineer/evaluation-metrics

Path: skills/evaluation-metrics/SKILL.md


Evaluation Metrics

Measure and improve LLM quality systematically.

Quick Start

Basic Evaluation with RAGAS

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall
)
from datasets import Dataset

# Prepare evaluation data
eval_data = {
    "question": ["What is machine learning?"],
    "answer": ["ML is a subset of AI that learns from data."],
    "contexts": [["Machine learning is a field of AI..."]],
    "ground_truth": ["Machine learning is AI that learns patterns."]
}

dataset = Dataset.from_dict(eval_data)

# Run evaluation
results = evaluate(
    dataset,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall
    ]
)

print(results)
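
The return value aggregates one score per metric. A small follow-up sketch, assuming a ragas 0.1.x-style Result object that exposes to_pandas(), for inspecting per-sample scores:

# Convert the Result to a DataFrame for per-sample inspection
df = results.to_pandas()
print(df[["faithfulness", "answer_relevancy"]])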

LangChain Evaluation

from langchain.evaluation import load_evaluator

# Criteria-based evaluation
evaluator = load_evaluator("criteria", criteria="helpfulness")

result = evaluator.evaluate_strings(
    prediction="Paris is the capital of France.",
    input="What is the capital of France?"
)

print(f"Score: {result['score']}, Reasoning: {result['reasoning']}")

Core Metrics

Text Generation Metrics

from evaluate import load
import numpy as np

class TextMetrics:
    def __init__(self):
        self.bleu = load("bleu")
        self.rouge = load("rouge")
        self.bertscore = load("bertscore")

    def evaluate(self, predictions: list, references: list) -> dict:
        metrics = {}

        # BLEU - Precision-based n-gram overlap
        bleu_result = self.bleu.compute(
            predictions=predictions,
            references=[[r] for r in references]
        )
        metrics['bleu'] = bleu_result['bleu']

        # ROUGE - Recall-based overlap
        rouge_result = self.rouge.compute(
            predictions=predictions,
            references=references
        )
        metrics['rouge1'] = rouge_result['rouge1']
        metrics['rougeL'] = rouge_result['rougeL']

        # BERTScore - Semantic similarity
        bert_result = self.bertscore.compute(
            predictions=predictions,
            references=references,
            lang="en"
        )
        metrics['bertscore_f1'] = np.mean(bert_result['f1'])

        return metrics
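
A quick usage sketch (the example strings are illustrative):

metrics = TextMetrics()
scores = metrics.evaluate(
    predictions=["The cat sat on the mat."],
    references=["A cat was sitting on the mat."]
)
print(scores)  # {'bleu': ..., 'rouge1': ..., 'rougeL': ..., 'bertscore_f1': ...}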

RAG-Specific Metrics

class RAGMetrics:
    def __init__(self, llm):
        self.llm = llm

    def faithfulness(self, answer: str, contexts: list) -> float:
        """Check if answer is supported by retrieved contexts."""
        prompt = f"""Given the following context and answer, determine if the answer
is fully supported by the context.

Context: {' '.join(contexts)}

Answer: {answer}

Score from 0 (not supported) to 1 (fully supported):"""

        response = self.llm.generate(prompt)
        return float(response.strip())

    def relevance(self, question: str, answer: str) -> float:
        """Check if answer is relevant to the question."""
        prompt = f"""Rate how relevant this answer is to the question.

Question: {question}
Answer: {answer}

Score from 0 (irrelevant) to 1 (highly relevant):"""

        response = self.llm.generate(prompt)
        return float(response.strip())

    def context_precision(self, question: str, contexts: list) -> float:
        """Check if retrieved contexts are relevant to the question."""
        if not contexts:
            return 0.0

        relevant_count = 0
        for ctx in contexts:
            prompt = f"""Is this context relevant to answering the question?

Question: {question}
Context: {ctx}

Answer Yes or No:"""
            if "yes" in self.llm.generate(prompt).lower():
                relevant_count += 1

        return relevant_count / len(contexts)

Hallucination Detection

class HallucinationDetector:
    def __init__(self, llm, knowledge_base=None):
        self.llm = llm
        self.knowledge_base = knowledge_base

    def detect(self, claim: str, source: str = None) -> dict:
        """Detect potential hallucinations in a claim."""
        results = {
            'claim': claim,
            'is_hallucination': False,
            'confidence': 0.0,
            'reason': ''
        }

        # Check against source if provided
        if source:
            prompt = f"""Determine if this claim is supported by the source.

Source: {source}
Claim: {claim}

Is the claim fully supported? Answer with:
SUPPORTED, PARTIALLY_SUPPORTED, or NOT_SUPPORTED
Reason:"""

            response = self.llm.generate(prompt)
            if "NOT_SUPPORTED" in response:
                results['is_hallucination'] = True
                results['confidence'] = 0.9
            elif "PARTIALLY" in response:
                results['confidence'] = 0.5

        # Check for self-consistency across repeated generations
        regenerations = [
            self.llm.generate(f"Verify: {claim}")
            for _ in range(3)
        ]
        consistency = self._check_consistency(regenerations)
        if consistency < 0.7:
            results['is_hallucination'] = True
            results['reason'] = 'Inconsistent across regenerations'

        return results

    def _check_consistency(self, responses: list) -> float:
        """Rough agreement score: average pairwise token overlap (Jaccard)."""
        if len(responses) < 2:
            return 1.0
        token_sets = [set(r.lower().split()) for r in responses]
        scores = []
        for i in range(len(token_sets)):
            for j in range(i + 1, len(token_sets)):
                union = token_sets[i] | token_sets[j]
                if union:
                    scores.append(len(token_sets[i] & token_sets[j]) / len(union))
        return sum(scores) / len(scores) if scores else 1.0

Benchmark Suites

MMLU (Massive Multitask Language Understanding)

from collections import defaultdict
from datasets import load_dataset

def evaluate_mmlu(model, subjects=None):
    # The "all" config merges every subject into one split with a "subject" column
    dataset = load_dataset("cais/mmlu", "all", split="test")

    correct = defaultdict(int)
    total = defaultdict(int)

    for example in dataset:
        subject = example['subject']
        if subjects and subject not in subjects:
            continue

        question = example['question']
        choices = example['choices']
        answer = example['answer']  # integer index of the correct choice

        # Format a multiple-choice prompt (A, B, C, D)
        prompt = f"{question}\n"
        for i, choice in enumerate(choices):
            prompt += f"{chr(65 + i)}. {choice}\n"
        prompt += "Answer:"

        # Take the first character of the response as the predicted letter
        response = model.generate(prompt).strip()
        predicted = response[0].upper() if response else ""

        if predicted == chr(65 + answer):
            correct[subject] += 1
        total[subject] += 1

    return {subject: correct[subject] / total[subject] for subject in total}
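
Usage sketch (subject names follow the MMLU config names; model.generate is the same assumed interface as above):

# Evaluate on a couple of subjects only
scores = evaluate_mmlu(model, subjects=["abstract_algebra", "college_biology"])
print(scores)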

HumanEval (Code Generation)

def evaluate_humaneval(model, n_samples=10):
    from human_eval.data import read_problems
    from human_eval.execution import check_correctness

    problems = read_problems()
    per_problem_pass = []

    for task_id, problem in problems.items():
        prompt = problem['prompt']

        # Generate several completions per problem
        completions = [model.generate(prompt) for _ in range(n_samples)]

        # Execution-based correctness check for each completion
        passed = [
            check_correctness(problem, completion, timeout=10.0)['passed']
            for completion in completions
        ]

        # With n samples and c passes, the unbiased pass@1 estimate is c / n
        per_problem_pass.append(sum(passed) / n_samples)

    # Average pass@1 across all problems
    return {'pass@1': sum(per_problem_pass) / len(per_problem_pass)}

Evaluation Framework

from dataclasses import dataclass
from typing import List, Dict, Callable

import numpy as np

@dataclass
class EvaluationConfig:
    metrics: List[str]
    sample_size: int = 100
    confidence_level: float = 0.95

class LLMEvaluator:
    def __init__(self, model, config: EvaluationConfig):
        self.model = model
        self.config = config
        self.metrics_registry: Dict[str, Callable] = {}

    def register_metric(self, name: str, func: Callable):
        self.metrics_registry[name] = func

    def evaluate(self, test_data: List[dict]) -> dict:
        results = {metric: [] for metric in self.config.metrics}

        for sample in test_data[:self.config.sample_size]:
            prediction = self.model.generate(sample['input'])

            for metric_name in self.config.metrics:
                metric_func = self.metrics_registry[metric_name]
                score = metric_func(
                    prediction=prediction,
                    reference=sample.get('expected'),
                    context=sample.get('context')
                )
                results[metric_name].append(score)

        # Aggregate results
        aggregated = {}
        for metric, scores in results.items():
            aggregated[metric] = {
                'mean': np.mean(scores),
                'std': np.std(scores),
                'min': np.min(scores),
                'max': np.max(scores)
            }

        return aggregated
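
A usage sketch, registering a trivial exact-match metric (the metric and test data here are illustrative):

# Register a simple exact-match metric and run the evaluator
def exact_match(prediction, reference, context=None):
    return float(prediction.strip() == (reference or "").strip())

config = EvaluationConfig(metrics=["exact_match"], sample_size=50)
evaluator = LLMEvaluator(model, config)
evaluator.register_metric("exact_match", exact_match)

report = evaluator.evaluate(test_data)  # test_data: list of {'input', 'expected', 'context'} dicts
print(report["exact_match"]["mean"])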

A/B Testing

class ABTester:
    def __init__(self, model_a, model_b, evaluator):
        self.model_a = model_a
        self.model_b = model_b
        self.evaluator = evaluator

    def run_test(self, test_data: List[dict], metric: str) -> dict:
        scores_a = []
        scores_b = []

        for sample in test_data:
            # Get predictions from both models
            pred_a = self.model_a.generate(sample['input'])
            pred_b = self.model_b.generate(sample['input'])

            # Evaluate
            score_a = self.evaluator.evaluate_single(pred_a, sample)
            score_b = self.evaluator.evaluate_single(pred_b, sample)

            scores_a.append(score_a[metric])
            scores_b.append(score_b[metric])

        # Statistical test
        from scipy import stats
        t_stat, p_value = stats.ttest_rel(scores_a, scores_b)

        return {
            'model_a_mean': np.mean(scores_a),
            'model_b_mean': np.mean(scores_b),
            'improvement': (np.mean(scores_b) - np.mean(scores_a)) / np.mean(scores_a),
            'p_value': p_value,
            'significant': p_value < 0.05
        }

Metrics Quick Reference

| Metric | Range | Best For | Interpretation |
|--------|-------|----------|----------------|
| BLEU | 0-1 | Translation | Higher = better n-gram match |
| ROUGE-L | 0-1 | Summarization | Higher = better recall |
| BERTScore | 0-1 | General | Higher = semantic similarity |
| Faithfulness | 0-1 | RAG | Higher = grounded in context |
| Perplexity | 1-∞ | Language modeling | Lower = better fluency |
| Pass@k | 0-1 | Code gen | Higher = more correct samples |
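
Perplexity appears in the table but has no example above; a minimal sketch using the Hugging Face evaluate library's perplexity metric (gpt2 as the scoring model is an arbitrary choice):

from evaluate import load

# Perplexity of candidate texts under a small causal LM (lower = more fluent)
perplexity = load("perplexity", module_type="metric")
result = perplexity.compute(
    predictions=["Machine learning is a subset of AI."],
    model_id="gpt2"
)
print(result["mean_perplexity"])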

Best Practices

  1. Use multiple metrics: No single metric captures everything
  2. Human evaluation: Ground truth for subjective quality
  3. Domain-specific metrics: Customize for your use case
  4. Regular benchmarking: Track quality over time
  5. Statistical significance: Use proper sample sizes and report uncertainty (see the bootstrap sketch below)
  6. Version everything: Models, prompts, and test data
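
A minimal sketch of a bootstrap confidence interval for a metric mean, assuming per-sample scores have already been collected:

import numpy as np

def bootstrap_ci(scores, n_boot=10_000, confidence=0.95):
    """Percentile bootstrap CI for the mean of per-sample metric scores."""
    scores = np.asarray(scores)
    rng = np.random.default_rng(0)
    means = [
        rng.choice(scores, size=len(scores), replace=True).mean()
        for _ in range(n_boot)
    ]
    lower = np.percentile(means, (1 - confidence) / 2 * 100)
    upper = np.percentile(means, (1 + confidence) / 2 * 100)
    return lower, upper

# Example: 95% CI around a faithfulness mean
low, high = bootstrap_ci([0.8, 0.9, 0.7, 1.0, 0.85])
print(f"95% CI: [{low:.2f}, {high:.2f}]")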

Error Handling & Retry

from tenacity import retry, stop_after_attempt, wait_exponential

@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10))
def evaluate_with_retry(model_output, reference):
    return evaluator.evaluate(model_output, reference)  # evaluator defined elsewhere

def batch_evaluate(samples, batch_size=50):
    """samples: iterable of (model_output, reference) pairs."""
    results = []
    for i in range(0, len(samples), batch_size):
        batch = samples[i:i + batch_size]
        results.extend([evaluate_with_retry(output, reference) for output, reference in batch])
    return results

Troubleshooting

| Symptom | Cause | Solution |
|---------|-------|----------|
| Inconsistent scores | High temperature | Set temperature=0 for the evaluator LLM |
| Slow evaluation | No batching | Batch evaluations |
| Missing metrics | Wrong data format | Check the data schema |

Unit Test Template

def test_faithfulness_metric():
    # evaluate_faithfulness is a placeholder for whichever faithfulness implementation you wire in
    score = evaluate_faithfulness("Answer", ["Context"])
    assert 0 <= score <= 1