---
name: evaluation-metrics
description: Automatically applies when evaluating LLM performance. Ensures proper eval datasets, metrics computation, A/B testing, LLM-as-judge patterns, and experiment tracking.
category: ai-llm
---
# Evaluation Metrics for LLM Applications
When evaluating LLM performance, follow these patterns for rigorous, reproducible evaluation.
**Trigger Keywords**: evaluation, eval, metrics, benchmark, test set, A/B test, LLM judge, performance testing, accuracy, precision, recall, F1, BLEU, ROUGE, experiment tracking
**Agent Integration**: Used by `ml-system-architect`, `performance-and-cost-engineer-llm`, `llm-app-engineer`
## ✅ Correct Pattern: Evaluation Dataset
```python
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Field
from datetime import datetime
import json


class EvalExample(BaseModel):
    """Single evaluation example."""
    id: str
    input: str
    expected_output: str
    metadata: Dict[str, Any] = Field(default_factory=dict)
    tags: List[str] = Field(default_factory=list)


class EvalDataset(BaseModel):
    """Evaluation dataset with metadata."""
    name: str
    description: str
    version: str
    created_at: datetime = Field(default_factory=datetime.utcnow)
    examples: List[EvalExample]

    def save(self, path: str):
        """Save dataset to JSON file."""
        with open(path, "w") as f:
            json.dump(self.model_dump(), f, indent=2, default=str)

    @classmethod
    def load(cls, path: str) -> "EvalDataset":
        """Load dataset from JSON file."""
        with open(path) as f:
            data = json.load(f)
        return cls(**data)

    def filter_by_tag(self, tag: str) -> "EvalDataset":
        """Filter dataset by tag."""
        filtered = [ex for ex in self.examples if tag in ex.tags]
        return EvalDataset(
            name=f"{self.name}_{tag}",
            description=f"Filtered by tag: {tag}",
            version=self.version,
            examples=filtered
        )


# Create evaluation dataset
eval_dataset = EvalDataset(
    name="summarization_eval",
    description="Evaluation set for document summarization",
    version="1.0",
    examples=[
        EvalExample(
            id="sum_001",
            input="Long document text...",
            expected_output="Concise summary...",
            tags=["short", "technical"]
        ),
        EvalExample(
            id="sum_002",
            input="Another document...",
            expected_output="Another summary...",
            tags=["long", "business"]
        )
    ]
)
eval_dataset.save("eval_data/summarization_v1.json")
```
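Because `filter_by_tag` returns a full `EvalDataset`, per-tag slices can be scored separately, which supports the "evaluate on diverse examples" item in the checklist below. A brief illustrative usage:
```python
# Illustrative: reload the saved dataset and report per-tag slices for separate scoring.
dataset = EvalDataset.load("eval_data/summarization_v1.json")
for tag in ["short", "long", "technical", "business"]:
    subset = dataset.filter_by_tag(tag)
    if subset.examples:
        print(f"{tag}: {len(subset.examples)} examples")
        # Score `subset` with the metrics defined in the next section.
```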
## Evaluation Metrics
```python
from typing import Dict, List, Protocol

import numpy as np


class Metric(Protocol):
    """Protocol for evaluation metrics."""

    def compute(
        self,
        predictions: List[str],
        references: List[str]
    ) -> float:
        """Compute metric score."""
        ...


class ExactMatch:
    """Exact match metric (case-insensitive)."""

    def compute(
        self,
        predictions: List[str],
        references: List[str]
    ) -> float:
        """
        Compute exact match accuracy.

        Returns:
            Fraction of exact matches (0-1)
        """
        matches = sum(
            p.strip().lower() == r.strip().lower()
            for p, r in zip(predictions, references)
        )
        return matches / len(predictions)


class TokenOverlap:
    """Token overlap metric (precision, recall, F1)."""

    def tokenize(self, text: str) -> set:
        """Simple whitespace tokenization."""
        return set(text.lower().split())

    def compute_f1(
        self,
        prediction: str,
        reference: str
    ) -> Dict[str, float]:
        """
        Compute precision, recall, F1 for a single example.

        Returns:
            Dict with precision, recall, f1 scores
        """
        pred_tokens = self.tokenize(prediction)
        ref_tokens = self.tokenize(reference)
        if not pred_tokens or not ref_tokens:
            return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

        overlap = pred_tokens & ref_tokens
        precision = len(overlap) / len(pred_tokens)
        recall = len(overlap) / len(ref_tokens)
        if precision + recall == 0:
            f1 = 0.0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        return {
            "precision": precision,
            "recall": recall,
            "f1": f1
        }

    def compute(
        self,
        predictions: List[str],
        references: List[str]
    ) -> Dict[str, float]:
        """
        Compute average metrics across all examples.

        Returns:
            Dict with average precision, recall, f1
        """
        scores = [
            self.compute_f1(p, r)
            for p, r in zip(predictions, references)
        ]
        return {
            "precision": np.mean([s["precision"] for s in scores]),
            "recall": np.mean([s["recall"] for s in scores]),
            "f1": np.mean([s["f1"] for s in scores])
        }


class SemanticSimilarity:
    """Semantic similarity using embeddings."""

    def __init__(self, embedding_model):
        self.embedding_model = embedding_model

    async def compute(
        self,
        predictions: List[str],
        references: List[str]
    ) -> float:
        """
        Compute average cosine similarity.

        Returns:
            Average similarity score (0-1)
        """
        # Embed predictions and references
        pred_embeddings = await self.embedding_model.embed(predictions)
        ref_embeddings = await self.embedding_model.embed(references)

        # Compute cosine similarities
        similarities = []
        for pred_emb, ref_emb in zip(pred_embeddings, ref_embeddings):
            similarity = np.dot(pred_emb, ref_emb) / (
                np.linalg.norm(pred_emb) * np.linalg.norm(ref_emb)
            )
            similarities.append(similarity)
        return float(np.mean(similarities))


# Usage
exact_match = ExactMatch()
token_overlap = TokenOverlap()

predictions = ["The cat sat on mat", "Python is great"]
references = ["The cat sat on the mat", "Python is awesome"]

em_score = exact_match.compute(predictions, references)
overlap_scores = token_overlap.compute(predictions, references)

print(f"Exact Match: {em_score:.2f}")
print(f"F1 Score: {overlap_scores['f1']:.2f}")
```
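The trigger keywords also list BLEU and ROUGE; for generation tasks these are usually taken from existing packages rather than re-implemented. A minimal sketch, assuming the third-party `sacrebleu` and `rouge-score` packages are installed (verify option names against the versions you use):
```python
# Sketch: corpus BLEU and ROUGE via third-party packages
# (assumed installed: `pip install sacrebleu rouge-score`).
import sacrebleu
from rouge_score import rouge_scorer

predictions = ["The cat sat on mat", "Python is great"]
references = ["The cat sat on the mat", "Python is awesome"]

# Corpus-level BLEU: sacrebleu takes a list of hypotheses and a list of reference lists.
bleu = sacrebleu.corpus_bleu(predictions, [references])
print(f"BLEU: {bleu.score:.2f}")

# ROUGE is computed per example; average the F-measures across the dataset.
scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
rouge_l = [
    scorer.score(ref, pred)["rougeL"].fmeasure
    for pred, ref in zip(predictions, references)
]
print(f"ROUGE-L (mean F1): {sum(rouge_l) / len(rouge_l):.2f}")
```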
## LLM-as-Judge Evaluation
```python
import asyncio
import json
from typing import Any, Dict, List, Optional


class LLMJudge:
    """Use an LLM to evaluate outputs."""

    def __init__(self, llm_client):
        self.llm = llm_client

    async def judge_single(
        self,
        input: str,
        prediction: str,
        reference: Optional[str] = None,
        criteria: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """
        Evaluate a single prediction using an LLM.

        Args:
            input: Original input
            prediction: Model prediction
            reference: Optional reference answer
            criteria: Evaluation criteria

        Returns:
            Dict with score and reasoning
        """
        criteria = criteria or [
            "accuracy",
            "relevance",
            "completeness",
            "clarity"
        ]
        prompt = self._build_judge_prompt(
            input, prediction, reference, criteria
        )
        response = await self.llm.complete(prompt, temperature=0.0)

        # Parse response (expects JSON)
        try:
            return json.loads(response)
        except json.JSONDecodeError:
            return {
                "score": 0,
                "reasoning": "Failed to parse response",
                "raw_response": response
            }

    def _build_judge_prompt(
        self,
        input: str,
        prediction: str,
        reference: Optional[str],
        criteria: List[str]
    ) -> str:
        """Build prompt for LLM judge."""
        criteria_str = ", ".join(criteria)
        prompt = f"""Evaluate this model output on: {criteria_str}

Input:
{input}

Model Output:
{prediction}"""
        if reference:
            prompt += f"""

Reference Answer:
{reference}"""
        prompt += """

Provide evaluation as JSON:
{
  "score": <1-10>,
  "reasoning": "<explanation>",
  "criteria_scores": {
    "accuracy": <1-10>,
    "relevance": <1-10>,
    ...
  }
}"""
        return prompt

    async def batch_judge(
        self,
        examples: List[Dict[str, str]],
        criteria: Optional[List[str]] = None
    ) -> List[Dict[str, Any]]:
        """
        Judge multiple examples in batch.

        Args:
            examples: List of dicts with input, prediction, reference
            criteria: Evaluation criteria

        Returns:
            List of judgment results
        """
        tasks = [
            self.judge_single(
                input=ex["input"],
                prediction=ex["prediction"],
                reference=ex.get("reference"),
                criteria=criteria
            )
            for ex in examples
        ]
        return await asyncio.gather(*tasks)


# Usage
judge = LLMJudge(llm_client)

result = await judge.judge_single(
    input="What is Python?",
    prediction="Python is a programming language.",
    reference="Python is a high-level programming language.",
    criteria=["accuracy", "completeness", "clarity"]
)
print(f"Score: {result['score']}/10")
print(f"Reasoning: {result['reasoning']}")
```
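Judge models often wrap the JSON in a Markdown code fence or add commentary around it, which makes the bare `json.loads` fall back to a zero score. A small optional pre-parsing step recovers most of those cases; this helper is illustrative, not part of the skill:
```python
import json
import re
from typing import Any, Dict, Optional


def extract_json_block(response: str) -> Optional[Dict[str, Any]]:
    """Best-effort extraction of a JSON object from a judge response."""
    # Prefer a ```json ... ``` fenced block if one is present.
    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", response, re.DOTALL)
    candidate = fenced.group(1) if fenced else None
    if candidate is None:
        # Otherwise take the outermost {...} span.
        start, end = response.find("{"), response.rfind("}")
        candidate = response[start:end + 1] if start != -1 and end > start else None
    if candidate is None:
        return None
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        return None
```
Called inside `judge_single`, the failure dict is only returned when `extract_json_block(response)` yields `None`.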
## A/B Testing Framework
```python
import random
import time
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Callable, Dict, List, Optional

import numpy as np


@dataclass
class Variant:
    """A/B test variant."""
    name: str
    model_fn: Callable
    traffic_weight: float = 0.5


@dataclass
class ABTestResult:
    """Result from A/B test."""
    variant_name: str
    example_id: str
    prediction: str
    metrics: Dict[str, float]
    latency_ms: float
    timestamp: datetime


class ABTest:
    """A/B testing framework for LLM variants."""

    def __init__(
        self,
        name: str,
        variants: List[Variant],
        metrics: List[Metric]
    ):
        self.name = name
        self.variants = variants
        self.metrics = metrics
        self.results: List[ABTestResult] = []

        # Normalize weights
        total_weight = sum(v.traffic_weight for v in variants)
        for v in variants:
            v.traffic_weight /= total_weight

    def select_variant(self) -> Variant:
        """Select variant based on traffic weight."""
        r = random.random()
        cumulative = 0.0
        for variant in self.variants:
            cumulative += variant.traffic_weight
            if r <= cumulative:
                return variant
        return self.variants[-1]

    async def run_test(
        self,
        eval_dataset: EvalDataset,
        samples_per_variant: Optional[int] = None
    ) -> Dict[str, Any]:
        """
        Run A/B test on evaluation dataset.

        Args:
            eval_dataset: Evaluation dataset
            samples_per_variant: Samples per variant (None = all)

        Returns:
            Test results with metrics per variant
        """
        samples = samples_per_variant or len(eval_dataset.examples)

        # Run predictions for each variant
        for variant in self.variants:
            for example in eval_dataset.examples[:samples]:
                start = time.time()

                # Get prediction from variant
                prediction = await variant.model_fn(example.input)
                latency = (time.time() - start) * 1000

                # Compute metrics (flatten dict-valued metrics such as TokenOverlap)
                variant_metrics = {}
                for metric in self.metrics:
                    score = metric.compute([prediction], [example.expected_output])
                    metric_name = metric.__class__.__name__
                    if isinstance(score, dict):
                        for key, value in score.items():
                            variant_metrics[f"{metric_name}_{key}"] = value
                    else:
                        variant_metrics[metric_name] = score

                # Store result
                self.results.append(ABTestResult(
                    variant_name=variant.name,
                    example_id=example.id,
                    prediction=prediction,
                    metrics=variant_metrics,
                    latency_ms=latency,
                    timestamp=datetime.utcnow()
                ))
        return self.analyze_results()

    def analyze_results(self) -> Dict[str, Any]:
        """
        Analyze A/B test results.

        Returns:
            Statistics per variant
        """
        variant_stats = {}
        for variant in self.variants:
            variant_results = [
                r for r in self.results
                if r.variant_name == variant.name
            ]
            if not variant_results:
                continue

            # Aggregate metrics
            metric_names = variant_results[0].metrics.keys()
            avg_metrics = {}
            for metric_name in metric_names:
                scores = [r.metrics[metric_name] for r in variant_results]
                avg_metrics[metric_name] = {
                    "mean": np.mean(scores),
                    "std": np.std(scores),
                    "min": np.min(scores),
                    "max": np.max(scores)
                }

            # Latency stats
            latencies = [r.latency_ms for r in variant_results]
            variant_stats[variant.name] = {
                "samples": len(variant_results),
                "metrics": avg_metrics,
                "latency": {
                    "mean_ms": np.mean(latencies),
                    "p50_ms": np.percentile(latencies, 50),
                    "p95_ms": np.percentile(latencies, 95),
                    "p99_ms": np.percentile(latencies, 99)
                }
            }
        return variant_stats


# Usage
variants = [
    Variant(
        name="baseline",
        model_fn=lambda x: model_v1.complete(x),
        traffic_weight=0.5
    ),
    Variant(
        name="candidate",
        model_fn=lambda x: model_v2.complete(x),
        traffic_weight=0.5
    )
]
ab_test = ABTest(
    name="summarization_v1_vs_v2",
    variants=variants,
    metrics=[ExactMatch(), TokenOverlap()]
)
results = await ab_test.run_test(eval_dataset, samples_per_variant=100)
```
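`analyze_results` reports means and spreads but not whether the gap between variants is real. Since both variants score the same examples, a paired test on per-example scores is a reasonable first pass; a minimal sketch, assuming SciPy is installed and reusing the `ab_test.results` collected above:
```python
from collections import defaultdict

import numpy as np
from scipy import stats

# Group per-example scores for one metric by variant, keyed by example id
# so the comparison stays paired.
metric_name = "ExactMatch"
scores_by_variant = defaultdict(dict)
for r in ab_test.results:
    scores_by_variant[r.variant_name][r.example_id] = r.metrics[metric_name]

shared_ids = sorted(
    set(scores_by_variant["baseline"]) & set(scores_by_variant["candidate"])
)
baseline = [scores_by_variant["baseline"][i] for i in shared_ids]
candidate = [scores_by_variant["candidate"][i] for i in shared_ids]

# Paired t-test; swap in a bootstrap or Wilcoxon test for small or skewed samples.
t_stat, p_value = stats.ttest_rel(candidate, baseline)
print(f"{metric_name}: diff={np.mean(candidate) - np.mean(baseline):+.3f}, p={p_value:.4f}")
```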
## Experiment Tracking
```python
import json
import uuid
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional


class ExperimentTracker:
    """Track experiments and results."""

    def __init__(self, experiments_dir: str = "experiments"):
        self.experiments_dir = Path(experiments_dir)
        self.experiments_dir.mkdir(parents=True, exist_ok=True)

    def log_experiment(
        self,
        name: str,
        config: Dict[str, Any],
        metrics: Dict[str, float],
        metadata: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Log experiment configuration and results.

        Args:
            name: Experiment name
            config: Model configuration
            metrics: Evaluation metrics
            metadata: Additional metadata

        Returns:
            Experiment ID
        """
        experiment_id = str(uuid.uuid4())[:8]
        timestamp = datetime.utcnow()

        experiment = {
            "id": experiment_id,
            "name": name,
            "timestamp": timestamp.isoformat(),
            "config": config,
            "metrics": metrics,
            "metadata": metadata or {}
        }

        # Save to file
        filename = f"{timestamp.strftime('%Y%m%d_%H%M%S')}_{name}_{experiment_id}.json"
        filepath = self.experiments_dir / filename
        with open(filepath, "w") as f:
            json.dump(experiment, f, indent=2)
        return experiment_id

    def load_experiment(self, experiment_id: str) -> Optional[Dict[str, Any]]:
        """Load experiment by ID."""
        for filepath in self.experiments_dir.glob(f"*_{experiment_id}.json"):
            with open(filepath) as f:
                return json.load(f)
        return None

    def list_experiments(
        self,
        name: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """List all experiments, optionally filtered by name."""
        experiments = []
        for filepath in sorted(self.experiments_dir.glob("*.json")):
            with open(filepath) as f:
                exp = json.load(f)
            if name is None or exp["name"] == name:
                experiments.append(exp)
        return experiments

    def compare_experiments(
        self,
        experiment_ids: List[str]
    ) -> Dict[str, Any]:
        """Compare multiple experiments."""
        experiments = [
            self.load_experiment(exp_id)
            for exp_id in experiment_ids
        ]

        # Extract metrics for comparison
        comparison = {"experiments": []}
        for exp in experiments:
            if exp:
                comparison["experiments"].append({
                    "id": exp["id"],
                    "name": exp["name"],
                    "metrics": exp["metrics"]
                })
        return comparison


# Usage
tracker = ExperimentTracker()

exp_id = tracker.log_experiment(
    name="summarization_v2",
    config={
        "model": "claude-sonnet-4",
        "temperature": 0.3,
        "max_tokens": 512,
        "prompt_version": "2.0"
    },
    metrics={
        "exact_match": 0.45,
        "f1": 0.78,
        "semantic_similarity": 0.85
    },
    metadata={
        "dataset": "summarization_v1.json",
        "num_examples": 100
    }
)
print(f"Logged experiment: {exp_id}")
```
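To act on tracked runs, `list_experiments` (or `compare_experiments`) can feed a simple "pick the best run" step. A short illustrative usage, assuming every logged run records an `f1` metric:
```python
# Compare logged runs and pick the best by F1 (assumes each run logged an "f1" metric).
runs = tracker.list_experiments(name="summarization_v2")
best = max(runs, key=lambda exp: exp["metrics"].get("f1", float("-inf")))
print(f"Best run: {best['id']} (f1={best['metrics']['f1']:.2f}, config={best['config']})")
```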
## ❌ Anti-Patterns
```python
# ❌ No evaluation dataset
def test_model():
    result = model("test this")  # Single example!
    print("Works!")

# ✅ Better: Use a proper eval dataset
eval_dataset = EvalDataset.load("eval_data.json")
results = await evaluator.run(model, eval_dataset)

# ❌ Only exact match metric
score = sum(p == r for p, r in zip(preds, refs)) / len(preds)

# ✅ Better: Multiple metrics
metrics = {
    "exact_match": ExactMatch().compute(preds, refs),
    "f1": TokenOverlap().compute(preds, refs)["f1"],
    "semantic_sim": await SemanticSimilarity(embedding_model).compute(preds, refs)
}

# ❌ No experiment tracking
model_v2_score = 0.78  # Lost context!

# ✅ Better: Track all experiments
tracker.log_experiment(
    name="model_v2",
    config={"version": "2.0"},
    metrics={"f1": 0.78}
)

# ❌ Cherry-picking examples
good_examples = [ex for ex in dataset if model(ex) == expected]

# ✅ Better: Use the full representative dataset
results = evaluate_on_full_dataset(model, dataset)
```
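The `evaluator.run(model, eval_dataset)` call above is shorthand; no evaluator class is defined in this skill. A minimal sketch of what such a runner could look like, built on the `EvalDataset` and metric classes defined earlier (the class name and interface are illustrative assumptions):
```python
from typing import Any, Callable, Dict, List


class Evaluator:
    """Illustrative runner: generate predictions, then score them with every metric."""

    def __init__(self, metrics: List[Any]):
        self.metrics = metrics

    async def run(
        self,
        model_fn: Callable,
        eval_dataset: "EvalDataset"
    ) -> Dict[str, Any]:
        predictions = [await model_fn(ex.input) for ex in eval_dataset.examples]
        references = [ex.expected_output for ex in eval_dataset.examples]

        scores: Dict[str, Any] = {}
        for metric in self.metrics:
            scores[metric.__class__.__name__] = metric.compute(predictions, references)
        return {
            "dataset": eval_dataset.name,
            "num_examples": len(predictions),
            "scores": scores,
        }


# Usage (model_fn is any async callable mapping input text to output text)
evaluator = Evaluator(metrics=[ExactMatch(), TokenOverlap()])
results = await evaluator.run(model_fn, eval_dataset)
```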
## Best Practices Checklist
- ✅ Create representative evaluation datasets
- ✅ Version control eval datasets
- ✅ Use multiple complementary metrics
- ✅ Include LLM-as-judge for qualitative evaluation
- ✅ Run A/B tests for variant comparison
- ✅ Track all experiments with config and metrics
- ✅ Measure latency alongside quality metrics
- ✅ Use statistical significance testing
- ✅ Evaluate on diverse examples (easy, medium, hard)
- ✅ Include edge cases and adversarial examples
- ✅ Document evaluation methodology
- ✅ Set up automated evaluation in CI/CD
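For the CI/CD item above, a small regression gate in pytest is often enough to start. A minimal sketch, assuming `pytest` and `pytest-asyncio` are installed, a `run_model` helper that produces predictions, and thresholds taken from a previously tracked baseline (all of these names and numbers are assumptions):
```python
# test_eval_regression.py — illustrative CI gate; names and thresholds are assumptions.
# EvalDataset, ExactMatch, and TokenOverlap are imported from your eval package.
import pytest


@pytest.mark.asyncio
async def test_summarization_quality_gate():
    dataset = EvalDataset.load("eval_data/summarization_v1.json")
    predictions = [await run_model(ex.input) for ex in dataset.examples]
    references = [ex.expected_output for ex in dataset.examples]

    f1 = TokenOverlap().compute(predictions, references)["f1"]
    em = ExactMatch().compute(predictions, references)

    # Fail the build if quality regresses below the tracked baseline.
    assert f1 >= 0.75, f"F1 regressed: {f1:.2f}"
    assert em >= 0.40, f"Exact match regressed: {em:.2f}"
```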
## Auto-Apply
When evaluating LLM systems:
1. Create EvalDataset with representative examples
2. Compute multiple metrics (exact match, F1, semantic similarity)
3. Use LLM-as-judge for qualitative assessment
4. Run A/B tests comparing variants
5. Track experiments with ExperimentTracker
6. Measure latency alongside quality
7. Save results for reproducibility
## Related Skills
- `prompting-patterns` - For prompt engineering
- `llm-app-architecture` - For LLM integration
- `monitoring-alerting` - For production metrics
- `model-selection` - For choosing models
- `performance-profiling` - For optimization