---
name: evaluation-metrics
description: Automatically applies when evaluating LLM performance. Ensures proper eval datasets, metrics computation, A/B testing, LLM-as-judge patterns, and experiment tracking.
category: ai-llm
---

# Evaluation Metrics for LLM Applications

When evaluating LLM performance, follow these patterns for rigorous, reproducible evaluation.

**Trigger Keywords**: evaluation, eval, metrics, benchmark, test set, A/B test, LLM judge, performance testing, accuracy, precision, recall, F1, BLEU, ROUGE, experiment tracking

**Agent Integration**: Used by `ml-system-architect`, `performance-and-cost-engineer-llm`, `llm-app-engineer`

## ✅ Correct Pattern: Evaluation Dataset

```python
from typing import Any, Dict, List
from pydantic import BaseModel, Field
from datetime import datetime
import json


class EvalExample(BaseModel):
    """Single evaluation example."""

    id: str
    input: str
    expected_output: str
    metadata: Dict[str, Any] = Field(default_factory=dict)
    tags: List[str] = Field(default_factory=list)


class EvalDataset(BaseModel):
    """Evaluation dataset with metadata."""

    name: str
    description: str
    version: str
    created_at: datetime = Field(default_factory=datetime.utcnow)
    examples: List[EvalExample]

    def save(self, path: str):
        """Save dataset to JSON file."""
        with open(path, "w") as f:
            json.dump(self.model_dump(), f, indent=2, default=str)

    @classmethod
    def load(cls, path: str) -> "EvalDataset":
        """Load dataset from JSON file."""
        with open(path) as f:
            data = json.load(f)
        return cls(**data)

    def filter_by_tag(self, tag: str) -> "EvalDataset":
        """Filter dataset by tag."""
        filtered = [ex for ex in self.examples if tag in ex.tags]
        return EvalDataset(
            name=f"{self.name}_{tag}",
            description=f"Filtered by tag: {tag}",
            version=self.version,
            examples=filtered
        )


# Create evaluation dataset
eval_dataset = EvalDataset(
    name="summarization_eval",
    description="Evaluation set for document summarization",
    version="1.0",
    examples=[
        EvalExample(
            id="sum_001",
            input="Long document text...",
            expected_output="Concise summary...",
            tags=["short", "technical"]
        ),
        EvalExample(
            id="sum_002",
            input="Another document...",
            expected_output="Another summary...",
            tags=["long", "business"]
        )
    ]
)

eval_dataset.save("eval_data/summarization_v1.json")
```

## Evaluation Metrics

```python
from typing import Dict, List, Protocol
import numpy as np


class Metric(Protocol):
    """Protocol for evaluation metrics."""

    def compute(
        self,
        predictions: List[str],
        references: List[str]
    ) -> float:
        """Compute metric score."""
        ...


class ExactMatch:
    """Exact match metric (case-insensitive)."""

    def compute(
        self,
        predictions: List[str],
        references: List[str]
    ) -> float:
        """
        Compute exact match accuracy.

        Returns:
            Fraction of exact matches (0-1)
        """
        matches = sum(
            p.strip().lower() == r.strip().lower()
            for p, r in zip(predictions, references)
        )
        return matches / len(predictions)


class TokenOverlap:
    """Token overlap metric (precision, recall, F1)."""

    def tokenize(self, text: str) -> set:
        """Simple whitespace tokenization."""
        return set(text.lower().split())

    def compute_f1(
        self,
        prediction: str,
        reference: str
    ) -> Dict[str, float]:
        """
        Compute precision, recall, F1 for single example.

        Returns:
            Dict with precision, recall, f1 scores
        """
        pred_tokens = self.tokenize(prediction)
        ref_tokens = self.tokenize(reference)

        if not pred_tokens or not ref_tokens:
            return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

        overlap = pred_tokens & ref_tokens

        precision = len(overlap) / len(pred_tokens)
        recall = len(overlap) / len(ref_tokens)

        if precision + recall == 0:
            f1 = 0.0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)

        return {
            "precision": precision,
            "recall": recall,
            "f1": f1
        }

    def compute(
        self,
        predictions: List[str],
        references: List[str]
    ) -> Dict[str, float]:
        """
        Compute average metrics across all examples.

        Returns:
            Dict with average precision, recall, f1
        """
        scores = [
            self.compute_f1(p, r)
            for p, r in zip(predictions, references)
        ]

        return {
            "precision": np.mean([s["precision"] for s in scores]),
            "recall": np.mean([s["recall"] for s in scores]),
            "f1": np.mean([s["f1"] for s in scores])
        }


class SemanticSimilarity:
    """Semantic similarity using embeddings."""

    def __init__(self, embedding_model):
        self.embedding_model = embedding_model

    async def compute(
        self,
        predictions: List[str],
        references: List[str]
    ) -> float:
        """
        Compute average cosine similarity.

        Returns:
            Average similarity score (0-1)
        """
        # Embed predictions and references
        pred_embeddings = await self.embedding_model.embed(predictions)
        ref_embeddings = await self.embedding_model.embed(references)

        # Compute cosine similarities
        similarities = []
        for pred_emb, ref_emb in zip(pred_embeddings, ref_embeddings):
            similarity = np.dot(pred_emb, ref_emb) / (
                np.linalg.norm(pred_emb) * np.linalg.norm(ref_emb)
            )
            similarities.append(similarity)

        return float(np.mean(similarities))


# Usage
exact_match = ExactMatch()
token_overlap = TokenOverlap()

predictions = ["The cat sat on mat", "Python is great"]
references = ["The cat sat on the mat", "Python is awesome"]

em_score = exact_match.compute(predictions, references)
overlap_scores = token_overlap.compute(predictions, references)

print(f"Exact Match: {em_score:.2f}")
print(f"F1 Score: {overlap_scores['f1']:.2f}")
```
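
The trigger keywords above also list BLEU and ROUGE, which the block above does not implement. Here is a minimal sketch of a ROUGE-L metric that follows the same `Metric` protocol; it assumes the third-party `rouge-score` package is installed, which is an assumption and not part of the original skill.

```python
# Hedged sketch: ROUGE-L wrapper following the Metric protocol above.
# Assumes the external `rouge-score` package (pip install rouge-score) is available.
from typing import List

from rouge_score import rouge_scorer


class RougeL:
    """Average ROUGE-L F-measure over a set of predictions."""

    def __init__(self):
        # use_stemmer=True makes matching more forgiving of inflections
        self.scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

    def compute(
        self,
        predictions: List[str],
        references: List[str]
    ) -> float:
        """Return mean ROUGE-L F1 across all (prediction, reference) pairs."""
        scores = [
            self.scorer.score(ref, pred)["rougeL"].fmeasure
            for pred, ref in zip(predictions, references)
        ]
        return sum(scores) / len(scores)
```

Because it returns a single float, `RougeL()` can be dropped into the `metrics` list of the A/B testing framework below alongside `ExactMatch()`.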

## LLM-as-Judge Evaluation

```python
from typing import Any, Dict, List, Optional


class LLMJudge:
    """Use LLM to evaluate outputs."""

    def __init__(self, llm_client):
        self.llm = llm_client

    async def judge_single(
        self,
        input: str,
        prediction: str,
        reference: Optional[str] = None,
        criteria: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """
        Evaluate single prediction using LLM.

        Args:
            input: Original input
            prediction: Model prediction
            reference: Optional reference answer
            criteria: Evaluation criteria

        Returns:
            Dict with score and reasoning
        """
        criteria = criteria or [
            "accuracy",
            "relevance",
            "completeness",
            "clarity"
        ]

        prompt = self._build_judge_prompt(
            input, prediction, reference, criteria
        )

        response = await self.llm.complete(prompt, temperature=0.0)

        # Parse response (expects JSON)
        import json
        try:
            result = json.loads(response)
            return result
        except json.JSONDecodeError:
            return {
                "score": 0,
                "reasoning": "Failed to parse response",
                "raw_response": response
            }

    def _build_judge_prompt(
        self,
        input: str,
        prediction: str,
        reference: Optional[str],
        criteria: List[str]
    ) -> str:
        """Build prompt for LLM judge."""
        criteria_str = ", ".join(criteria)

        prompt = f"""Evaluate this model output on: {criteria_str}

Input:
{input}

Model Output:
{prediction}"""

        if reference:
            prompt += f"""

Reference Answer:
{reference}"""

        prompt += """

Provide evaluation as JSON:
{
  "score": <1-10>,
  "reasoning": "<explanation>",
  "criteria_scores": {
    "accuracy": <1-10>,
    "relevance": <1-10>,
    ...
  }
}"""

        return prompt

    async def batch_judge(
        self,
        examples: List[Dict[str, str]],
        criteria: Optional[List[str]] = None
    ) -> List[Dict[str, Any]]:
        """
        Judge multiple examples in batch.

        Args:
            examples: List of dicts with input, prediction, reference
            criteria: Evaluation criteria

        Returns:
            List of judgment results
        """
        import asyncio

        tasks = [
            self.judge_single(
                input=ex["input"],
                prediction=ex["prediction"],
                reference=ex.get("reference"),
                criteria=criteria
            )
            for ex in examples
        ]

        return await asyncio.gather(*tasks)


# Usage
judge = LLMJudge(llm_client)

result = await judge.judge_single(
    input="What is Python?",
    prediction="Python is a programming language.",
    reference="Python is a high-level programming language.",
    criteria=["accuracy", "completeness", "clarity"]
)

print(f"Score: {result['score']}/10")
print(f"Reasoning: {result['reasoning']}")
```
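
`batch_judge` returns one dict per example; to report a single number you still need to aggregate. A minimal sketch, assuming the judge responses follow the JSON shape requested in `_build_judge_prompt` (an overall `score` plus optional `criteria_scores`):

```python
# Hedged sketch: aggregate batch_judge output into mean scores.
# Assumes each judgment dict has "score" and optionally "criteria_scores",
# as requested by the judge prompt above.
from collections import defaultdict
from typing import Any, Dict, List


def summarize_judgments(judgments: List[Dict[str, Any]]) -> Dict[str, float]:
    """Return mean overall score and mean score per criterion."""
    summary: Dict[str, float] = {}
    overall = [j.get("score", 0) for j in judgments]
    summary["mean_score"] = sum(overall) / len(overall)

    per_criterion = defaultdict(list)
    for j in judgments:
        for criterion, score in j.get("criteria_scores", {}).items():
            per_criterion[criterion].append(score)

    for criterion, scores in per_criterion.items():
        summary[f"mean_{criterion}"] = sum(scores) / len(scores)

    return summary


# Example (inside an async context):
# judgments = await judge.batch_judge(examples)
# print(summarize_judgments(judgments))
```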

## A/B Testing Framework

```python
from typing import Any, Callable, Dict, List, Optional
from dataclasses import dataclass
from datetime import datetime
import random
import numpy as np


@dataclass
class Variant:
    """A/B test variant."""

    name: str
    model_fn: Callable
    traffic_weight: float = 0.5


@dataclass
class ABTestResult:
    """Result from A/B test."""

    variant_name: str
    example_id: str
    prediction: str
    metrics: Dict[str, float]
    latency_ms: float
    timestamp: datetime


class ABTest:
    """A/B testing framework for LLM variants."""

    def __init__(
        self,
        name: str,
        variants: List[Variant],
        metrics: List[Metric]
    ):
        self.name = name
        self.variants = variants
        self.metrics = metrics
        self.results: List[ABTestResult] = []

        # Normalize weights
        total_weight = sum(v.traffic_weight for v in variants)
        for v in variants:
            v.traffic_weight /= total_weight

    def select_variant(self) -> Variant:
        """Select variant based on traffic weight."""
        r = random.random()
        cumulative = 0.0

        for variant in self.variants:
            cumulative += variant.traffic_weight
            if r <= cumulative:
                return variant

        return self.variants[-1]

    async def run_test(
        self,
        eval_dataset: EvalDataset,
        samples_per_variant: Optional[int] = None
    ) -> Dict[str, Any]:
        """
        Run A/B test on evaluation dataset.

        Args:
            eval_dataset: Evaluation dataset
            samples_per_variant: Samples per variant (None = all)

        Returns:
            Test results with metrics per variant
        """
        import time

        samples = samples_per_variant or len(eval_dataset.examples)

        # Run predictions for each variant
        for variant in self.variants:
            for i, example in enumerate(eval_dataset.examples[:samples]):
                start = time.time()

                # Get prediction from variant
                prediction = await variant.model_fn(example.input)

                latency = (time.time() - start) * 1000

                # Compute metrics
                variant_metrics = {}
                for metric in self.metrics:
                    score = metric.compute([prediction], [example.expected_output])
                    # TokenOverlap returns a dict of scores; keep its F1 so every
                    # stored metric value is a single float
                    if isinstance(score, dict):
                        score = score["f1"]
                    variant_metrics[metric.__class__.__name__] = score

                # Store result
                self.results.append(ABTestResult(
                    variant_name=variant.name,
                    example_id=example.id,
                    prediction=prediction,
                    metrics=variant_metrics,
                    latency_ms=latency,
                    timestamp=datetime.utcnow()
                ))

        return self.analyze_results()

    def analyze_results(self) -> Dict[str, Any]:
        """
        Analyze A/B test results.

        Returns:
            Statistics per variant
        """
        variant_stats = {}

        for variant in self.variants:
            variant_results = [
                r for r in self.results
                if r.variant_name == variant.name
            ]

            if not variant_results:
                continue

            # Aggregate metrics
            metric_names = variant_results[0].metrics.keys()
            avg_metrics = {}

            for metric_name in metric_names:
                scores = [r.metrics[metric_name] for r in variant_results]
                avg_metrics[metric_name] = {
                    "mean": np.mean(scores),
                    "std": np.std(scores),
                    "min": np.min(scores),
                    "max": np.max(scores)
                }

            # Latency stats
            latencies = [r.latency_ms for r in variant_results]

            variant_stats[variant.name] = {
                "samples": len(variant_results),
                "metrics": avg_metrics,
                "latency": {
                    "mean_ms": np.mean(latencies),
                    "p50_ms": np.percentile(latencies, 50),
                    "p95_ms": np.percentile(latencies, 95),
                    "p99_ms": np.percentile(latencies, 99)
                }
            }

        return variant_stats


# Usage
variants = [
    Variant(
        name="baseline",
        model_fn=lambda x: model_v1.complete(x),
        traffic_weight=0.5
    ),
    Variant(
        name="candidate",
        model_fn=lambda x: model_v2.complete(x),
        traffic_weight=0.5
    )
]

ab_test = ABTest(
    name="summarization_v1_vs_v2",
    variants=variants,
    metrics=[ExactMatch(), TokenOverlap()]
)

results = await ab_test.run_test(eval_dataset, samples_per_variant=100)
```
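
The checklist further down calls for statistical significance testing, which `analyze_results` does not provide on its own: two variants will almost never have identical means, so you need to check whether the gap is larger than noise. A minimal sketch using a paired bootstrap over the per-example scores stored in `ABTest.results` (pure NumPy); the function name and threshold are illustrative, not part of the original skill.

```python
# Hedged sketch: paired bootstrap test on per-example metric scores.
import numpy as np


def paired_bootstrap(
    ab_test: ABTest,
    metric_name: str,
    variant_a: str,
    variant_b: str,
    n_resamples: int = 10_000,
    seed: int = 0
) -> dict:
    """Estimate how often variant_b fails to beat variant_a under resampling."""
    # Align scores by example_id so the comparison is paired
    scores = {variant_a: {}, variant_b: {}}
    for r in ab_test.results:
        if r.variant_name in scores:
            scores[r.variant_name][r.example_id] = r.metrics[metric_name]

    common_ids = sorted(set(scores[variant_a]) & set(scores[variant_b]))
    a = np.array([scores[variant_a][i] for i in common_ids])
    b = np.array([scores[variant_b][i] for i in common_ids])
    deltas = b - a

    rng = np.random.default_rng(seed)
    idx = rng.integers(0, len(deltas), size=(n_resamples, len(deltas)))
    resampled_means = deltas[idx].mean(axis=1)

    return {
        "mean_delta": float(deltas.mean()),
        # Fraction of resamples where the candidate is NOT better; roughly a
        # one-sided p-value for "variant_b <= variant_a"
        "p_value": float((resampled_means <= 0).mean()),
    }


# Example:
# stats = paired_bootstrap(ab_test, "ExactMatch", "baseline", "candidate")
# print(stats)  # treat p_value < 0.05 as a common (but arbitrary) threshold
```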

## Experiment Tracking

```python
from typing import Any, Dict, List, Optional
import json
from pathlib import Path


class ExperimentTracker:
    """Track experiments and results."""

    def __init__(self, experiments_dir: str = "experiments"):
        self.experiments_dir = Path(experiments_dir)
        self.experiments_dir.mkdir(exist_ok=True)

    def log_experiment(
        self,
        name: str,
        config: Dict[str, Any],
        metrics: Dict[str, float],
        metadata: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Log experiment configuration and results.

        Args:
            name: Experiment name
            config: Model configuration
            metrics: Evaluation metrics
            metadata: Additional metadata

        Returns:
            Experiment ID
        """
        from datetime import datetime
        import uuid

        experiment_id = str(uuid.uuid4())[:8]
        timestamp = datetime.utcnow()

        experiment = {
            "id": experiment_id,
            "name": name,
            "timestamp": timestamp.isoformat(),
            "config": config,
            "metrics": metrics,
            "metadata": metadata or {}
        }

        # Save to file
        filename = f"{timestamp.strftime('%Y%m%d_%H%M%S')}_{name}_{experiment_id}.json"
        filepath = self.experiments_dir / filename

        with open(filepath, "w") as f:
            json.dump(experiment, f, indent=2)

        return experiment_id

    def load_experiment(self, experiment_id: str) -> Optional[Dict[str, Any]]:
        """Load experiment by ID."""
        for filepath in self.experiments_dir.glob(f"*_{experiment_id}.json"):
            with open(filepath) as f:
                return json.load(f)
        return None

    def list_experiments(
        self,
        name: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """List all experiments, optionally filtered by name."""
        experiments = []

        for filepath in sorted(self.experiments_dir.glob("*.json")):
            with open(filepath) as f:
                exp = json.load(f)
            if name is None or exp["name"] == name:
                experiments.append(exp)

        return experiments

    def compare_experiments(
        self,
        experiment_ids: List[str]
    ) -> Dict[str, Any]:
        """Compare multiple experiments."""
        experiments = [
            self.load_experiment(exp_id)
            for exp_id in experiment_ids
        ]

        # Extract metrics for comparison
        comparison = {
            "experiments": []
        }

        for exp in experiments:
            if exp:
                comparison["experiments"].append({
                    "id": exp["id"],
                    "name": exp["name"],
                    "metrics": exp["metrics"]
                })

        return comparison


# Usage
tracker = ExperimentTracker()

exp_id = tracker.log_experiment(
    name="summarization_v2",
    config={
        "model": "claude-sonnet-4",
        "temperature": 0.3,
        "max_tokens": 512,
        "prompt_version": "2.0"
    },
    metrics={
        "exact_match": 0.45,
        "f1": 0.78,
        "semantic_similarity": 0.85
    },
    metadata={
        "dataset": "summarization_v1.json",
        "num_examples": 100
    }
)

print(f"Logged experiment: {exp_id}")
```
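
`list_experiments` and `compare_experiments` make it easy to pick a winner after several runs; a short follow-up to the usage above (the second experiment ID is a hypothetical placeholder):

```python
# Compare the run logged above against an earlier one and pick the best by F1.
# "a1b2c3d4" is a hypothetical ID of a previously logged experiment.
comparison = tracker.compare_experiments([exp_id, "a1b2c3d4"])

best = max(
    comparison["experiments"],
    key=lambda exp: exp["metrics"].get("f1", 0.0)
)
print(f"Best run by F1: {best['name']} ({best['id']})")
```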

## ❌ Anti-Patterns

```python
# ❌ No evaluation dataset
def test_model():
    result = model("test this")  # Single example!
    print("Works!")

# ✅ Better: Use proper eval dataset
eval_dataset = EvalDataset.load("eval_data.json")
results = await evaluator.run(model, eval_dataset)


# ❌ Only exact match metric
score = sum(p == r for p, r in zip(preds, refs)) / len(preds)

# ✅ Better: Multiple metrics
metrics = {
    "exact_match": ExactMatch().compute(preds, refs),
    "f1": TokenOverlap().compute(preds, refs)["f1"],
    "semantic_sim": await SemanticSimilarity(embedding_model).compute(preds, refs)
}


# ❌ No experiment tracking
model_v2_score = 0.78  # Lost context!

# ✅ Better: Track all experiments
tracker.log_experiment(
    name="model_v2",
    config={"version": "2.0"},
    metrics={"f1": 0.78}
)


# ❌ Cherry-picking examples
good_examples = [ex for ex in dataset if model(ex) == expected]

# ✅ Better: Use full representative dataset
results = evaluate_on_full_dataset(model, dataset)
```

## Best Practices Checklist

- ✅ Create representative evaluation datasets
- ✅ Version control eval datasets
- ✅ Use multiple complementary metrics
- ✅ Include LLM-as-judge for qualitative evaluation
- ✅ Run A/B tests for variant comparison
- ✅ Track all experiments with config and metrics
- ✅ Measure latency alongside quality metrics
- ✅ Use statistical significance testing
- ✅ Evaluate on diverse examples (easy, medium, hard)
- ✅ Include edge cases and adversarial examples
- ✅ Document evaluation methodology
- ✅ Set up automated evaluation in CI/CD (see the gate sketch below)
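
One way to wire the last item into CI is a small regression gate: run the metrics on the versioned eval set and fail the job when quality drops below an agreed floor. A minimal sketch, assuming the classes above are importable and that `eval_data/summarization_v1.json` plus an async `run_model` prediction function exist in your project (both are assumptions, not part of this skill):

```python
# Hedged sketch: CI regression gate. Exits non-zero when F1 falls below a floor,
# so the pipeline fails. Thresholds, paths, and run_model are illustrative placeholders.
import asyncio
import sys

F1_FLOOR = 0.75  # agreed minimum quality for this eval set


async def main() -> int:
    dataset = EvalDataset.load("eval_data/summarization_v1.json")

    predictions = [await run_model(ex.input) for ex in dataset.examples]
    references = [ex.expected_output for ex in dataset.examples]

    f1 = TokenOverlap().compute(predictions, references)["f1"]
    print(f"F1 on {dataset.name} v{dataset.version}: {f1:.3f}")

    return 0 if f1 >= F1_FLOOR else 1


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))
```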

## Auto-Apply

When evaluating LLM systems, apply these steps (a minimal end-to-end sketch follows the list):
1. Create EvalDataset with representative examples
2. Compute multiple metrics (exact match, F1, semantic similarity)
3. Use LLM-as-judge for qualitative assessment
4. Run A/B tests comparing variants
5. Track experiments with ExperimentTracker
6. Measure latency alongside quality
7. Save results for reproducibility
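
A sketch of the workflow above wired together, using only the classes defined in this skill; `model_v1`, `model_v2`, `llm_client`, and the dataset path are placeholders, not part of the original skill.

```python
# Hedged sketch: end-to-end evaluation run using the pieces defined above.
async def evaluate_release_candidate():
    # 1. Load a versioned eval dataset
    dataset = EvalDataset.load("eval_data/summarization_v1.json")

    # 2, 4, 6. Compare variants on automated metrics (latency is recorded per example)
    ab_test = ABTest(
        name="release_candidate_check",
        variants=[
            Variant(name="baseline", model_fn=lambda x: model_v1.complete(x)),
            Variant(name="candidate", model_fn=lambda x: model_v2.complete(x)),
        ],
        metrics=[ExactMatch(), TokenOverlap()],
    )
    stats = await ab_test.run_test(dataset)

    # 3. Spot-check quality with the LLM judge on a few candidate outputs
    judge = LLMJudge(llm_client)
    candidate_results = [r for r in ab_test.results if r.variant_name == "candidate"][:5]
    judgments = await judge.batch_judge([
        {"input": ex.input, "prediction": r.prediction}
        for ex, r in zip(dataset.examples, candidate_results)
    ])

    # 5, 7. Persist config + metrics so the run is reproducible
    tracker = ExperimentTracker()
    tracker.log_experiment(
        name="candidate_vs_baseline",
        config={"dataset_version": dataset.version},
        metrics={
            "candidate_f1": float(stats["candidate"]["metrics"]["TokenOverlap"]["mean"]),
            "mean_judge_score": sum(j.get("score", 0) for j in judgments) / len(judgments),
        },
        metadata={"judged_examples": len(judgments)},
    )
```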

## Related Skills

- `prompting-patterns` - For prompt engineering
- `llm-app-architecture` - For LLM integration
- `monitoring-alerting` - For production metrics
- `model-selection` - For choosing models
- `performance-profiling` - For optimization