# Evaluation Metrics Design

Definitions and calculation methods for evaluation metrics in LangGraph application fine-tuning.

**💡 Tip**: For practical evaluation scripts and report templates, see [examples.md](examples.md#phase-2-baseline-evaluation-examples).

## 📊 Importance of Evaluation

In fine-tuning, evaluation provides:
- **Quantifying Improvements**: Objective progress measurement
- **Basis for Decision-Making**: Data-driven prioritization
- **Quality Assurance**: Prevention of regressions
- **ROI Demonstration**: Making business value visible

## 🎯 Evaluation Metric Categories

### 1. Quality Metrics

#### Accuracy
```python
from typing import List

def calculate_accuracy(predictions: List, ground_truth: List) -> float:
    """Calculate accuracy as a percentage."""
    correct = sum(p == g for p, g in zip(predictions, ground_truth))
    return (correct / len(predictions)) * 100

# Example
predictions = ["product", "technical", "billing", "general"]
ground_truth = ["product", "billing", "billing", "general"]
accuracy = calculate_accuracy(predictions, ground_truth)
# => 75.0 (3/4 correct)
```

#### F1 Score (Multi-class Classification)
```python
from typing import List

from sklearn.metrics import f1_score, classification_report

def calculate_f1(predictions: List, ground_truth: List, average: str = "weighted") -> float:
    """Calculate F1 score (multi-class support)."""
    return f1_score(ground_truth, predictions, average=average)

# Detailed report (same predictions/ground_truth as the accuracy example;
# zero_division=0 avoids warnings for labels with no true samples)
report = classification_report(ground_truth, predictions, zero_division=0)
print(report)
"""
              precision    recall  f1-score   support

     billing       1.00      0.50      0.67         2
     general       1.00      1.00      1.00         1
     product       1.00      1.00      1.00         1
   technical       0.00      0.00      0.00         0

    accuracy                           0.75         4
   macro avg       0.75      0.62      0.67         4
weighted avg       1.00      0.75      0.83         4
"""
```

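A minimal sketch of calling the helper above on the same four-example dataset; the `macro_f1` variant is illustrative, not part of the original example:

```python
# Weighted and macro F1 on the accuracy example's data
predictions = ["product", "technical", "billing", "general"]
ground_truth = ["product", "billing", "billing", "general"]

weighted_f1 = calculate_f1(predictions, ground_truth)                  # default: average="weighted"
macro_f1 = calculate_f1(predictions, ground_truth, average="macro")    # sklearn may warn about the unseen "technical" label
print(f"Weighted F1: {weighted_f1:.2f}")  # => ~0.83
print(f"Macro F1: {macro_f1:.2f}")        # => ~0.67
```
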
#### Semantic Similarity
```python
from sentence_transformers import SentenceTransformer, util

def calculate_semantic_similarity(
    generated: str,
    reference: str,
    model_name: str = "all-MiniLM-L6-v2"
) -> float:
    """Calculate semantic similarity between generated and reference text."""
    model = SentenceTransformer(model_name)

    embeddings = model.encode([generated, reference], convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(embeddings[0], embeddings[1])

    return similarity.item()

# Example
generated = "Our premium plan costs $49 per month."
reference = "The premium subscription is $49/month."
similarity = calculate_semantic_similarity(generated, reference)
# => 0.87 (high similarity)
```

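For larger test sets it is usually worth loading the model once and encoding in batches rather than per pair. A minimal sketch under that assumption; `evaluate_similarity_batch` and the shape of `pairs` are illustrative, not part of the original examples:

```python
from typing import Dict, List

from sentence_transformers import SentenceTransformer, util

def evaluate_similarity_batch(
    pairs: List[Dict],                      # assumed shape: [{"generated": ..., "reference": ...}, ...]
    model_name: str = "all-MiniLM-L6-v2"
) -> Dict:
    """Mean semantic similarity over many generated/reference pairs (model loaded once)."""
    model = SentenceTransformer(model_name)
    generated = [p["generated"] for p in pairs]
    references = [p["reference"] for p in pairs]

    gen_emb = model.encode(generated, convert_to_tensor=True)
    ref_emb = model.encode(references, convert_to_tensor=True)

    # Cosine similarity between each generated text and its own reference
    scores = util.pytorch_cos_sim(gen_emb, ref_emb).diagonal()
    return {
        "mean_similarity": scores.mean().item(),
        "min_similarity": scores.min().item(),
    }
```
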
#### BLEU Score (Text Generation Quality)
```python
from nltk.translate.bleu_score import sentence_bleu

def calculate_bleu(generated: str, reference: str) -> float:
    """Calculate BLEU score."""
    reference_tokens = [reference.split()]
    generated_tokens = generated.split()

    return sentence_bleu(reference_tokens, generated_tokens)

# Example
generated = "The product costs forty nine dollars"
reference = "The product costs $49"
bleu = calculate_bleu(generated, reference)
# => close to 0.0 (with the default 4-gram weights, short sentences that share no 4-grams score near zero)
```

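The default 4-gram BLEU is harsh on short responses; NLTK's smoothing functions or lower-order weights give a more informative signal. A sketch under those assumptions; `calculate_bleu_smoothed` is an illustrative helper, not part of the original:

```python
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calculate_bleu_smoothed(generated: str, reference: str) -> float:
    """Bigram BLEU with smoothing, which behaves better on short responses."""
    smoothie = SmoothingFunction().method1
    return sentence_bleu(
        [reference.split()],
        generated.split(),
        weights=(0.5, 0.5),               # bigram BLEU instead of the default 4-gram
        smoothing_function=smoothie,
    )

bleu = calculate_bleu_smoothed(
    "The product costs forty nine dollars",
    "The product costs $49",
)
# => small but non-zero; the exact value depends on tokenization
```
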
### 2. Performance Metrics

#### Latency (Response Time)
```python
import time
from typing import Dict, List

import numpy as np

def measure_latency(test_cases: List[Dict]) -> Dict:
    """Measure latency for each node and total"""
    results = {
        "total": [],
        "by_node": {}
    }

    for case in test_cases:
        start_time = time.time()

        # Measurement by node
        # (analyze_intent, retrieve_context and generate_response are the
        #  application's own node functions)
        node_times = {}

        # Node 1: analyze_intent
        node_start = time.time()
        analyze_result = analyze_intent(case["input"])
        node_times["analyze_intent"] = time.time() - node_start

        # Node 2: retrieve_context
        node_start = time.time()
        context = retrieve_context(analyze_result)
        node_times["retrieve_context"] = time.time() - node_start

        # Node 3: generate_response
        node_start = time.time()
        response = generate_response(context, case["input"])
        node_times["generate_response"] = time.time() - node_start

        total_time = time.time() - start_time

        results["total"].append(total_time)
        for node, duration in node_times.items():
            if node not in results["by_node"]:
                results["by_node"][node] = []
            results["by_node"][node].append(duration)

    # Statistical summary
    summary = {
        "total": {
            "mean": np.mean(results["total"]),
            "p50": np.percentile(results["total"], 50),
            "p95": np.percentile(results["total"], 95),
            "p99": np.percentile(results["total"], 99),
        }
    }

    for node, times in results["by_node"].items():
        summary[node] = {
            "mean": np.mean(times),
            "p50": np.percentile(times, 50),
            "p95": np.percentile(times, 95),
        }

    return summary

# Usage example
latency_results = measure_latency(test_cases)
print(f"Mean latency: {latency_results['total']['mean']:.2f}s")
print(f"P95 latency: {latency_results['total']['p95']:.2f}s")
```

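Hard-coding the three node calls works for a fixed graph; a small timing wrapper keeps the measurement logic out of the node list. A minimal sketch — the `timed` helper and the use of `time.perf_counter` are illustrative, not part of the original:

```python
import time
from typing import Callable, Dict

def timed(node_times: Dict[str, float], name: str, fn: Callable, *args, **kwargs):
    """Run fn(*args, **kwargs), record its duration under `name`, and return its result."""
    start = time.perf_counter()              # perf_counter is preferable to time.time for durations
    result = fn(*args, **kwargs)
    node_times[name] = time.perf_counter() - start
    return result

# Usage inside the per-case loop (node functions are the application's own):
# node_times = {}
# analyze_result = timed(node_times, "analyze_intent", analyze_intent, case["input"])
# context = timed(node_times, "retrieve_context", retrieve_context, analyze_result)
# response = timed(node_times, "generate_response", generate_response, context, case["input"])
```
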
#### Throughput
```python
import concurrent.futures
import time
from typing import Dict, List

def measure_throughput(
    test_cases: List[Dict],
    max_workers: int = 10,
    duration_seconds: int = 60
) -> Dict:
    """Measure number of requests processed within a given time"""
    start_time = time.time()
    completed = 0
    errors = 0

    def process_case(case):
        try:
            run_langgraph_app(case["input"])  # the LangGraph application under test
            return True
        except Exception:
            return False

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Loop through the test cases until the time window closes.
        # Each batch is submitted to the pool so requests run concurrently;
        # the final batch may finish slightly after the window ends.
        while time.time() - start_time < duration_seconds:
            futures = [executor.submit(process_case, case) for case in test_cases]
            for future in concurrent.futures.as_completed(futures):
                if future.result():
                    completed += 1
                else:
                    errors += 1

    elapsed = time.time() - start_time

    return {
        "completed": completed,
        "errors": errors,
        "elapsed": elapsed,
        "throughput": completed / elapsed,  # requests per second
        "error_rate": errors / (completed + errors) if (completed + errors) > 0 else 0
    }

# Usage example
throughput = measure_throughput(test_cases, max_workers=5, duration_seconds=30)
print(f"Throughput: {throughput['throughput']:.2f} req/s")
print(f"Error rate: {throughput['error_rate']*100:.2f}%")
```

### 3. Cost Metrics

#### Token Usage and Cost
```python
from typing import Dict

# Pricing table by model (as of November 2024)
PRICING = {
    "claude-3-5-sonnet-20241022": {
        "input": 3.0 / 1_000_000,    # $3.00 per 1M input tokens
        "output": 15.0 / 1_000_000,  # $15.00 per 1M output tokens
    },
    "claude-3-5-haiku-20241022": {
        "input": 0.8 / 1_000_000,    # $0.80 per 1M input tokens
        "output": 4.0 / 1_000_000,   # $4.00 per 1M output tokens
    }
}

def calculate_cost(token_usage: Dict, model: str) -> Dict:
    """Calculate cost from token usage"""
    pricing = PRICING.get(model, PRICING["claude-3-5-sonnet-20241022"])

    input_cost = token_usage["input_tokens"] * pricing["input"]
    output_cost = token_usage["output_tokens"] * pricing["output"]
    total_cost = input_cost + output_cost

    return {
        "input_tokens": token_usage["input_tokens"],
        "output_tokens": token_usage["output_tokens"],
        "total_tokens": token_usage["input_tokens"] + token_usage["output_tokens"],
        "input_cost": input_cost,
        "output_cost": output_cost,
        "total_cost": total_cost,
        "cost_breakdown": {
            "input_pct": (input_cost / total_cost * 100) if total_cost > 0 else 0,
            "output_pct": (output_cost / total_cost * 100) if total_cost > 0 else 0
        }
    }

# Usage example
token_usage = {"input_tokens": 1500, "output_tokens": 800}
cost = calculate_cost(token_usage, "claude-3-5-sonnet-20241022")
print(f"Total cost: ${cost['total_cost']:.4f}")
print(f"Input: ${cost['input_cost']:.4f} ({cost['cost_breakdown']['input_pct']:.1f}%)")
print(f"Output: ${cost['output_cost']:.4f} ({cost['cost_breakdown']['output_pct']:.1f}%)")
```

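A single LangGraph request often makes several LLM calls; summing their usage before calling `calculate_cost` gives the per-request figure. A minimal sketch — the `aggregate_token_usage` helper, the node names in the comments, and the token counts are illustrative assumptions:

```python
from typing import Dict, List

def aggregate_token_usage(call_usages: List[Dict]) -> Dict:
    """Sum token usage across all LLM calls made while serving one request."""
    return {
        "input_tokens": sum(u["input_tokens"] for u in call_usages),
        "output_tokens": sum(u["output_tokens"] for u in call_usages),
    }

# Example: one request that hit the LLM in three nodes (illustrative numbers)
call_usages = [
    {"input_tokens": 400, "output_tokens": 50},    # analyze_intent
    {"input_tokens": 900, "output_tokens": 150},   # retrieve_context re-ranking
    {"input_tokens": 1200, "output_tokens": 600},  # generate_response
]
request_cost = calculate_cost(aggregate_token_usage(call_usages), "claude-3-5-sonnet-20241022")
```
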
#### Cost per Request
```python
from typing import Dict, List

def calculate_cost_per_request(
    test_results: List[Dict],
    model: str
) -> Dict:
    """Calculate cost per request"""
    total_cost = 0
    total_input_tokens = 0
    total_output_tokens = 0

    for result in test_results:
        cost = calculate_cost(result["token_usage"], model)
        total_cost += cost["total_cost"]
        total_input_tokens += result["token_usage"]["input_tokens"]
        total_output_tokens += result["token_usage"]["output_tokens"]

    num_requests = len(test_results)

    return {
        "total_requests": num_requests,
        "total_cost": total_cost,
        "cost_per_request": total_cost / num_requests,
        "avg_input_tokens": total_input_tokens / num_requests,
        "avg_output_tokens": total_output_tokens / num_requests,
        "total_tokens": total_input_tokens + total_output_tokens
    }
```

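A usage sketch with illustrative token counts; the `test_results` entries only need the `token_usage` shape that `calculate_cost` expects:

```python
# Illustrative results; in practice these come from the evaluation run
test_results = [
    {"token_usage": {"input_tokens": 1500, "output_tokens": 800}},
    {"token_usage": {"input_tokens": 1200, "output_tokens": 650}},
    {"token_usage": {"input_tokens": 1800, "output_tokens": 900}},
]
per_request = calculate_cost_per_request(test_results, "claude-3-5-sonnet-20241022")
print(f"Cost per request: ${per_request['cost_per_request']:.4f}")
print(f"Avg input tokens: {per_request['avg_input_tokens']:.0f}")
```
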
### 4. Reliability Metrics

#### Error Rate
```python
from typing import Dict, List

def calculate_error_rate(results: List[Dict]) -> Dict:
    """Analyze error rate and error types"""
    total = len(results)
    errors = [r for r in results if r.get("error")]

    error_types = {}
    for error in errors:
        error_type = error["error"]["type"]
        if error_type not in error_types:
            error_types[error_type] = 0
        error_types[error_type] += 1

    return {
        "total_requests": total,
        "total_errors": len(errors),
        "error_rate": len(errors) / total if total > 0 else 0,
        "error_types": error_types,
        "success_rate": (total - len(errors)) / total if total > 0 else 0
    }
```

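A usage sketch with an illustrative results list; the `error` field shape simply mirrors what `calculate_error_rate` reads:

```python
# Illustrative results; real entries come from the evaluation harness
results = [
    {"output": "..."},
    {"error": {"type": "timeout", "message": "LLM call exceeded 30s"}},
    {"output": "..."},
    {"error": {"type": "validation", "message": "missing required field"}},
]
error_stats = calculate_error_rate(results)
print(f"Error rate: {error_stats['error_rate']*100:.1f}%")   # => 50.0%
print(f"By type: {error_stats['error_types']}")              # => {'timeout': 1, 'validation': 1}
```
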
#### Retry Rate
```python
from typing import Dict, List

def calculate_retry_rate(results: List[Dict]) -> Dict:
    """Proportion of cases that required retries"""
    total = len(results)
    retried = [r for r in results if r.get("retry_count", 0) > 0]

    return {
        "total_requests": total,
        "retried_requests": len(retried),
        "retry_rate": len(retried) / total if total > 0 else 0,
        "avg_retries": sum(r.get("retry_count", 0) for r in retried) / len(retried) if retried else 0
    }
```

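And a matching sketch for the retry metric, with illustrative `retry_count` values:

```python
results = [
    {"retry_count": 0},
    {"retry_count": 2},
    {"retry_count": 0},
    {"retry_count": 1},
]
retry_stats = calculate_retry_rate(results)
print(f"Retry rate: {retry_stats['retry_rate']*100:.1f}%")              # => 50.0%
print(f"Avg retries (when retried): {retry_stats['avg_retries']:.1f}")  # => 1.5
```
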
## 📋 Related Documentation

- [Test Case Design](./evaluation_testcases.md) - Test case structure and coverage
- [Statistical Significance Testing](./evaluation_statistics.md) - Multiple runs and statistical analysis
- [Evaluation Best Practices](./evaluation_practices.md) - Consistency, visualization, reporting