# Performance Evaluation Framework

This document describes methodologies for evaluating the performance and effectiveness of chunking strategies.

## Evaluation Metrics

### Core Retrieval Metrics

#### Retrieval Precision
Measures the fraction of retrieved chunks that are relevant to the query.

```python
from typing import Dict, List


def calculate_precision(retrieved_chunks: List[Dict], relevant_chunks: List[Dict]) -> float:
    """
    Calculate retrieval precision
    Precision = |Relevant ∩ Retrieved| / |Retrieved|
    """
    retrieved_ids = {chunk.get('id') for chunk in retrieved_chunks}
    relevant_ids = {chunk.get('id') for chunk in relevant_chunks}

    intersection = retrieved_ids & relevant_ids

    if not retrieved_ids:
        return 0.0

    return len(intersection) / len(retrieved_ids)
```

#### Retrieval Recall
Measures the fraction of relevant chunks that are successfully retrieved.

```python
def calculate_recall(retrieved_chunks: List[Dict], relevant_chunks: List[Dict]) -> float:
    """
    Calculate retrieval recall
    Recall = |Relevant ∩ Retrieved| / |Relevant|
    """
    retrieved_ids = {chunk.get('id') for chunk in retrieved_chunks}
    relevant_ids = {chunk.get('id') for chunk in relevant_chunks}

    intersection = retrieved_ids & relevant_ids

    if not relevant_ids:
        return 0.0

    return len(intersection) / len(relevant_ids)
```

#### F1-Score
Harmonic mean of precision and recall.

```python
def calculate_f1_score(precision: float, recall: float) -> float:
    """
    Calculate F1-score
    F1 = 2 * (Precision * Recall) / (Precision + Recall)
    """
    if precision + recall == 0:
        return 0.0

    return 2 * (precision * recall) / (precision + recall)
```
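
As a quick check, a small worked example (with hypothetical chunk ids) shows how the three functions above combine on a toy result set:

```python
# Hypothetical example: 3 chunks retrieved, 4 chunks actually relevant,
# and 2 of the retrieved chunks are relevant.
retrieved = [{'id': 'c1'}, {'id': 'c2'}, {'id': 'c3'}]
relevant = [{'id': 'c2'}, {'id': 'c3'}, {'id': 'c7'}, {'id': 'c9'}]

precision = calculate_precision(retrieved, relevant)  # 2/3 ≈ 0.67
recall = calculate_recall(retrieved, relevant)        # 2/4 = 0.50
f1 = calculate_f1_score(precision, recall)            # ≈ 0.57
```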

### Mean Reciprocal Rank (MRR)
Measures the rank of the first relevant result.

```python
def calculate_mrr(queries: List[Dict], results: List[List[Dict]]) -> float:
    """
    Calculate Mean Reciprocal Rank
    """
    reciprocal_ranks = []

    for query, query_results in zip(queries, results):
        relevant_found = False

        for rank, result in enumerate(query_results, 1):
            if result.get('is_relevant', False):
                reciprocal_ranks.append(1.0 / rank)
                relevant_found = True
                break

        if not relevant_found:
            reciprocal_ranks.append(0.0)

    if not reciprocal_ranks:
        return 0.0

    return sum(reciprocal_ranks) / len(reciprocal_ranks)
```

### Mean Average Precision (MAP)
Considers both precision and the ranking of relevant documents.

```python
def calculate_average_precision(retrieved_chunks: List[Dict], relevant_chunks: List[Dict]) -> float:
    """
    Calculate Average Precision for a single query
    """
    relevant_ids = {chunk.get('id') for chunk in relevant_chunks}

    if not relevant_ids:
        return 0.0

    precisions = []
    relevant_count = 0

    for rank, chunk in enumerate(retrieved_chunks, 1):
        if chunk.get('id') in relevant_ids:
            relevant_count += 1
            precision_at_rank = relevant_count / rank
            precisions.append(precision_at_rank)

    return sum(precisions) / len(relevant_ids)


def calculate_map(queries: List[Dict], results: List[List[Dict]]) -> float:
    """
    Calculate Mean Average Precision across multiple queries
    """
    average_precisions = []

    for query, query_results in zip(queries, results):
        ap = calculate_average_precision(query_results, query.get('relevant_chunks', []))
        average_precisions.append(ap)

    return sum(average_precisions) / len(average_precisions) if average_precisions else 0.0
```

### Normalized Discounted Cumulative Gain (NDCG)
Measures ranking quality with emphasis on highly relevant results.

```python
import numpy as np


def calculate_dcg(retrieved_chunks: List[Dict]) -> float:
    """
    Calculate Discounted Cumulative Gain
    """
    dcg = 0.0

    for rank, chunk in enumerate(retrieved_chunks, 1):
        relevance = chunk.get('relevance_score', 0)
        dcg += relevance / np.log2(rank + 1)

    return dcg


def calculate_ndcg(retrieved_chunks: List[Dict], ideal_chunks: List[Dict]) -> float:
    """
    Calculate Normalized Discounted Cumulative Gain
    """
    dcg = calculate_dcg(retrieved_chunks)
    idcg = calculate_dcg(ideal_chunks)

    if idcg == 0:
        return 0.0

    return dcg / idcg
```
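
A small worked example with made-up graded relevance scores; the ideal ranking is simply the retrieved chunks re-sorted by descending relevance:

```python
retrieved = [
    {'id': 'c1', 'relevance_score': 1},
    {'id': 'c2', 'relevance_score': 3},
    {'id': 'c3', 'relevance_score': 2},
]
ideal = sorted(retrieved, key=lambda c: c['relevance_score'], reverse=True)

ndcg = calculate_ndcg(retrieved, ideal)  # ≈ 0.82, penalizing the misranked top result
```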

## End-to-End RAG Evaluation

### Answer Quality Metrics

#### Factual Consistency
Measures how well the generated answer aligns with retrieved chunks.

```python
from typing import List

from transformers import pipeline


class FactualConsistencyEvaluator:
    def __init__(self):
        self.nli_pipeline = pipeline("text-classification",
                                     model="roberta-large-mnli")

    def evaluate_consistency(self, answer: str, retrieved_chunks: List[str]) -> float:
        """
        Evaluate factual consistency between answer and retrieved context
        """
        if not retrieved_chunks:
            return 0.0

        # Combine retrieved chunks as context
        context = " ".join(retrieved_chunks[:3])  # Use top 3 chunks

        # Use Natural Language Inference to check consistency:
        # the context is the premise, the answer is the hypothesis
        result = self.nli_pipeline({"text": context, "text_pair": answer}, top_k=None)

        # Extract consistency score (entailment probability)
        for item in result:
            if item['label'] == 'ENTAILMENT':
                return item['score']
            elif item['label'] == 'CONTRADICTION':
                return 1.0 - item['score']

        return 0.5  # Neutral if NLI is inconclusive
```
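
A brief usage sketch, assuming the `roberta-large-mnli` weights can be downloaded on first use; the chunk texts and answer below are invented:

```python
evaluator = FactualConsistencyEvaluator()

chunks = [
    "The 2023 annual report states that revenue grew 12% year over year.",
    "Operating margin remained flat at 18%.",
]
answer = "Revenue grew by 12% compared to the previous year."

score = evaluator.evaluate_consistency(answer, chunks)
print(f"Factual consistency: {score:.2f}")
```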

#### Answer Completeness
Measures how completely the answer addresses the user's query.

```python
def evaluate_completeness(answer: str, query: str, reference_answer: str = None) -> float:
    """
    Evaluate answer completeness
    """
    # Extract key entities from query
    query_entities = extract_entities(query)
    answer_entities = extract_entities(answer)

    # Calculate entity coverage
    if not query_entities:
        return 0.5  # Neutral if no entities in query

    covered_entities = query_entities & answer_entities
    entity_coverage = len(covered_entities) / len(query_entities)

    # If reference answer is available, compare against it
    if reference_answer:
        reference_entities = extract_entities(reference_answer)
        answer_reference_overlap = len(answer_entities & reference_entities) / max(len(reference_entities), 1)
        return (entity_coverage + answer_reference_overlap) / 2

    return entity_coverage


def extract_entities(text: str) -> set:
    """
    Extract named entities from text (simplified)
    """
    # This would use a proper NER model in practice
    import re

    # Simple noun phrase extraction as placeholder
    noun_phrases = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
    return set(noun_phrases)
```

#### Response Relevance
Measures how relevant the answer is to the original query.

```python
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


class RelevanceEvaluator:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def evaluate_relevance(self, query: str, answer: str) -> float:
        """
        Evaluate semantic relevance between query and answer
        """
        # Generate embeddings
        query_embedding = self.model.encode([query])
        answer_embedding = self.model.encode([answer])

        # Calculate cosine similarity
        similarity = cosine_similarity(query_embedding, answer_embedding)[0][0]

        return float(similarity)
```
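
Usage is a single call, assuming the `all-MiniLM-L6-v2` model can be downloaded on first use; the query and answer below are placeholders:

```python
relevance = RelevanceEvaluator()
score = relevance.evaluate_relevance(
    "How is retrieval precision calculated?",
    "Precision is the share of retrieved chunks that are relevant to the query.",
)
print(f"Query-answer relevance: {score:.2f}")
```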

## Performance Metrics

### Processing Time

```python
import time
from dataclasses import dataclass


@dataclass
class PerformanceMetrics:
    total_time: float
    chunking_time: float
    embedding_time: float
    search_time: float
    generation_time: float
    throughput: float  # documents per second


class PerformanceProfiler:
    def __init__(self):
        self.timings = {}
        self.start_times = {}

    def start_timer(self, operation: str):
        self.start_times[operation] = time.time()

    def end_timer(self, operation: str):
        if operation in self.start_times:
            duration = time.time() - self.start_times[operation]
            if operation not in self.timings:
                self.timings[operation] = []
            self.timings[operation].append(duration)
            return duration
        return 0.0

    def get_performance_metrics(self, document_count: int) -> PerformanceMetrics:
        total_time = sum(sum(times) for times in self.timings.values())

        return PerformanceMetrics(
            total_time=total_time,
            chunking_time=sum(self.timings.get('chunking', [0])),
            embedding_time=sum(self.timings.get('embedding', [0])),
            search_time=sum(self.timings.get('search', [0])),
            generation_time=sum(self.timings.get('generation', [0])),
            throughput=document_count / total_time if total_time > 0 else 0
        )
```
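
A short usage sketch; the `doc.split(". ")` call is just a stand-in for whatever chunking strategy is being profiled:

```python
profiler = PerformanceProfiler()

documents = ["First placeholder document. It has two sentences.",
             "Second placeholder document. Also two sentences."]
for doc in documents:
    profiler.start_timer('chunking')
    chunks = doc.split(". ")  # stand-in for a real chunking call
    profiler.end_timer('chunking')

metrics = profiler.get_performance_metrics(document_count=len(documents))
print(f"{metrics.chunking_time:.4f}s chunking, {metrics.throughput:.1f} docs/s")
```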

### Memory Usage

```python
import os
import time
from typing import Dict

import psutil


class MemoryProfiler:
    def __init__(self):
        self.process = psutil.Process(os.getpid())
        self.memory_snapshots = []

    def take_memory_snapshot(self, label: str):
        """Take a snapshot of current memory usage"""
        memory_info = self.process.memory_info()
        memory_mb = memory_info.rss / 1024 / 1024  # Convert to MB

        self.memory_snapshots.append({
            'label': label,
            'memory_mb': memory_mb,
            'timestamp': time.time()
        })

    def get_peak_memory_usage(self) -> float:
        """Get peak memory usage in MB"""
        if not self.memory_snapshots:
            return 0.0
        return max(snapshot['memory_mb'] for snapshot in self.memory_snapshots)

    def get_memory_usage_by_operation(self) -> Dict[str, float]:
        """Get memory usage breakdown by operation"""
        if not self.memory_snapshots:
            return {}

        memory_by_op = {}
        for i in range(1, len(self.memory_snapshots)):
            prev_snapshot = self.memory_snapshots[i-1]
            curr_snapshot = self.memory_snapshots[i]

            operation = curr_snapshot['label']
            memory_delta = curr_snapshot['memory_mb'] - prev_snapshot['memory_mb']

            if operation not in memory_by_op:
                memory_by_op[operation] = []
            memory_by_op[operation].append(memory_delta)

        return {op: sum(deltas) for op, deltas in memory_by_op.items()}
```
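
Usage: take a snapshot before and after each labelled step; the list allocation below is only a stand-in for real chunking or embedding work:

```python
memory_profiler = MemoryProfiler()

memory_profiler.take_memory_snapshot('start')
data = [list(range(10_000)) for _ in range(100)]  # placeholder workload
memory_profiler.take_memory_snapshot('after_allocation')

print(f"Peak RSS: {memory_profiler.get_peak_memory_usage():.1f} MB")
print(memory_profiler.get_memory_usage_by_operation())  # e.g. {'after_allocation': ...}
```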

## Evaluation Datasets

### Standardized Test Sets

#### Question-Answer Pairs

```python
import json
from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class EvaluationQuery:
    id: str
    question: str
    reference_answer: Optional[str]
    relevant_chunk_ids: List[str]
    query_type: str  # factoid, analytical, comparative
    difficulty: str  # easy, medium, hard
    domain: str  # finance, medical, legal, technical


class EvaluationDataset:
    def __init__(self, name: str):
        self.name = name
        self.queries: List[EvaluationQuery] = []
        self.documents: Dict[str, str] = {}
        self.chunks: Dict[str, Dict] = {}

    def add_query(self, query: EvaluationQuery):
        self.queries.append(query)

    def add_document(self, doc_id: str, content: str):
        self.documents[doc_id] = content

    def add_chunk(self, chunk_id: str, content: str, doc_id: str, metadata: Dict):
        self.chunks[chunk_id] = {
            'id': chunk_id,
            'content': content,
            'doc_id': doc_id,
            'metadata': metadata
        }

    def save_to_file(self, filepath: str):
        data = {
            'name': self.name,
            'queries': [
                {
                    'id': q.id,
                    'question': q.question,
                    'reference_answer': q.reference_answer,
                    'relevant_chunk_ids': q.relevant_chunk_ids,
                    'query_type': q.query_type,
                    'difficulty': q.difficulty,
                    'domain': q.domain
                }
                for q in self.queries
            ],
            'documents': self.documents,
            'chunks': self.chunks
        }

        with open(filepath, 'w') as f:
            json.dump(data, f, indent=2)

    @classmethod
    def load_from_file(cls, filepath: str):
        with open(filepath, 'r') as f:
            data = json.load(f)

        dataset = cls(data['name'])
        dataset.documents = data['documents']
        dataset.chunks = data['chunks']

        for q_data in data['queries']:
            query = EvaluationQuery(
                id=q_data['id'],
                question=q_data['question'],
                reference_answer=q_data.get('reference_answer'),
                relevant_chunk_ids=q_data['relevant_chunk_ids'],
                query_type=q_data['query_type'],
                difficulty=q_data['difficulty'],
                domain=q_data['domain']
            )
            dataset.add_query(query)

        return dataset
```
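
Building a tiny dataset by hand looks like this; the document, chunk, query, and file name are invented for illustration:

```python
dataset = EvaluationDataset("demo")
dataset.add_document("doc-1", "Chunking splits documents into retrievable units. ...")
dataset.add_chunk("doc-1-chunk-0", "Chunking splits documents into retrievable units.",
                  "doc-1", {"position": 0})
dataset.add_query(EvaluationQuery(
    id="q-1",
    question="What does chunking do?",
    reference_answer="It splits documents into retrievable units.",
    relevant_chunk_ids=["doc-1-chunk-0"],
    query_type="factoid",
    difficulty="easy",
    domain="technical",
))
dataset.save_to_file("demo_eval_dataset.json")
```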

### Dataset Generation

#### Synthetic Query Generation

```python
import random
from typing import Dict, List


class SyntheticQueryGenerator:
    def __init__(self):
        self.query_templates = {
            'factoid': [
                "What is {concept}?",
                "When did {event} occur?",
                "Who developed {technology}?",
                "How many {items} are mentioned?",
                "What is the value of {metric}?"
            ],
            'analytical': [
                "Compare and contrast {concept1} and {concept2}.",
                "Analyze the impact of {concept} on {domain}.",
                "What are the advantages and disadvantages of {technology}?",
                "Explain the relationship between {concept1} and {concept2}.",
                "Evaluate the effectiveness of {approach} for {problem}."
            ],
            'comparative': [
                "Which is better: {option1} or {option2}?",
                "How does {method1} differ from {method2}?",
                "Compare the performance of {system1} and {system2}.",
                "What are the key differences between {approach1} and {approach2}?"
            ]
        }

    def generate_queries_from_chunks(self, chunks: List[Dict], num_queries: int = 100) -> List[EvaluationQuery]:
        """Generate synthetic queries from document chunks"""
        queries = []

        # Extract entities and concepts from chunks
        entities = self._extract_entities_from_chunks(chunks)

        for i in range(num_queries):
            query_type = random.choice(['factoid', 'analytical', 'comparative'])
            template = random.choice(self.query_templates[query_type])

            # Fill template with extracted entities
            query_text = self._fill_template(template, entities)

            # Find relevant chunks for this query
            relevant_chunks = self._find_relevant_chunks(query_text, chunks)

            query = EvaluationQuery(
                id=f"synthetic_{i}",
                question=query_text,
                reference_answer=None,  # Would need generation model
                relevant_chunk_ids=[chunk['id'] for chunk in relevant_chunks],
                query_type=query_type,
                difficulty=random.choice(['easy', 'medium', 'hard']),
                domain='synthetic'
            )

            queries.append(query)

        return queries

    def _extract_entities_from_chunks(self, chunks: List[Dict]) -> Dict[str, List[str]]:
        """Extract entities, concepts, and relationships from chunks"""
        # This would use proper NER in practice
        entities = {
            'concepts': [],
            'technologies': [],
            'methods': [],
            'metrics': [],
            'events': []
        }

        for chunk in chunks:
            content = chunk['content']
            # Simplified entity extraction
            words = content.split()
            entities['concepts'].extend([word for word in words if len(word) > 6])
            entities['technologies'].extend([word for word in words if 'technology' in word.lower()])
            entities['methods'].extend([word for word in words if 'method' in word.lower()])
            entities['metrics'].extend([word for word in words if '%' in word or '$' in word])

        # Remove duplicates and limit
        for key in entities:
            entities[key] = list(set(entities[key]))[:50]

        return entities

    def _fill_template(self, template: str, entities: Dict[str, List[str]]) -> str:
        """Fill query template with random entities"""
        import re

        def replace_placeholder(match):
            placeholder = match.group(1)

            # Map placeholders to entity types
            entity_mapping = {
                'concept': 'concepts',
                'concept1': 'concepts',
                'concept2': 'concepts',
                'technology': 'technologies',
                'method': 'methods',
                'method1': 'methods',
                'method2': 'methods',
                'metric': 'metrics',
                'event': 'events',
                'items': 'concepts',
                'option1': 'concepts',
                'option2': 'concepts',
                'approach': 'methods',
                'problem': 'concepts',
                'domain': 'concepts',
                'system1': 'concepts',
                'system2': 'concepts'
            }

            entity_type = entity_mapping.get(placeholder, 'concepts')
            available_entities = entities.get(entity_type, ['something'])

            if available_entities:
                return random.choice(available_entities)
            else:
                return 'something'

        return re.sub(r'\{(\w+)\}', replace_placeholder, template)

    def _find_relevant_chunks(self, query: str, chunks: List[Dict], k: int = 3) -> List[Dict]:
        """Find chunks most relevant to the query"""
        # Simple keyword matching for synthetic generation
        query_words = set(query.lower().split())

        chunk_scores = []
        for chunk in chunks:
            chunk_words = set(chunk['content'].lower().split())
            overlap = len(query_words & chunk_words)
            chunk_scores.append((overlap, chunk))

        # Sort by overlap and return top k
        chunk_scores.sort(key=lambda x: x[0], reverse=True)
        return [chunk for _, chunk in chunk_scores[:k]]
```
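
Continuing the small dataset example above, generated queries can be appended directly to it (`num_queries` is kept small here for illustration):

```python
generator = SyntheticQueryGenerator()

chunks = list(dataset.chunks.values())  # chunks added via EvaluationDataset.add_chunk
for synthetic_query in generator.generate_queries_from_chunks(chunks, num_queries=20):
    dataset.add_query(synthetic_query)
```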

## A/B Testing Framework

### Statistical Significance Testing

```python
from typing import Dict, List

import numpy as np
from scipy import stats


class ABTestAnalyzer:
    def __init__(self):
        self.significance_level = 0.05

    def compare_metrics(self, control_metrics: List[float],
                        treatment_metrics: List[float],
                        metric_name: str) -> Dict:
        """
        Compare metrics between control and treatment groups
        """
        control_mean = np.mean(control_metrics)
        treatment_mean = np.mean(treatment_metrics)

        # Sample standard deviations (ddof=1), consistent with the pooled estimate below
        control_std = np.std(control_metrics, ddof=1)
        treatment_std = np.std(treatment_metrics, ddof=1)

        # Perform t-test
        t_statistic, p_value = stats.ttest_ind(control_metrics, treatment_metrics)

        # Calculate effect size (Cohen's d)
        pooled_std = np.sqrt(((len(control_metrics) - 1) * control_std**2 +
                              (len(treatment_metrics) - 1) * treatment_std**2) /
                             (len(control_metrics) + len(treatment_metrics) - 2))

        cohens_d = (treatment_mean - control_mean) / pooled_std if pooled_std > 0 else 0

        # Determine significance
        is_significant = p_value < self.significance_level

        return {
            'metric_name': metric_name,
            'control_mean': control_mean,
            'treatment_mean': treatment_mean,
            'absolute_difference': treatment_mean - control_mean,
            'relative_difference': ((treatment_mean - control_mean) / control_mean * 100) if control_mean != 0 else 0,
            'control_std': control_std,
            'treatment_std': treatment_std,
            't_statistic': t_statistic,
            'p_value': p_value,
            'is_significant': is_significant,
            'effect_size': cohens_d,
            'significance_level': self.significance_level
        }

    def analyze_ab_test_results(self,
                                control_results: Dict[str, List[float]],
                                treatment_results: Dict[str, List[float]]) -> Dict:
        """
        Analyze A/B test results across multiple metrics
        """
        analysis_results = {}

        # Only compare metrics present in both groups
        all_metrics = set(control_results.keys()) & set(treatment_results.keys())

        for metric in all_metrics:
            analysis_results[metric] = self.compare_metrics(
                control_results[metric],
                treatment_results[metric],
                metric
            )

        # Calculate overall summary
        significant_improvements = sum(1 for result in analysis_results.values()
                                       if result['is_significant'] and result['relative_difference'] > 0)
        significant_degradations = sum(1 for result in analysis_results.values()
                                       if result['is_significant'] and result['relative_difference'] < 0)

        analysis_results['summary'] = {
            'total_metrics_compared': len(analysis_results),
            'significant_improvements': significant_improvements,
            'significant_degradations': significant_degradations,
            'no_significant_change': len(analysis_results) - significant_improvements - significant_degradations
        }

        return analysis_results
```
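
For example, comparing per-query F1 scores from two chunking configurations (the numbers below are invented):

```python
analyzer = ABTestAnalyzer()

control_f1 = [0.61, 0.58, 0.64, 0.60, 0.57, 0.63]    # e.g. fixed-size chunks
treatment_f1 = [0.66, 0.69, 0.64, 0.70, 0.67, 0.68]  # e.g. semantic chunks

report = analyzer.compare_metrics(control_f1, treatment_f1, "f1_score")
print(report['relative_difference'], report['p_value'], report['is_significant'])
```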

## Automated Evaluation Pipeline

### End-to-End Evaluation

```python
import random
from typing import Any, Dict, List

import numpy as np

# PerformanceProfiler, MemoryProfiler, ABTestAnalyzer, EvaluationDataset and the
# metric functions are defined earlier in this document. ChunkQualityAssessor and
# DocumentAnalyzer are assumed to be provided by the other references in this skill.


class ChunkingEvaluationPipeline:
    def __init__(self, strategies: Dict[str, Any], dataset: EvaluationDataset):
        self.strategies = strategies
        self.dataset = dataset
        self.results = {}
        self.profiler = PerformanceProfiler()
        self.memory_profiler = MemoryProfiler()

    def run_evaluation(self) -> Dict:
        """Run comprehensive evaluation of all strategies"""
        evaluation_results = {}

        for strategy_name, strategy in self.strategies.items():
            print(f"Evaluating strategy: {strategy_name}")

            # Reset profilers for each strategy
            self.profiler = PerformanceProfiler()
            self.memory_profiler = MemoryProfiler()

            # Evaluate strategy
            strategy_results = self._evaluate_strategy(strategy, strategy_name)
            evaluation_results[strategy_name] = strategy_results

        # Compare strategies
        comparison_results = self._compare_strategies(evaluation_results)

        return {
            'individual_results': evaluation_results,
            'comparison': comparison_results,
            'recommendations': self._generate_recommendations(comparison_results)
        }

    def _evaluate_strategy(self, strategy: Any, strategy_name: str) -> Dict:
        """Evaluate a single chunking strategy"""
        results = {
            'strategy_name': strategy_name,
            'retrieval_metrics': {},
            'quality_metrics': {},
            'performance_metrics': {}
        }

        # Track memory usage
        self.memory_profiler.take_memory_snapshot(f"{strategy_name}_start")

        # Process all documents
        self.profiler.start_timer('total_processing')

        all_chunks = {}
        for doc_id, content in self.dataset.documents.items():
            self.profiler.start_timer('chunking')
            chunks = strategy.chunk(content)
            self.profiler.end_timer('chunking')

            all_chunks[doc_id] = chunks

        self.memory_profiler.take_memory_snapshot(f"{strategy_name}_after_chunking")

        # Generate embeddings for chunks
        self.profiler.start_timer('embedding')
        chunk_embeddings = self._generate_embeddings(all_chunks)
        self.profiler.end_timer('embedding')

        self.memory_profiler.take_memory_snapshot(f"{strategy_name}_after_embedding")

        # Evaluate retrieval performance
        retrieval_results = self._evaluate_retrieval(all_chunks, chunk_embeddings)
        results['retrieval_metrics'] = retrieval_results

        # Evaluate chunk quality
        quality_results = self._evaluate_chunk_quality(all_chunks)
        results['quality_metrics'] = quality_results

        # Get performance metrics
        self.profiler.end_timer('total_processing')
        performance_metrics = self.profiler.get_performance_metrics(len(self.dataset.documents))
        results['performance_metrics'] = performance_metrics.__dict__

        # Get memory metrics
        self.memory_profiler.take_memory_snapshot(f"{strategy_name}_end")
        results['memory_metrics'] = {
            'peak_memory_mb': self.memory_profiler.get_peak_memory_usage(),
            'memory_by_operation': self.memory_profiler.get_memory_usage_by_operation()
        }

        return results

    def _evaluate_retrieval(self, all_chunks: Dict, chunk_embeddings: Dict) -> Dict:
        """Evaluate retrieval performance"""
        retrieval_metrics = {
            'precision': [],
            'recall': [],
            'f1_score': [],
            'mrr': [],
            'map': []
        }

        for query in self.dataset.queries:
            # Perform retrieval
            self.profiler.start_timer('search')
            retrieved_chunks = self._retrieve_chunks(query.question, chunk_embeddings, k=10)
            self.profiler.end_timer('search')

            # Ground-truth relevant chunks for this query; the metric
            # functions only need the 'id' field
            relevant_chunks = [{'id': chunk_id} for chunk_id in query.relevant_chunk_ids]

            # Calculate metrics
            precision = calculate_precision(retrieved_chunks, relevant_chunks)
            recall = calculate_recall(retrieved_chunks, relevant_chunks)
            f1 = calculate_f1_score(precision, recall)

            retrieval_metrics['precision'].append(precision)
            retrieval_metrics['recall'].append(recall)
            retrieval_metrics['f1_score'].append(f1)

        # Calculate averages (only for metrics that were actually collected)
        return {metric: float(np.mean(values)) for metric, values in retrieval_metrics.items() if values}

    def _evaluate_chunk_quality(self, all_chunks: Dict) -> Dict:
        """Evaluate quality of generated chunks"""
        quality_assessor = ChunkQualityAssessor()
        quality_scores = []

        for doc_id, chunks in all_chunks.items():
            # Analyze document
            content = self.dataset.documents[doc_id]
            analyzer = DocumentAnalyzer()
            analysis = analyzer.analyze(content)

            # Assess chunk quality
            scores = quality_assessor.assess_chunks(chunks, analysis)
            quality_scores.append(scores)

        # Aggregate quality scores
        if quality_scores:
            avg_scores = {}
            for metric in quality_scores[0].keys():
                avg_scores[metric] = np.mean([scores[metric] for scores in quality_scores])
            return avg_scores

        return {}

    def _compare_strategies(self, evaluation_results: Dict) -> Dict:
        """Compare performance across strategies"""
        ab_analyzer = ABTestAnalyzer()

        comparison = {}

        # Compare each metric across strategies
        strategy_names = list(evaluation_results.keys())

        for i in range(len(strategy_names)):
            for j in range(i + 1, len(strategy_names)):
                strategy1 = strategy_names[i]
                strategy2 = strategy_names[j]

                comparison_key = f"{strategy1}_vs_{strategy2}"
                comparison[comparison_key] = {}

                # Compare retrieval metrics
                # Note: the aggregated values are wrapped in single-element lists here;
                # for a meaningful significance test, pass per-query score lists instead.
                for metric in ['precision', 'recall', 'f1_score']:
                    if (metric in evaluation_results[strategy1]['retrieval_metrics'] and
                            metric in evaluation_results[strategy2]['retrieval_metrics']):

                        comparison[comparison_key][f"retrieval_{metric}"] = ab_analyzer.compare_metrics(
                            [evaluation_results[strategy1]['retrieval_metrics'][metric]],
                            [evaluation_results[strategy2]['retrieval_metrics'][metric]],
                            f"retrieval_{metric}"
                        )

        return comparison

    def _generate_recommendations(self, comparison_results: Dict) -> Dict:
        """Generate recommendations based on evaluation results"""
        recommendations = {
            'best_overall': None,
            'best_for_precision': None,
            'best_for_recall': None,
            'best_for_performance': None,
            'trade_offs': []
        }

        # This would analyze the comparison results and generate specific recommendations
        # Implementation depends on specific use case requirements

        return recommendations

    def _generate_embeddings(self, all_chunks: Dict) -> Dict:
        """Generate embeddings for all chunks"""
        # This would use the actual embedding model
        # Placeholder implementation
        embeddings = {}

        for doc_id, chunks in all_chunks.items():
            embeddings[doc_id] = []
            for chunk in chunks:
                # Generate embedding for chunk content
                embedding = np.random.rand(384)  # Placeholder
                embeddings[doc_id].append({
                    'chunk': chunk,
                    'embedding': embedding
                })

        return embeddings

    def _retrieve_chunks(self, query: str, chunk_embeddings: Dict, k: int = 10) -> List[Dict]:
        """Retrieve most relevant chunks for a query"""
        # This would use actual similarity search
        # Placeholder implementation
        all_chunks = []

        for doc_embeddings in chunk_embeddings.values():
            for chunk_data in doc_embeddings:
                all_chunks.append(chunk_data['chunk'])

        # Simple random selection as placeholder
        selected = random.sample(all_chunks, min(k, len(all_chunks)))

        return selected
```
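
A minimal sketch of running the pipeline, assuming chunking strategy objects that expose a `chunk(text)` method; `FixedSizeChunker` and `SemanticChunker` are hypothetical names standing in for implementations from the strategy references:

```python
# Hypothetical strategy classes; any object with a `chunk(text) -> List[Dict]`
# method (as called in _evaluate_strategy) can be plugged in here.
strategies = {
    'fixed_512': FixedSizeChunker(chunk_size=512),
    'semantic': SemanticChunker(similarity_threshold=0.8),
}

dataset = EvaluationDataset.load_from_file("demo_eval_dataset.json")
pipeline = ChunkingEvaluationPipeline(strategies, dataset)

results = pipeline.run_evaluation()
print(results['individual_results']['fixed_512']['retrieval_metrics'])
print(results['recommendations'])
```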

This evaluation framework provides the tools needed to assess chunking strategies across multiple dimensions: retrieval effectiveness, answer quality, system performance, and statistical significance. The modular design allows for extension and customization based on specific requirements and use cases.