# Performance Evaluation Framework

This document provides comprehensive methodologies for evaluating chunking strategy performance and effectiveness.

## Evaluation Metrics

### Core Retrieval Metrics

#### Retrieval Precision

Measures the fraction of retrieved chunks that are relevant to the query.

```python
from typing import Dict, List


def calculate_precision(retrieved_chunks: List[Dict], relevant_chunks: List[Dict]) -> float:
    """
    Calculate retrieval precision.

    Precision = |Relevant ∩ Retrieved| / |Retrieved|
    """
    retrieved_ids = {chunk.get('id') for chunk in retrieved_chunks}
    relevant_ids = {chunk.get('id') for chunk in relevant_chunks}

    intersection = retrieved_ids & relevant_ids

    if not retrieved_ids:
        return 0.0

    return len(intersection) / len(retrieved_ids)
```

#### Retrieval Recall

Measures the fraction of relevant chunks that are successfully retrieved.

```python
def calculate_recall(retrieved_chunks: List[Dict], relevant_chunks: List[Dict]) -> float:
    """
    Calculate retrieval recall.

    Recall = |Relevant ∩ Retrieved| / |Relevant|
    """
    retrieved_ids = {chunk.get('id') for chunk in retrieved_chunks}
    relevant_ids = {chunk.get('id') for chunk in relevant_chunks}

    intersection = retrieved_ids & relevant_ids

    if not relevant_ids:
        return 0.0

    return len(intersection) / len(relevant_ids)
```

#### F1-Score

Harmonic mean of precision and recall.

```python
def calculate_f1_score(precision: float, recall: float) -> float:
    """
    Calculate F1-score.

    F1 = 2 * (Precision * Recall) / (Precision + Recall)
    """
    if precision + recall == 0:
        return 0.0

    return 2 * (precision * recall) / (precision + recall)
```
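
For orientation, a minimal usage sketch of the three functions; the chunk IDs are invented for illustration and would normally come from the chunk store and the labeled evaluation set.

```python
# One of three retrieved chunks is relevant, and one of two relevant chunks was retrieved.
retrieved = [{'id': 'c1'}, {'id': 'c2'}, {'id': 'c5'}]
relevant = [{'id': 'c1'}, {'id': 'c3'}]

precision = calculate_precision(retrieved, relevant)  # 1/3
recall = calculate_recall(retrieved, relevant)        # 1/2
f1 = calculate_f1_score(precision, recall)            # 0.4

print(f"P={precision:.2f} R={recall:.2f} F1={f1:.2f}")
```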

### Mean Reciprocal Rank (MRR)

Measures the rank of the first relevant result.

```python
def calculate_mrr(queries: List[Dict], results: List[List[Dict]]) -> float:
    """
    Calculate Mean Reciprocal Rank.
    """
    reciprocal_ranks = []

    for query, query_results in zip(queries, results):
        relevant_found = False

        for rank, result in enumerate(query_results, 1):
            if result.get('is_relevant', False):
                reciprocal_ranks.append(1.0 / rank)
                relevant_found = True
                break

        if not relevant_found:
            reciprocal_ranks.append(0.0)

    # Guard against an empty query set
    if not reciprocal_ranks:
        return 0.0

    return sum(reciprocal_ranks) / len(reciprocal_ranks)
```

### Mean Average Precision (MAP)

Considers both precision and the ranking of relevant documents.

```python
def calculate_average_precision(retrieved_chunks: List[Dict], relevant_chunks: List[Dict]) -> float:
    """
    Calculate Average Precision for a single query.
    """
    relevant_ids = {chunk.get('id') for chunk in relevant_chunks}

    if not relevant_ids:
        return 0.0

    precisions = []
    relevant_count = 0

    for rank, chunk in enumerate(retrieved_chunks, 1):
        if chunk.get('id') in relevant_ids:
            relevant_count += 1
            precision_at_rank = relevant_count / rank
            precisions.append(precision_at_rank)

    return sum(precisions) / len(relevant_ids)


def calculate_map(queries: List[Dict], results: List[List[Dict]]) -> float:
    """
    Calculate Mean Average Precision across multiple queries.
    """
    average_precisions = []

    for query, query_results in zip(queries, results):
        ap = calculate_average_precision(query_results, query.get('relevant_chunks', []))
        average_precisions.append(ap)

    return sum(average_precisions) / len(average_precisions) if average_precisions else 0.0
```

### Normalized Discounted Cumulative Gain (NDCG)

Measures ranking quality with emphasis on highly relevant results.

```python
import numpy as np


def calculate_dcg(retrieved_chunks: List[Dict]) -> float:
    """
    Calculate Discounted Cumulative Gain.
    """
    dcg = 0.0

    for rank, chunk in enumerate(retrieved_chunks, 1):
        relevance = chunk.get('relevance_score', 0)
        dcg += relevance / np.log2(rank + 1)

    return dcg


def calculate_ndcg(retrieved_chunks: List[Dict], ideal_chunks: List[Dict]) -> float:
    """
    Calculate Normalized Discounted Cumulative Gain.
    """
    dcg = calculate_dcg(retrieved_chunks)
    idcg = calculate_dcg(ideal_chunks)

    if idcg == 0:
        return 0.0

    return dcg / idcg
```
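
A short usage sketch: the graded `relevance_score` values are invented, and the ideal ranking is simply the same chunks sorted by relevance, which is what the IDCG denominator expects.

```python
retrieved = [
    {'id': 'c2', 'relevance_score': 1},
    {'id': 'c7', 'relevance_score': 0},
    {'id': 'c1', 'relevance_score': 3},
]
ideal = sorted(retrieved, key=lambda c: c['relevance_score'], reverse=True)

print(f"NDCG = {calculate_ndcg(retrieved, ideal):.3f}")  # < 1.0 because 'c1' is ranked too low
```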

## End-to-End RAG Evaluation

### Answer Quality Metrics

#### Factual Consistency

Measures how well the generated answer aligns with retrieved chunks.

```python
from typing import List

import spacy
from transformers import pipeline


class FactualConsistencyEvaluator:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.nli_pipeline = pipeline("text-classification",
                                     model="roberta-large-mnli")

    def evaluate_consistency(self, answer: str, retrieved_chunks: List[str]) -> float:
        """
        Evaluate factual consistency between answer and retrieved context.
        """
        if not retrieved_chunks:
            return 0.0

        # Combine retrieved chunks as context
        context = " ".join(retrieved_chunks[:3])  # Use top 3 chunks

        # Use Natural Language Inference to check consistency; the MNLI model
        # expects a (premise, hypothesis) pair rather than one concatenated string
        result = self.nli_pipeline({"text": context, "text_pair": answer})
        if isinstance(result, dict):
            result = [result]

        # Extract consistency score (entailment probability)
        for item in result:
            if item['label'] == 'ENTAILMENT':
                return item['score']
            elif item['label'] == 'CONTRADICTION':
                return 1.0 - item['score']

        return 0.5  # Neutral if NLI is inconclusive
```

#### Answer Completeness

Measures how completely the answer addresses the user's query.

```python
def evaluate_completeness(answer: str, query: str, reference_answer: str = None) -> float:
    """
    Evaluate answer completeness.
    """
    # Extract key entities from query and answer
    query_entities = extract_entities(query)
    answer_entities = extract_entities(answer)

    # Calculate entity coverage
    if not query_entities:
        return 0.5  # Neutral if no entities in query

    covered_entities = query_entities & answer_entities
    entity_coverage = len(covered_entities) / len(query_entities)

    # If a reference answer is available, compare against it as well
    if reference_answer:
        reference_entities = extract_entities(reference_answer)
        answer_reference_overlap = len(answer_entities & reference_entities) / max(len(reference_entities), 1)
        return (entity_coverage + answer_reference_overlap) / 2

    return entity_coverage


def extract_entities(text: str) -> set:
    """
    Extract named entities from text (simplified).
    """
    # This would use a proper NER model in practice;
    # simple capitalized noun-phrase extraction serves as a placeholder.
    import re

    noun_phrases = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
    return set(noun_phrases)
```

#### Response Relevance

Measures how relevant the answer is to the original query.

```python
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


class RelevanceEvaluator:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def evaluate_relevance(self, query: str, answer: str) -> float:
        """
        Evaluate semantic relevance between query and answer.
        """
        # Generate embeddings
        query_embedding = self.model.encode([query])
        answer_embedding = self.model.encode([answer])

        # Calculate cosine similarity
        similarity = cosine_similarity(query_embedding, answer_embedding)[0][0]

        return float(similarity)
```
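
A brief usage sketch; both strings are illustrative, and the returned score is a cosine similarity, typically between 0 and 1 for related text.

```python
evaluator = RelevanceEvaluator()
score = evaluator.evaluate_relevance(
    "What chunk size works best for legal contracts?",
    "Clause-level chunks of roughly 200-400 tokens tend to retrieve contract text best.",
)
print(f"relevance = {score:.3f}")
```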

## Performance Metrics

### Processing Time

```python
import time
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class PerformanceMetrics:
    total_time: float
    chunking_time: float
    embedding_time: float
    search_time: float
    generation_time: float
    throughput: float  # documents per second


class PerformanceProfiler:
    def __init__(self):
        self.timings = {}
        self.start_times = {}

    def start_timer(self, operation: str):
        self.start_times[operation] = time.time()

    def end_timer(self, operation: str):
        if operation in self.start_times:
            duration = time.time() - self.start_times[operation]
            if operation not in self.timings:
                self.timings[operation] = []
            self.timings[operation].append(duration)
            return duration
        return 0.0

    def get_performance_metrics(self, document_count: int) -> PerformanceMetrics:
        total_time = sum(sum(times) for times in self.timings.values())

        return PerformanceMetrics(
            total_time=total_time,
            chunking_time=sum(self.timings.get('chunking', [0])),
            embedding_time=sum(self.timings.get('embedding', [0])),
            search_time=sum(self.timings.get('search', [0])),
            generation_time=sum(self.timings.get('generation', [0])),
            throughput=document_count / total_time if total_time > 0 else 0
        )
```
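
A usage sketch of the profiler; `time.sleep` stands in for real pipeline stages.

```python
profiler = PerformanceProfiler()

profiler.start_timer('chunking')
time.sleep(0.05)  # placeholder for the chunking call
profiler.end_timer('chunking')

profiler.start_timer('embedding')
time.sleep(0.10)  # placeholder for the embedding call
profiler.end_timer('embedding')

metrics = profiler.get_performance_metrics(document_count=1)
print(f"{metrics.total_time:.2f}s total, {metrics.throughput:.1f} docs/s")
```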

### Memory Usage

```python
import os
import time
from typing import Dict, List

import psutil


class MemoryProfiler:
    def __init__(self):
        self.process = psutil.Process(os.getpid())
        self.memory_snapshots = []

    def take_memory_snapshot(self, label: str):
        """Take a snapshot of current memory usage"""
        memory_info = self.process.memory_info()
        memory_mb = memory_info.rss / 1024 / 1024  # Convert to MB

        self.memory_snapshots.append({
            'label': label,
            'memory_mb': memory_mb,
            'timestamp': time.time()
        })

    def get_peak_memory_usage(self) -> float:
        """Get peak memory usage in MB"""
        if not self.memory_snapshots:
            return 0.0
        return max(snapshot['memory_mb'] for snapshot in self.memory_snapshots)

    def get_memory_usage_by_operation(self) -> Dict[str, float]:
        """Get memory usage breakdown by operation"""
        if not self.memory_snapshots:
            return {}

        memory_by_op = {}
        for i in range(1, len(self.memory_snapshots)):
            prev_snapshot = self.memory_snapshots[i - 1]
            curr_snapshot = self.memory_snapshots[i]

            operation = curr_snapshot['label']
            memory_delta = curr_snapshot['memory_mb'] - prev_snapshot['memory_mb']

            if operation not in memory_by_op:
                memory_by_op[operation] = []
            memory_by_op[operation].append(memory_delta)

        return {op: sum(deltas) for op, deltas in memory_by_op.items()}
```
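
A usage sketch: snapshots are taken around each stage, and each delta is attributed to the label of the later snapshot. The allocation below is only a stand-in for a memory-hungry stage.

```python
mem = MemoryProfiler()
mem.take_memory_snapshot('start')

buffer = [0] * 5_000_000  # roughly 40 MB of list pointers on CPython
mem.take_memory_snapshot('chunking')

print(f"peak RSS: {mem.get_peak_memory_usage():.1f} MB")
print(mem.get_memory_usage_by_operation())  # per-stage RSS deltas in MB
```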

## Evaluation Datasets

### Standardized Test Sets

#### Question-Answer Pairs

```python
import json
from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class EvaluationQuery:
    id: str
    question: str
    reference_answer: Optional[str]
    relevant_chunk_ids: List[str]
    query_type: str   # factoid, analytical, comparative
    difficulty: str   # easy, medium, hard
    domain: str       # finance, medical, legal, technical


class EvaluationDataset:
    def __init__(self, name: str):
        self.name = name
        self.queries: List[EvaluationQuery] = []
        self.documents: Dict[str, str] = {}
        self.chunks: Dict[str, Dict] = {}

    def add_query(self, query: EvaluationQuery):
        self.queries.append(query)

    def add_document(self, doc_id: str, content: str):
        self.documents[doc_id] = content

    def add_chunk(self, chunk_id: str, content: str, doc_id: str, metadata: Dict):
        self.chunks[chunk_id] = {
            'id': chunk_id,
            'content': content,
            'doc_id': doc_id,
            'metadata': metadata
        }

    def save_to_file(self, filepath: str):
        data = {
            'name': self.name,
            'queries': [
                {
                    'id': q.id,
                    'question': q.question,
                    'reference_answer': q.reference_answer,
                    'relevant_chunk_ids': q.relevant_chunk_ids,
                    'query_type': q.query_type,
                    'difficulty': q.difficulty,
                    'domain': q.domain
                }
                for q in self.queries
            ],
            'documents': self.documents,
            'chunks': self.chunks
        }

        with open(filepath, 'w') as f:
            json.dump(data, f, indent=2)

    @classmethod
    def load_from_file(cls, filepath: str):
        with open(filepath, 'r') as f:
            data = json.load(f)

        dataset = cls(data['name'])
        dataset.documents = data['documents']
        dataset.chunks = data['chunks']

        for q_data in data['queries']:
            query = EvaluationQuery(
                id=q_data['id'],
                question=q_data['question'],
                reference_answer=q_data.get('reference_answer'),
                relevant_chunk_ids=q_data['relevant_chunk_ids'],
                query_type=q_data['query_type'],
                difficulty=q_data['difficulty'],
                domain=q_data['domain']
            )
            dataset.add_query(query)

        return dataset
```
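
A usage sketch that assembles a tiny hand-labeled dataset and round-trips it to disk; all names, texts, and the `demo_eval.json` path are illustrative.

```python
dataset = EvaluationDataset("demo")
dataset.add_document("doc1", "RAG systems retrieve chunks before generating answers...")
dataset.add_chunk("doc1_c0", "RAG systems retrieve chunks", "doc1", {"position": 0})
dataset.add_query(EvaluationQuery(
    id="q1",
    question="What do RAG systems retrieve?",
    reference_answer="They retrieve chunks of source documents.",
    relevant_chunk_ids=["doc1_c0"],
    query_type="factoid",
    difficulty="easy",
    domain="technical",
))

dataset.save_to_file("demo_eval.json")
reloaded = EvaluationDataset.load_from_file("demo_eval.json")
assert reloaded.queries[0].question == "What do RAG systems retrieve?"
```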

### Dataset Generation

#### Synthetic Query Generation

```python
import random
import re
from typing import Dict, List


class SyntheticQueryGenerator:
    def __init__(self):
        self.query_templates = {
            'factoid': [
                "What is {concept}?",
                "When did {event} occur?",
                "Who developed {technology}?",
                "How many {items} are mentioned?",
                "What is the value of {metric}?"
            ],
            'analytical': [
                "Compare and contrast {concept1} and {concept2}.",
                "Analyze the impact of {concept} on {domain}.",
                "What are the advantages and disadvantages of {technology}?",
                "Explain the relationship between {concept1} and {concept2}.",
                "Evaluate the effectiveness of {approach} for {problem}."
            ],
            'comparative': [
                "Which is better: {option1} or {option2}?",
                "How does {method1} differ from {method2}?",
                "Compare the performance of {system1} and {system2}.",
                "What are the key differences between {approach1} and {approach2}?"
            ]
        }

    def generate_queries_from_chunks(self, chunks: List[Dict], num_queries: int = 100) -> List[EvaluationQuery]:
        """Generate synthetic queries from document chunks"""
        queries = []

        # Extract entities and concepts from chunks
        entities = self._extract_entities_from_chunks(chunks)

        for i in range(num_queries):
            query_type = random.choice(['factoid', 'analytical', 'comparative'])
            template = random.choice(self.query_templates[query_type])

            # Fill template with extracted entities
            query_text = self._fill_template(template, entities)

            # Find relevant chunks for this query
            relevant_chunks = self._find_relevant_chunks(query_text, chunks)

            query = EvaluationQuery(
                id=f"synthetic_{i}",
                question=query_text,
                reference_answer=None,  # Would need a generation model
                relevant_chunk_ids=[chunk['id'] for chunk in relevant_chunks],
                query_type=query_type,
                difficulty=random.choice(['easy', 'medium', 'hard']),
                domain='synthetic'
            )

            queries.append(query)

        return queries

    def _extract_entities_from_chunks(self, chunks: List[Dict]) -> Dict[str, List[str]]:
        """Extract entities, concepts, and relationships from chunks"""
        # This would use proper NER in practice
        entities = {
            'concepts': [],
            'technologies': [],
            'methods': [],
            'metrics': [],
            'events': []
        }

        for chunk in chunks:
            content = chunk['content']
            # Simplified entity extraction
            words = content.split()
            entities['concepts'].extend([word for word in words if len(word) > 6])
            entities['technologies'].extend([word for word in words if 'technology' in word.lower()])
            entities['methods'].extend([word for word in words if 'method' in word.lower()])
            entities['metrics'].extend([word for word in words if '%' in word or '$' in word])

        # Remove duplicates and limit
        for key in entities:
            entities[key] = list(set(entities[key]))[:50]

        return entities

    def _fill_template(self, template: str, entities: Dict[str, List[str]]) -> str:
        """Fill query template with random entities"""

        def replace_placeholder(match):
            placeholder = match.group(1)

            # Map placeholders to entity types
            entity_mapping = {
                'concept': 'concepts',
                'concept1': 'concepts',
                'concept2': 'concepts',
                'technology': 'technologies',
                'method': 'methods',
                'method1': 'methods',
                'method2': 'methods',
                'metric': 'metrics',
                'event': 'events',
                'items': 'concepts',
                'option1': 'concepts',
                'option2': 'concepts',
                'approach': 'methods',
                'approach1': 'methods',
                'approach2': 'methods',
                'problem': 'concepts',
                'domain': 'concepts',
                'system1': 'concepts',
                'system2': 'concepts'
            }

            entity_type = entity_mapping.get(placeholder, 'concepts')
            available_entities = entities.get(entity_type, ['something'])

            if available_entities:
                return random.choice(available_entities)
            else:
                return 'something'

        return re.sub(r'\{(\w+)\}', replace_placeholder, template)

    def _find_relevant_chunks(self, query: str, chunks: List[Dict], k: int = 3) -> List[Dict]:
        """Find chunks most relevant to the query"""
        # Simple keyword matching for synthetic generation
        query_words = set(query.lower().split())

        chunk_scores = []
        for chunk in chunks:
            chunk_words = set(chunk['content'].lower().split())
            overlap = len(query_words & chunk_words)
            chunk_scores.append((overlap, chunk))

        # Sort by overlap and return top k
        chunk_scores.sort(key=lambda x: x[0], reverse=True)
        return [chunk for _, chunk in chunk_scores[:k]]
```
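
A usage sketch, assuming the `EvaluationQuery` dataclass defined above is in scope; the two chunks are hand-written, whereas in practice they come from the chunking strategy under test and `num_queries` is much larger.

```python
chunks = [
    {'id': 'c0', 'content': 'Semantic chunking methods split documents at topic boundaries.'},
    {'id': 'c1', 'content': 'Fixed-size chunking uses a constant token window with overlap.'},
]

generator = SyntheticQueryGenerator()
for q in generator.generate_queries_from_chunks(chunks, num_queries=5):
    print(f"[{q.query_type}] {q.question}  relevant={q.relevant_chunk_ids}")
```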

## A/B Testing Framework

### Statistical Significance Testing

```python
import numpy as np
from scipy import stats
from typing import Dict, List, Tuple


class ABTestAnalyzer:
    def __init__(self):
        self.significance_level = 0.05

    def compare_metrics(self, control_metrics: List[float],
                        treatment_metrics: List[float],
                        metric_name: str) -> Dict:
        """
        Compare metrics between control and treatment groups.
        """
        control_mean = np.mean(control_metrics)
        treatment_mean = np.mean(treatment_metrics)

        # Sample standard deviations (ddof=1) to match the pooled-variance formula below
        control_std = np.std(control_metrics, ddof=1)
        treatment_std = np.std(treatment_metrics, ddof=1)

        # Perform t-test
        t_statistic, p_value = stats.ttest_ind(control_metrics, treatment_metrics)

        # Calculate effect size (Cohen's d)
        pooled_std = np.sqrt(((len(control_metrics) - 1) * control_std**2 +
                              (len(treatment_metrics) - 1) * treatment_std**2) /
                             (len(control_metrics) + len(treatment_metrics) - 2))

        cohens_d = (treatment_mean - control_mean) / pooled_std if pooled_std > 0 else 0

        # Determine significance
        is_significant = p_value < self.significance_level

        return {
            'metric_name': metric_name,
            'control_mean': control_mean,
            'treatment_mean': treatment_mean,
            'absolute_difference': treatment_mean - control_mean,
            'relative_difference': ((treatment_mean - control_mean) / control_mean * 100) if control_mean != 0 else 0,
            'control_std': control_std,
            'treatment_std': treatment_std,
            't_statistic': t_statistic,
            'p_value': p_value,
            'is_significant': is_significant,
            'effect_size': cohens_d,
            'significance_level': self.significance_level
        }

    def analyze_ab_test_results(self,
                                control_results: Dict[str, List[float]],
                                treatment_results: Dict[str, List[float]]) -> Dict:
        """
        Analyze A/B test results across multiple metrics.
        """
        analysis_results = {}

        # Only compare metrics present in both groups
        all_metrics = set(control_results.keys()) & set(treatment_results.keys())

        for metric in all_metrics:
            analysis_results[metric] = self.compare_metrics(
                control_results[metric],
                treatment_results[metric],
                metric
            )

        # Calculate overall summary
        significant_improvements = sum(1 for result in analysis_results.values()
                                       if result['is_significant'] and result['relative_difference'] > 0)
        significant_degradations = sum(1 for result in analysis_results.values()
                                       if result['is_significant'] and result['relative_difference'] < 0)

        analysis_results['summary'] = {
            'total_metrics_compared': len(analysis_results),
            'significant_improvements': significant_improvements,
            'significant_degradations': significant_degradations,
            'no_significant_change': len(analysis_results) - significant_improvements - significant_degradations
        }

        return analysis_results
```
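
A usage sketch comparing per-query F1 scores for two strategies; the numbers are invented.

```python
analyzer = ABTestAnalyzer()
report = analyzer.analyze_ab_test_results(
    control_results={'f1_score': [0.42, 0.51, 0.39, 0.47, 0.44]},
    treatment_results={'f1_score': [0.55, 0.49, 0.58, 0.52, 0.60]},
)
print(report['f1_score']['p_value'], report['f1_score']['effect_size'])
print(report['summary'])
```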

## Automated Evaluation Pipeline

### End-to-End Evaluation

```python
import random
from typing import Any, Dict, List

import numpy as np

# ChunkQualityAssessor and DocumentAnalyzer are assumed to be provided by the
# chunk-quality tooling described elsewhere in this documentation.


class ChunkingEvaluationPipeline:
    def __init__(self, strategies: Dict[str, Any], dataset: EvaluationDataset):
        self.strategies = strategies
        self.dataset = dataset
        self.results = {}
        self.profiler = PerformanceProfiler()
        self.memory_profiler = MemoryProfiler()

    def run_evaluation(self) -> Dict:
        """Run comprehensive evaluation of all strategies"""
        evaluation_results = {}

        for strategy_name, strategy in self.strategies.items():
            print(f"Evaluating strategy: {strategy_name}")

            # Reset profilers for each strategy
            self.profiler = PerformanceProfiler()
            self.memory_profiler = MemoryProfiler()

            # Evaluate strategy
            strategy_results = self._evaluate_strategy(strategy, strategy_name)
            evaluation_results[strategy_name] = strategy_results

        # Compare strategies
        comparison_results = self._compare_strategies(evaluation_results)

        return {
            'individual_results': evaluation_results,
            'comparison': comparison_results,
            'recommendations': self._generate_recommendations(comparison_results)
        }

    def _evaluate_strategy(self, strategy: Any, strategy_name: str) -> Dict:
        """Evaluate a single chunking strategy"""
        results = {
            'strategy_name': strategy_name,
            'retrieval_metrics': {},
            'quality_metrics': {},
            'performance_metrics': {}
        }

        # Track memory usage
        self.memory_profiler.take_memory_snapshot(f"{strategy_name}_start")

        # Process all documents
        self.profiler.start_timer('total_processing')

        all_chunks = {}
        for doc_id, content in self.dataset.documents.items():
            self.profiler.start_timer('chunking')
            chunks = strategy.chunk(content)
            self.profiler.end_timer('chunking')

            all_chunks[doc_id] = chunks

        self.memory_profiler.take_memory_snapshot(f"{strategy_name}_after_chunking")

        # Generate embeddings for chunks
        self.profiler.start_timer('embedding')
        chunk_embeddings = self._generate_embeddings(all_chunks)
        self.profiler.end_timer('embedding')

        self.memory_profiler.take_memory_snapshot(f"{strategy_name}_after_embedding")

        # Evaluate retrieval performance
        results['retrieval_metrics'] = self._evaluate_retrieval(all_chunks, chunk_embeddings)

        # Evaluate chunk quality
        results['quality_metrics'] = self._evaluate_chunk_quality(all_chunks)

        # Get performance metrics
        self.profiler.end_timer('total_processing')
        performance_metrics = self.profiler.get_performance_metrics(len(self.dataset.documents))
        results['performance_metrics'] = performance_metrics.__dict__

        # Get memory metrics
        self.memory_profiler.take_memory_snapshot(f"{strategy_name}_end")
        results['memory_metrics'] = {
            'peak_memory_mb': self.memory_profiler.get_peak_memory_usage(),
            'memory_by_operation': self.memory_profiler.get_memory_usage_by_operation()
        }

        return results

    def _evaluate_retrieval(self, all_chunks: Dict, chunk_embeddings: Dict) -> Dict:
        """Evaluate retrieval performance"""
        retrieval_metrics = {
            'precision': [],
            'recall': [],
            'f1_score': []
        }

        for query in self.dataset.queries:
            # Perform retrieval
            self.profiler.start_timer('search')
            retrieved_chunks = self._retrieve_chunks(query.question, chunk_embeddings, k=10)
            self.profiler.end_timer('search')

            # Ground-truth relevant chunks come from the labeled dataset,
            # not from the retrieved set itself
            relevant_chunks = [{'id': chunk_id} for chunk_id in query.relevant_chunk_ids]

            # Calculate metrics
            precision = calculate_precision(retrieved_chunks, relevant_chunks)
            recall = calculate_recall(retrieved_chunks, relevant_chunks)
            f1 = calculate_f1_score(precision, recall)

            retrieval_metrics['precision'].append(precision)
            retrieval_metrics['recall'].append(recall)
            retrieval_metrics['f1_score'].append(f1)

        # Calculate averages (MRR and MAP could be aggregated here in the same way)
        return {metric: float(np.mean(values)) for metric, values in retrieval_metrics.items() if values}

    def _evaluate_chunk_quality(self, all_chunks: Dict) -> Dict:
        """Evaluate quality of generated chunks"""
        quality_assessor = ChunkQualityAssessor()
        quality_scores = []

        for doc_id, chunks in all_chunks.items():
            # Analyze document
            content = self.dataset.documents[doc_id]
            analyzer = DocumentAnalyzer()
            analysis = analyzer.analyze(content)

            # Assess chunk quality
            scores = quality_assessor.assess_chunks(chunks, analysis)
            quality_scores.append(scores)

        # Aggregate quality scores
        if quality_scores:
            avg_scores = {}
            for metric in quality_scores[0].keys():
                avg_scores[metric] = np.mean([scores[metric] for scores in quality_scores])
            return avg_scores

        return {}

    def _compare_strategies(self, evaluation_results: Dict) -> Dict:
        """Compare performance across strategies"""
        ab_analyzer = ABTestAnalyzer()

        comparison = {}

        # Compare each metric across strategy pairs
        strategy_names = list(evaluation_results.keys())

        for i in range(len(strategy_names)):
            for j in range(i + 1, len(strategy_names)):
                strategy1 = strategy_names[i]
                strategy2 = strategy_names[j]

                comparison_key = f"{strategy1}_vs_{strategy2}"
                comparison[comparison_key] = {}

                # Compare retrieval metrics. Note: with a single averaged value per
                # strategy the t-test is degenerate; in practice, pass the per-query
                # metric lists instead of their means.
                for metric in ['precision', 'recall', 'f1_score']:
                    if (metric in evaluation_results[strategy1]['retrieval_metrics'] and
                            metric in evaluation_results[strategy2]['retrieval_metrics']):

                        comparison[comparison_key][f"retrieval_{metric}"] = ab_analyzer.compare_metrics(
                            [evaluation_results[strategy1]['retrieval_metrics'][metric]],
                            [evaluation_results[strategy2]['retrieval_metrics'][metric]],
                            f"retrieval_{metric}"
                        )

        return comparison

    def _generate_recommendations(self, comparison_results: Dict) -> Dict:
        """Generate recommendations based on evaluation results"""
        recommendations = {
            'best_overall': None,
            'best_for_precision': None,
            'best_for_recall': None,
            'best_for_performance': None,
            'trade_offs': []
        }

        # This would analyze the comparison results and generate specific
        # recommendations; the implementation depends on use-case requirements.

        return recommendations

    def _generate_embeddings(self, all_chunks: Dict) -> Dict:
        """Generate embeddings for all chunks"""
        # This would use the actual embedding model; placeholder implementation
        embeddings = {}

        for doc_id, chunks in all_chunks.items():
            embeddings[doc_id] = []
            for chunk in chunks:
                # Generate embedding for chunk content
                embedding = np.random.rand(384)  # Placeholder
                embeddings[doc_id].append({
                    'chunk': chunk,
                    'embedding': embedding
                })

        return embeddings

    def _retrieve_chunks(self, query: str, chunk_embeddings: Dict, k: int = 10) -> List[Dict]:
        """Retrieve most relevant chunks for a query"""
        # This would use actual similarity search; placeholder implementation
        all_chunks = []

        for doc_embeddings in chunk_embeddings.values():
            for chunk_data in doc_embeddings:
                all_chunks.append(chunk_data['chunk'])

        # Simple random selection as placeholder
        selected = random.sample(all_chunks, min(k, len(all_chunks)))

        return selected
```
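
A minimal end-to-end sketch, assuming the `ChunkQualityAssessor` and `DocumentAnalyzer` helpers referenced above are importable and that the `demo_eval.json` file from the earlier dataset sketch exists. The toy strategy only illustrates the `chunk()` interface the pipeline expects; its chunk IDs must line up with the dataset's labeled `relevant_chunk_ids` for the retrieval metrics to be meaningful.

```python
class NaiveFixedSizeStrategy:
    def __init__(self, size: int = 200):
        self.size = size

    def chunk(self, text: str):
        # Split text into fixed-size character windows with sequential IDs
        return [{'id': f"chunk_{i}", 'content': text[i:i + self.size]}
                for i in range(0, len(text), self.size)]


dataset = EvaluationDataset.load_from_file("demo_eval.json")
pipeline = ChunkingEvaluationPipeline(
    strategies={'fixed_200': NaiveFixedSizeStrategy(200),
                'fixed_400': NaiveFixedSizeStrategy(400)},
    dataset=dataset,
)
report = pipeline.run_evaluation()
print(report['individual_results']['fixed_200']['retrieval_metrics'])
```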

This comprehensive evaluation framework provides the tools needed to thoroughly assess chunking strategies across multiple dimensions: retrieval effectiveness, answer quality, system performance, and statistical significance. The modular design allows for easy extension and customization based on specific requirements and use cases.