# Performance Evaluation Framework

This document provides methodologies for evaluating the performance and effectiveness of chunking strategies.

## Evaluation Metrics

### Core Retrieval Metrics

#### Retrieval Precision

Measures the fraction of retrieved chunks that are relevant to the query.

```python
from typing import Dict, List


def calculate_precision(retrieved_chunks: List[Dict], relevant_chunks: List[Dict]) -> float:
    """
    Calculate retrieval precision.

    Precision = |Relevant ∩ Retrieved| / |Retrieved|
    """
    retrieved_ids = {chunk.get('id') for chunk in retrieved_chunks}
    relevant_ids = {chunk.get('id') for chunk in relevant_chunks}
    intersection = retrieved_ids & relevant_ids

    if not retrieved_ids:
        return 0.0
    return len(intersection) / len(retrieved_ids)
```

#### Retrieval Recall

Measures the fraction of relevant chunks that are successfully retrieved.

```python
def calculate_recall(retrieved_chunks: List[Dict], relevant_chunks: List[Dict]) -> float:
    """
    Calculate retrieval recall.

    Recall = |Relevant ∩ Retrieved| / |Relevant|
    """
    retrieved_ids = {chunk.get('id') for chunk in retrieved_chunks}
    relevant_ids = {chunk.get('id') for chunk in relevant_chunks}
    intersection = retrieved_ids & relevant_ids

    if not relevant_ids:
        return 0.0
    return len(intersection) / len(relevant_ids)
```

#### F1-Score

Harmonic mean of precision and recall.

```python
def calculate_f1_score(precision: float, recall: float) -> float:
    """
    Calculate F1-score.

    F1 = 2 * (Precision * Recall) / (Precision + Recall)
    """
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)
```

### Mean Reciprocal Rank (MRR)

Measures the rank of the first relevant result.

```python
def calculate_mrr(queries: List[Dict], results: List[List[Dict]]) -> float:
    """
    Calculate Mean Reciprocal Rank.
    """
    if not queries:
        return 0.0

    reciprocal_ranks = []
    for query, query_results in zip(queries, results):
        relevant_found = False
        for rank, result in enumerate(query_results, 1):
            if result.get('is_relevant', False):
                reciprocal_ranks.append(1.0 / rank)
                relevant_found = True
                break
        if not relevant_found:
            reciprocal_ranks.append(0.0)

    return sum(reciprocal_ranks) / len(reciprocal_ranks)
```

### Mean Average Precision (MAP)

Considers both precision and the ranking of relevant documents.

```python
def calculate_average_precision(retrieved_chunks: List[Dict], relevant_chunks: List[Dict]) -> float:
    """
    Calculate Average Precision for a single query.
    """
    relevant_ids = {chunk.get('id') for chunk in relevant_chunks}

    if not relevant_ids:
        return 0.0

    precisions = []
    relevant_count = 0
    for rank, chunk in enumerate(retrieved_chunks, 1):
        if chunk.get('id') in relevant_ids:
            relevant_count += 1
            precisions.append(relevant_count / rank)

    return sum(precisions) / len(relevant_ids)


def calculate_map(queries: List[Dict], results: List[List[Dict]]) -> float:
    """
    Calculate Mean Average Precision across multiple queries.
    """
    average_precisions = []
    for query, query_results in zip(queries, results):
        ap = calculate_average_precision(query_results, query.get('relevant_chunks', []))
        average_precisions.append(ap)

    return sum(average_precisions) / len(average_precisions) if average_precisions else 0.0
```
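As a quick sanity check, here is a small usage sketch of the metrics above on a toy query; the chunk IDs and values are invented for illustration.

```python
# Three chunks retrieved; two of the three known-relevant chunks are among them.
retrieved = [{'id': 'c1'}, {'id': 'c7'}, {'id': 'c3'}]
relevant = [{'id': 'c1'}, {'id': 'c3'}, {'id': 'c9'}]

precision = calculate_precision(retrieved, relevant)  # 2/3
recall = calculate_recall(retrieved, relevant)        # 2/3
f1 = calculate_f1_score(precision, recall)            # 2/3

# MAP over a single query whose ground-truth chunks are listed on the query dict.
queries = [{'id': 'q1', 'relevant_chunks': relevant}]
results = [retrieved]
print(precision, recall, f1, calculate_map(queries, results))
```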
### Normalized Discounted Cumulative Gain (NDCG)

Measures ranking quality with emphasis on highly relevant results.

```python
import numpy as np


def calculate_dcg(retrieved_chunks: List[Dict]) -> float:
    """
    Calculate Discounted Cumulative Gain.
    """
    dcg = 0.0
    for rank, chunk in enumerate(retrieved_chunks, 1):
        relevance = chunk.get('relevance_score', 0)
        dcg += relevance / np.log2(rank + 1)
    return dcg


def calculate_ndcg(retrieved_chunks: List[Dict], ideal_chunks: List[Dict]) -> float:
    """
    Calculate Normalized Discounted Cumulative Gain.
    """
    dcg = calculate_dcg(retrieved_chunks)
    idcg = calculate_dcg(ideal_chunks)

    if idcg == 0:
        return 0.0
    return dcg / idcg
```

## End-to-End RAG Evaluation

### Answer Quality Metrics

#### Factual Consistency

Measures how well the generated answer aligns with the retrieved chunks.

```python
import spacy
from transformers import pipeline


class FactualConsistencyEvaluator:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.nli_pipeline = pipeline("text-classification", model="roberta-large-mnli")

    def evaluate_consistency(self, answer: str, retrieved_chunks: List[str]) -> float:
        """
        Evaluate factual consistency between the answer and the retrieved context.
        """
        if not retrieved_chunks:
            return 0.0

        # Combine the top 3 retrieved chunks into a single context
        context = " ".join(retrieved_chunks[:3])

        # Use Natural Language Inference to check consistency;
        # top_k=None returns scores for every label rather than only the top one
        result = self.nli_pipeline(f"premise: {context} hypothesis: {answer}", top_k=None)

        # Extract a consistency score from the entailment/contradiction probabilities
        for item in result:
            if item['label'] == 'ENTAILMENT':
                return item['score']
            elif item['label'] == 'CONTRADICTION':
                return 1.0 - item['score']

        return 0.5  # Neutral if NLI is inconclusive
```

#### Answer Completeness

Measures how completely the answer addresses the user's query.

```python
import re


def evaluate_completeness(answer: str, query: str, reference_answer: str = None) -> float:
    """
    Evaluate answer completeness.
    """
    # Extract key entities from the query and the answer
    query_entities = extract_entities(query)
    answer_entities = extract_entities(answer)

    # Calculate entity coverage
    if not query_entities:
        return 0.5  # Neutral if the query contains no entities

    covered_entities = query_entities & answer_entities
    entity_coverage = len(covered_entities) / len(query_entities)

    # If a reference answer is available, also compare against it
    if reference_answer:
        reference_entities = extract_entities(reference_answer)
        answer_reference_overlap = len(answer_entities & reference_entities) / max(len(reference_entities), 1)
        return (entity_coverage + answer_reference_overlap) / 2

    return entity_coverage


def extract_entities(text: str) -> set:
    """
    Extract named entities from text (simplified).

    A proper NER model should be used in practice; capitalized noun-phrase
    matching serves as a placeholder here.
    """
    noun_phrases = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
    return set(noun_phrases)
```
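The placeholder `extract_entities` above only matches capitalized phrases. As a sketch of what a model-backed variant could look like (assuming spaCy and the `en_core_web_sm` model are installed, as in the consistency evaluator), the hypothetical `extract_entities_spacy` below could be swapped in:

```python
import spacy

_nlp = spacy.load("en_core_web_sm")


def extract_entities_spacy(text: str) -> set:
    """Extract named entities with spaCy instead of the regex placeholder."""
    doc = _nlp(text)
    return {ent.text for ent in doc.ents}


# Example: entity overlap between a query and an answer.
query = "Who introduced BERT?"
answer = "BERT was introduced by Google in 2018."
print(extract_entities_spacy(query) & extract_entities_spacy(answer))
```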
#### Response Relevance

Measures how relevant the answer is to the original query.

```python
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


class RelevanceEvaluator:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def evaluate_relevance(self, query: str, answer: str) -> float:
        """
        Evaluate semantic relevance between the query and the answer.
        """
        # Generate embeddings
        query_embedding = self.model.encode([query])
        answer_embedding = self.model.encode([answer])

        # Calculate cosine similarity
        similarity = cosine_similarity(query_embedding, answer_embedding)[0][0]
        return float(similarity)
```

## Performance Metrics

### Processing Time

```python
import time
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class PerformanceMetrics:
    total_time: float
    chunking_time: float
    embedding_time: float
    search_time: float
    generation_time: float
    throughput: float  # documents per second


class PerformanceProfiler:
    def __init__(self):
        self.timings = {}
        self.start_times = {}

    def start_timer(self, operation: str):
        self.start_times[operation] = time.time()

    def end_timer(self, operation: str):
        if operation in self.start_times:
            duration = time.time() - self.start_times[operation]
            if operation not in self.timings:
                self.timings[operation] = []
            self.timings[operation].append(duration)
            return duration
        return 0.0

    def get_performance_metrics(self, document_count: int) -> PerformanceMetrics:
        total_time = sum(sum(times) for times in self.timings.values())

        return PerformanceMetrics(
            total_time=total_time,
            chunking_time=sum(self.timings.get('chunking', [0])),
            embedding_time=sum(self.timings.get('embedding', [0])),
            search_time=sum(self.timings.get('search', [0])),
            generation_time=sum(self.timings.get('generation', [0])),
            throughput=document_count / total_time if total_time > 0 else 0
        )
```

### Memory Usage

```python
import os
import time
from typing import Dict, List

import psutil


class MemoryProfiler:
    def __init__(self):
        self.process = psutil.Process(os.getpid())
        self.memory_snapshots = []

    def take_memory_snapshot(self, label: str):
        """Take a snapshot of current memory usage."""
        memory_info = self.process.memory_info()
        memory_mb = memory_info.rss / 1024 / 1024  # Convert to MB

        self.memory_snapshots.append({
            'label': label,
            'memory_mb': memory_mb,
            'timestamp': time.time()
        })

    def get_peak_memory_usage(self) -> float:
        """Get peak memory usage in MB."""
        if not self.memory_snapshots:
            return 0.0
        return max(snapshot['memory_mb'] for snapshot in self.memory_snapshots)

    def get_memory_usage_by_operation(self) -> Dict[str, float]:
        """Get memory usage breakdown by operation."""
        if not self.memory_snapshots:
            return {}

        memory_by_op = {}
        for i in range(1, len(self.memory_snapshots)):
            prev_snapshot = self.memory_snapshots[i - 1]
            curr_snapshot = self.memory_snapshots[i]

            operation = curr_snapshot['label']
            memory_delta = curr_snapshot['memory_mb'] - prev_snapshot['memory_mb']

            if operation not in memory_by_op:
                memory_by_op[operation] = []
            memory_by_op[operation].append(memory_delta)

        return {op: sum(deltas) for op, deltas in memory_by_op.items()}
```
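A minimal sketch of using the two profilers around a single chunking call; `my_strategy` and `document_text` are hypothetical stand-ins for whatever strategy object (exposing a `chunk()` method) and input document are under test:

```python
profiler = PerformanceProfiler()
memory_profiler = MemoryProfiler()

memory_profiler.take_memory_snapshot("start")
profiler.start_timer("chunking")

chunks = my_strategy.chunk(document_text)  # hypothetical strategy and input text

profiler.end_timer("chunking")
memory_profiler.take_memory_snapshot("chunking")

print(profiler.get_performance_metrics(document_count=1))
print(f"Peak memory: {memory_profiler.get_peak_memory_usage():.1f} MB")
```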
## Evaluation Datasets

### Standardized Test Sets

#### Question-Answer Pairs

```python
import json
from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class EvaluationQuery:
    id: str
    question: str
    reference_answer: Optional[str]
    relevant_chunk_ids: List[str]
    query_type: str   # factoid, analytical, comparative
    difficulty: str   # easy, medium, hard
    domain: str       # finance, medical, legal, technical


class EvaluationDataset:
    def __init__(self, name: str):
        self.name = name
        self.queries: List[EvaluationQuery] = []
        self.documents: Dict[str, str] = {}
        self.chunks: Dict[str, Dict] = {}

    def add_query(self, query: EvaluationQuery):
        self.queries.append(query)

    def add_document(self, doc_id: str, content: str):
        self.documents[doc_id] = content

    def add_chunk(self, chunk_id: str, content: str, doc_id: str, metadata: Dict):
        self.chunks[chunk_id] = {
            'id': chunk_id,
            'content': content,
            'doc_id': doc_id,
            'metadata': metadata
        }

    def save_to_file(self, filepath: str):
        data = {
            'name': self.name,
            'queries': [
                {
                    'id': q.id,
                    'question': q.question,
                    'reference_answer': q.reference_answer,
                    'relevant_chunk_ids': q.relevant_chunk_ids,
                    'query_type': q.query_type,
                    'difficulty': q.difficulty,
                    'domain': q.domain
                }
                for q in self.queries
            ],
            'documents': self.documents,
            'chunks': self.chunks
        }

        with open(filepath, 'w') as f:
            json.dump(data, f, indent=2)

    @classmethod
    def load_from_file(cls, filepath: str):
        with open(filepath, 'r') as f:
            data = json.load(f)

        dataset = cls(data['name'])
        dataset.documents = data['documents']
        dataset.chunks = data['chunks']

        for q_data in data['queries']:
            query = EvaluationQuery(
                id=q_data['id'],
                question=q_data['question'],
                reference_answer=q_data.get('reference_answer'),
                relevant_chunk_ids=q_data['relevant_chunk_ids'],
                query_type=q_data['query_type'],
                difficulty=q_data['difficulty'],
                domain=q_data['domain']
            )
            dataset.add_query(query)

        return dataset
```
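A short sketch of assembling and persisting a tiny dataset with the classes above; the IDs, content, and file path are purely illustrative:

```python
dataset = EvaluationDataset(name="demo")
dataset.add_document("doc_1", "Chunking splits documents into retrievable units.")
dataset.add_chunk("chunk_1", "Chunking splits documents into retrievable units.", "doc_1", {"position": 0})
dataset.add_query(EvaluationQuery(
    id="q_1",
    question="What does chunking do?",
    reference_answer="It splits documents into retrievable units.",
    relevant_chunk_ids=["chunk_1"],
    query_type="factoid",
    difficulty="easy",
    domain="technical"
))

dataset.save_to_file("demo_eval_dataset.json")
reloaded = EvaluationDataset.load_from_file("demo_eval_dataset.json")
print(len(reloaded.queries), "queries,", len(reloaded.chunks), "chunks")
```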
### Dataset Generation

#### Synthetic Query Generation

```python
import random
from typing import Dict, List


class SyntheticQueryGenerator:
    def __init__(self):
        self.query_templates = {
            'factoid': [
                "What is {concept}?",
                "When did {event} occur?",
                "Who developed {technology}?",
                "How many {items} are mentioned?",
                "What is the value of {metric}?"
            ],
            'analytical': [
                "Compare and contrast {concept1} and {concept2}.",
                "Analyze the impact of {concept} on {domain}.",
                "What are the advantages and disadvantages of {technology}?",
                "Explain the relationship between {concept1} and {concept2}.",
                "Evaluate the effectiveness of {approach} for {problem}."
            ],
            'comparative': [
                "Which is better: {option1} or {option2}?",
                "How does {method1} differ from {method2}?",
                "Compare the performance of {system1} and {system2}.",
                "What are the key differences between {approach1} and {approach2}?"
            ]
        }

    def generate_queries_from_chunks(self, chunks: List[Dict], num_queries: int = 100) -> List[EvaluationQuery]:
        """Generate synthetic queries from document chunks."""
        queries = []

        # Extract entities and concepts from chunks
        entities = self._extract_entities_from_chunks(chunks)

        for i in range(num_queries):
            query_type = random.choice(['factoid', 'analytical', 'comparative'])
            template = random.choice(self.query_templates[query_type])

            # Fill the template with extracted entities
            query_text = self._fill_template(template, entities)

            # Find relevant chunks for this query
            relevant_chunks = self._find_relevant_chunks(query_text, chunks)

            query = EvaluationQuery(
                id=f"synthetic_{i}",
                question=query_text,
                reference_answer=None,  # Would need a generation model
                relevant_chunk_ids=[chunk['id'] for chunk in relevant_chunks],
                query_type=query_type,
                difficulty=random.choice(['easy', 'medium', 'hard']),
                domain='synthetic'
            )
            queries.append(query)

        return queries

    def _extract_entities_from_chunks(self, chunks: List[Dict]) -> Dict[str, List[str]]:
        """Extract entities, concepts, and relationships from chunks."""
        # Proper NER should be used in practice; this is a simplified heuristic
        entities = {
            'concepts': [],
            'technologies': [],
            'methods': [],
            'metrics': [],
            'events': []
        }

        for chunk in chunks:
            content = chunk['content']
            # Simplified entity extraction
            words = content.split()
            entities['concepts'].extend([word for word in words if len(word) > 6])
            entities['technologies'].extend([word for word in words if 'technology' in word.lower()])
            entities['methods'].extend([word for word in words if 'method' in word.lower()])
            entities['metrics'].extend([word for word in words if '%' in word or '$' in word])

        # Remove duplicates and limit the number of candidates
        for key in entities:
            entities[key] = list(set(entities[key]))[:50]

        return entities

    def _fill_template(self, template: str, entities: Dict[str, List[str]]) -> str:
        """Fill a query template with random entities."""
        import re

        def replace_placeholder(match):
            placeholder = match.group(1)

            # Map placeholders to entity types
            entity_mapping = {
                'concept': 'concepts', 'concept1': 'concepts', 'concept2': 'concepts',
                'technology': 'technologies',
                'method': 'methods', 'method1': 'methods', 'method2': 'methods',
                'metric': 'metrics',
                'event': 'events',
                'items': 'concepts',
                'option1': 'concepts', 'option2': 'concepts',
                'approach': 'methods', 'approach1': 'methods', 'approach2': 'methods',
                'problem': 'concepts',
                'domain': 'concepts',
                'system1': 'concepts', 'system2': 'concepts'
            }

            entity_type = entity_mapping.get(placeholder, 'concepts')
            available_entities = entities.get(entity_type, ['something'])

            if available_entities:
                return random.choice(available_entities)
            return 'something'

        return re.sub(r'\{(\w+)\}', replace_placeholder, template)

    def _find_relevant_chunks(self, query: str, chunks: List[Dict], k: int = 3) -> List[Dict]:
        """Find the chunks most relevant to the query."""
        # Simple keyword matching is sufficient for synthetic generation
        query_words = set(query.lower().split())

        chunk_scores = []
        for chunk in chunks:
            chunk_words = set(chunk['content'].lower().split())
            overlap = len(query_words & chunk_words)
            chunk_scores.append((overlap, chunk))

        # Sort by overlap and return the top k
        chunk_scores.sort(key=lambda x: x[0], reverse=True)
        return [chunk for _, chunk in chunk_scores[:k]]
```
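A brief sketch of generating a handful of synthetic queries from already-chunked content; the chunk dicts mirror the `{'id', 'content'}` shape the generator expects:

```python
chunks = [
    {'id': 'chunk_1', 'content': 'Semantic chunking groups sentences by embedding similarity.'},
    {'id': 'chunk_2', 'content': 'Fixed-size chunking splits text every N tokens with overlap.'},
]

generator = SyntheticQueryGenerator()
synthetic_queries = generator.generate_queries_from_chunks(chunks, num_queries=5)

for q in synthetic_queries:
    print(q.id, q.query_type, q.question, q.relevant_chunk_ids)
```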
## A/B Testing Framework

### Statistical Significance Testing

```python
import numpy as np
from scipy import stats
from typing import Dict, List, Tuple


class ABTestAnalyzer:
    def __init__(self):
        self.significance_level = 0.05

    def compare_metrics(self, control_metrics: List[float],
                        treatment_metrics: List[float],
                        metric_name: str) -> Dict:
        """
        Compare metrics between control and treatment groups.
        """
        control_mean = np.mean(control_metrics)
        treatment_mean = np.mean(treatment_metrics)
        control_std = np.std(control_metrics)
        treatment_std = np.std(treatment_metrics)

        # Perform t-test
        t_statistic, p_value = stats.ttest_ind(control_metrics, treatment_metrics)

        # Calculate effect size (Cohen's d)
        pooled_std = np.sqrt(((len(control_metrics) - 1) * control_std**2 +
                              (len(treatment_metrics) - 1) * treatment_std**2) /
                             (len(control_metrics) + len(treatment_metrics) - 2))
        cohens_d = (treatment_mean - control_mean) / pooled_std if pooled_std > 0 else 0

        # Determine significance
        is_significant = p_value < self.significance_level

        return {
            'metric_name': metric_name,
            'control_mean': control_mean,
            'treatment_mean': treatment_mean,
            'absolute_difference': treatment_mean - control_mean,
            'relative_difference': ((treatment_mean - control_mean) / control_mean * 100) if control_mean != 0 else 0,
            'control_std': control_std,
            'treatment_std': treatment_std,
            't_statistic': t_statistic,
            'p_value': p_value,
            'is_significant': is_significant,
            'effect_size': cohens_d,
            'significance_level': self.significance_level
        }

    def analyze_ab_test_results(self, control_results: Dict[str, List[float]],
                                treatment_results: Dict[str, List[float]]) -> Dict:
        """
        Analyze A/B test results across multiple metrics.
        """
        analysis_results = {}

        # Only compare metrics that are present in both groups
        all_metrics = set(control_results.keys()) & set(treatment_results.keys())

        for metric in all_metrics:
            analysis_results[metric] = self.compare_metrics(
                control_results[metric],
                treatment_results[metric],
                metric
            )

        # Calculate overall summary
        significant_improvements = sum(1 for result in analysis_results.values()
                                       if result['is_significant'] and result['relative_difference'] > 0)
        significant_degradations = sum(1 for result in analysis_results.values()
                                       if result['is_significant'] and result['relative_difference'] < 0)

        analysis_results['summary'] = {
            'total_metrics_compared': len(analysis_results),
            'significant_improvements': significant_improvements,
            'significant_degradations': significant_degradations,
            'no_significant_change': len(analysis_results) - significant_improvements - significant_degradations
        }

        return analysis_results
```
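A small usage sketch comparing per-query F1 scores from two strategies; the numbers are invented for illustration:

```python
analyzer = ABTestAnalyzer()

control_f1 = [0.62, 0.58, 0.71, 0.66, 0.60, 0.64]
treatment_f1 = [0.70, 0.65, 0.74, 0.69, 0.68, 0.72]

result = analyzer.compare_metrics(control_f1, treatment_f1, metric_name="f1_score")
print(f"p-value: {result['p_value']:.4f}, "
      f"relative change: {result['relative_difference']:.1f}%, "
      f"significant: {result['is_significant']}")
```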
## Automated Evaluation Pipeline

### End-to-End Evaluation

```python
import random
from typing import Any, Dict, List

import numpy as np

# Relies on the metric functions, profilers, and dataset classes defined above.


class ChunkingEvaluationPipeline:
    def __init__(self, strategies: Dict[str, Any], dataset: EvaluationDataset):
        self.strategies = strategies
        self.dataset = dataset
        self.results = {}
        self.profiler = PerformanceProfiler()
        self.memory_profiler = MemoryProfiler()

    def run_evaluation(self) -> Dict:
        """Run a comprehensive evaluation of all strategies."""
        evaluation_results = {}

        for strategy_name, strategy in self.strategies.items():
            print(f"Evaluating strategy: {strategy_name}")

            # Reset profilers for each strategy
            self.profiler = PerformanceProfiler()
            self.memory_profiler = MemoryProfiler()

            # Evaluate strategy
            strategy_results = self._evaluate_strategy(strategy, strategy_name)
            evaluation_results[strategy_name] = strategy_results

        # Compare strategies
        comparison_results = self._compare_strategies(evaluation_results)

        return {
            'individual_results': evaluation_results,
            'comparison': comparison_results,
            'recommendations': self._generate_recommendations(comparison_results)
        }

    def _evaluate_strategy(self, strategy: Any, strategy_name: str) -> Dict:
        """Evaluate a single chunking strategy."""
        results = {
            'strategy_name': strategy_name,
            'retrieval_metrics': {},
            'quality_metrics': {},
            'performance_metrics': {}
        }

        # Track memory usage
        self.memory_profiler.take_memory_snapshot(f"{strategy_name}_start")

        # Process all documents
        self.profiler.start_timer('total_processing')

        all_chunks = {}
        for doc_id, content in self.dataset.documents.items():
            self.profiler.start_timer('chunking')
            chunks = strategy.chunk(content)
            self.profiler.end_timer('chunking')
            all_chunks[doc_id] = chunks

        self.memory_profiler.take_memory_snapshot(f"{strategy_name}_after_chunking")

        # Generate embeddings for chunks
        self.profiler.start_timer('embedding')
        chunk_embeddings = self._generate_embeddings(all_chunks)
        self.profiler.end_timer('embedding')

        self.memory_profiler.take_memory_snapshot(f"{strategy_name}_after_embedding")

        # Evaluate retrieval performance
        retrieval_results = self._evaluate_retrieval(all_chunks, chunk_embeddings)
        results['retrieval_metrics'] = retrieval_results

        # Evaluate chunk quality
        quality_results = self._evaluate_chunk_quality(all_chunks)
        results['quality_metrics'] = quality_results

        # Get performance metrics
        self.profiler.end_timer('total_processing')
        performance_metrics = self.profiler.get_performance_metrics(len(self.dataset.documents))
        results['performance_metrics'] = performance_metrics.__dict__

        # Get memory metrics
        self.memory_profiler.take_memory_snapshot(f"{strategy_name}_end")
        results['memory_metrics'] = {
            'peak_memory_mb': self.memory_profiler.get_peak_memory_usage(),
            'memory_by_operation': self.memory_profiler.get_memory_usage_by_operation()
        }

        return results

    def _evaluate_retrieval(self, all_chunks: Dict, chunk_embeddings: Dict) -> Dict:
        """Evaluate retrieval performance."""
        retrieval_metrics = {
            'precision': [],
            'recall': [],
            'f1_score': [],
            'mrr': [],
            'map': []
        }

        for query in self.dataset.queries:
            # Perform retrieval
            self.profiler.start_timer('search')
            retrieved_chunks = self._retrieve_chunks(query.question, chunk_embeddings, k=10)
            self.profiler.end_timer('search')

            # Ground-truth relevant chunks for this query
            relevant_chunk_ids = set(query.relevant_chunk_ids)
            relevant_chunks = [{'id': chunk_id} for chunk_id in relevant_chunk_ids]

            # Calculate set-based metrics
            precision = calculate_precision(retrieved_chunks, relevant_chunks)
            recall = calculate_recall(retrieved_chunks, relevant_chunks)
            f1 = calculate_f1_score(precision, recall)

            retrieval_metrics['precision'].append(precision)
            retrieval_metrics['recall'].append(recall)
            retrieval_metrics['f1_score'].append(f1)

            # Rank-based metrics: reciprocal rank of the first relevant hit and average precision
            reciprocal_rank = 0.0
            for rank, chunk in enumerate(retrieved_chunks, 1):
                if chunk.get('id') in relevant_chunk_ids:
                    reciprocal_rank = 1.0 / rank
                    break
            retrieval_metrics['mrr'].append(reciprocal_rank)
            retrieval_metrics['map'].append(calculate_average_precision(retrieved_chunks, relevant_chunks))

        # Calculate averages (skipping any metric with no recorded values)
        return {metric: float(np.mean(values)) for metric, values in retrieval_metrics.items() if values}

    def _evaluate_chunk_quality(self, all_chunks: Dict) -> Dict:
        """Evaluate the quality of generated chunks."""
        # ChunkQualityAssessor and DocumentAnalyzer are assumed to be defined
        # elsewhere in this documentation set.
        quality_assessor = ChunkQualityAssessor()

        quality_scores = []
        for doc_id, chunks in all_chunks.items():
            # Analyze the source document
            content = self.dataset.documents[doc_id]
            analyzer = DocumentAnalyzer()
            analysis = analyzer.analyze(content)

            # Assess chunk quality
            scores = quality_assessor.assess_chunks(chunks, analysis)
            quality_scores.append(scores)

        # Aggregate quality scores
        if quality_scores:
            avg_scores = {}
            for metric in quality_scores[0].keys():
                avg_scores[metric] = np.mean([scores[metric] for scores in quality_scores])
            return avg_scores

        return {}

    def _compare_strategies(self, evaluation_results: Dict) -> Dict:
        """Compare performance across strategies."""
        ab_analyzer = ABTestAnalyzer()
        comparison = {}

        # Compare each metric across every pair of strategies
        strategy_names = list(evaluation_results.keys())
        for i in range(len(strategy_names)):
            for j in range(i + 1, len(strategy_names)):
                strategy1 = strategy_names[i]
                strategy2 = strategy_names[j]

                comparison_key = f"{strategy1}_vs_{strategy2}"
                comparison[comparison_key] = {}

                # Compare retrieval metrics.
                # Note: each strategy contributes a single averaged value here; per-query
                # metric lists would give the t-test more statistical power.
                for metric in ['precision', 'recall', 'f1_score']:
                    if (metric in evaluation_results[strategy1]['retrieval_metrics'] and
                            metric in evaluation_results[strategy2]['retrieval_metrics']):
                        comparison[comparison_key][f"retrieval_{metric}"] = ab_analyzer.compare_metrics(
                            [evaluation_results[strategy1]['retrieval_metrics'][metric]],
                            [evaluation_results[strategy2]['retrieval_metrics'][metric]],
                            f"retrieval_{metric}"
                        )

        return comparison

    def _generate_recommendations(self, comparison_results: Dict) -> Dict:
        """Generate recommendations based on evaluation results."""
        recommendations = {
            'best_overall': None,
            'best_for_precision': None,
            'best_for_recall': None,
            'best_for_performance': None,
            'trade_offs': []
        }

        # This would analyze the comparison results and generate specific recommendations;
        # the implementation depends on the requirements of the specific use case.
        return recommendations

    def _generate_embeddings(self, all_chunks: Dict) -> Dict:
        """Generate embeddings for all chunks."""
        # This would use the actual embedding model; placeholder implementation
        embeddings = {}
        for doc_id, chunks in all_chunks.items():
            embeddings[doc_id] = []
            for chunk in chunks:
                # Generate an embedding for the chunk content
                embedding = np.random.rand(384)  # Placeholder
                embeddings[doc_id].append({
                    'chunk': chunk,
                    'embedding': embedding
                })
        return embeddings

    def _retrieve_chunks(self, query: str, chunk_embeddings: Dict, k: int = 10) -> List[Dict]:
        """Retrieve the most relevant chunks for a query."""
        # This would use actual similarity search; placeholder implementation
        all_chunks = []
        for doc_embeddings in chunk_embeddings.values():
            for chunk_data in doc_embeddings:
                all_chunks.append(chunk_data['chunk'])

        # Simple random selection as a placeholder
        selected = random.sample(all_chunks, min(k, len(all_chunks)))
        return selected
```

This evaluation framework provides the tools to assess chunking strategies across multiple dimensions: retrieval effectiveness, answer quality, system performance, and statistical significance. Its modular design allows for extension and customization to fit specific requirements and use cases.
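As a closing illustration, here is a minimal sketch of wiring the pipeline together. `FixedSizeChunker` and `SemanticChunker` are hypothetical strategy classes used only as placeholders; any object exposing the `chunk(content)` method the pipeline expects would work:

```python
dataset = EvaluationDataset.load_from_file("demo_eval_dataset.json")

strategies = {
    "fixed_size": FixedSizeChunker(chunk_size=512, overlap=50),   # hypothetical
    "semantic": SemanticChunker(similarity_threshold=0.75),       # hypothetical
}

pipeline = ChunkingEvaluationPipeline(strategies=strategies, dataset=dataset)
report = pipeline.run_evaluation()

for name, result in report['individual_results'].items():
    print(name, result['retrieval_metrics'], result['performance_metrics']['throughput'])
```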