Performance Evaluation Framework
This document describes methodologies and reference implementations for evaluating the performance and effectiveness of chunking strategies.
Evaluation Metrics
Core Retrieval Metrics
Retrieval Precision
Measures the fraction of retrieved chunks that are relevant to the query.
```python
from typing import Dict, List

def calculate_precision(retrieved_chunks: List[Dict], relevant_chunks: List[Dict]) -> float:
    """
    Calculate retrieval precision.

    Precision = |Relevant ∩ Retrieved| / |Retrieved|
    """
    retrieved_ids = {chunk.get('id') for chunk in retrieved_chunks}
    relevant_ids = {chunk.get('id') for chunk in relevant_chunks}
    if not retrieved_ids:
        return 0.0
    intersection = retrieved_ids & relevant_ids
    return len(intersection) / len(retrieved_ids)
```
Retrieval Recall
Measures the fraction of relevant chunks that are successfully retrieved.
```python
def calculate_recall(retrieved_chunks: List[Dict], relevant_chunks: List[Dict]) -> float:
    """
    Calculate retrieval recall.

    Recall = |Relevant ∩ Retrieved| / |Relevant|
    """
    retrieved_ids = {chunk.get('id') for chunk in retrieved_chunks}
    relevant_ids = {chunk.get('id') for chunk in relevant_chunks}
    if not relevant_ids:
        return 0.0
    intersection = retrieved_ids & relevant_ids
    return len(intersection) / len(relevant_ids)
```
F1-Score
Harmonic mean of precision and recall.
```python
def calculate_f1_score(precision: float, recall: float) -> float:
    """
    Calculate F1-score.

    F1 = 2 * (Precision * Recall) / (Precision + Recall)
    """
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)
```
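A minimal usage sketch tying the three set-based metrics together; the chunk IDs below are invented for illustration.

```python
# Hypothetical results: 10 chunks retrieved, 4 chunks relevant in total.
retrieved = [{'id': f'chunk_{i}'} for i in range(10)]
relevant = [{'id': 'chunk_1'}, {'id': 'chunk_3'}, {'id': 'chunk_7'}, {'id': 'chunk_42'}]

precision = calculate_precision(retrieved, relevant)  # 3/10 = 0.30
recall = calculate_recall(retrieved, relevant)        # 3/4  = 0.75
f1 = calculate_f1_score(precision, recall)            # ≈ 0.43
print(f"P={precision:.2f} R={recall:.2f} F1={f1:.2f}")
```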
Mean Reciprocal Rank (MRR)
Measures the rank of the first relevant result.
```python
def calculate_mrr(queries: List[Dict], results: List[List[Dict]]) -> float:
    """
    Calculate Mean Reciprocal Rank
    """
    reciprocal_ranks = []
    for query, query_results in zip(queries, results):
        relevant_found = False
        for rank, result in enumerate(query_results, 1):
            if result.get('is_relevant', False):
                reciprocal_ranks.append(1.0 / rank)
                relevant_found = True
                break
        if not relevant_found:
            reciprocal_ranks.append(0.0)

    if not reciprocal_ranks:
        return 0.0
    return sum(reciprocal_ranks) / len(reciprocal_ranks)
```
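A short sketch showing the expected input shape: each query's result list carries an `is_relevant` flag, and only the rank of the first relevant hit counts. The IDs and flags are invented.

```python
queries = [{'id': 'q1'}, {'id': 'q2'}]
results = [
    [{'id': 'c3', 'is_relevant': False}, {'id': 'c9', 'is_relevant': True}],  # first hit at rank 2
    [{'id': 'c1', 'is_relevant': True}],                                      # first hit at rank 1
]
print(calculate_mrr(queries, results))  # (1/2 + 1/1) / 2 = 0.75
```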
Mean Average Precision (MAP)
Considers both precision and the ranking of relevant documents.
```python
def calculate_average_precision(retrieved_chunks: List[Dict], relevant_chunks: List[Dict]) -> float:
    """
    Calculate Average Precision for a single query
    """
    relevant_ids = {chunk.get('id') for chunk in relevant_chunks}
    if not relevant_ids:
        return 0.0

    precisions = []
    relevant_count = 0
    for rank, chunk in enumerate(retrieved_chunks, 1):
        if chunk.get('id') in relevant_ids:
            relevant_count += 1
            precisions.append(relevant_count / rank)

    return sum(precisions) / len(relevant_ids)


def calculate_map(queries: List[Dict], results: List[List[Dict]]) -> float:
    """
    Calculate Mean Average Precision across multiple queries
    """
    average_precisions = []
    for query, query_results in zip(queries, results):
        ap = calculate_average_precision(query_results, query.get('relevant_chunks', []))
        average_precisions.append(ap)
    return sum(average_precisions) / len(average_precisions) if average_precisions else 0.0
```
Normalized Discounted Cumulative Gain (NDCG)
Measures ranking quality with emphasis on highly relevant results.
```python
import numpy as np

def calculate_dcg(retrieved_chunks: List[Dict]) -> float:
    """
    Calculate Discounted Cumulative Gain
    """
    dcg = 0.0
    for rank, chunk in enumerate(retrieved_chunks, 1):
        relevance = chunk.get('relevance_score', 0)
        dcg += relevance / np.log2(rank + 1)
    return dcg


def calculate_ndcg(retrieved_chunks: List[Dict], ideal_chunks: List[Dict]) -> float:
    """
    Calculate Normalized Discounted Cumulative Gain
    """
    dcg = calculate_dcg(retrieved_chunks)
    idcg = calculate_dcg(ideal_chunks)
    if idcg == 0:
        return 0.0
    return dcg / idcg
```
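A small sketch of how NDCG is typically applied here: the ideal ranking is the same set of chunks reordered by descending `relevance_score`. The graded scores below are invented.

```python
retrieved = [
    {'id': 'c2', 'relevance_score': 1},
    {'id': 'c7', 'relevance_score': 3},
    {'id': 'c5', 'relevance_score': 0},
    {'id': 'c1', 'relevance_score': 2},
]
# Ideal ordering: highest-relevance chunks first.
ideal = sorted(retrieved, key=lambda c: c['relevance_score'], reverse=True)

print(f"NDCG@4 = {calculate_ndcg(retrieved, ideal):.3f}")
```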
End-to-End RAG Evaluation
Answer Quality Metrics
Factual Consistency
Measures how well the generated answer aligns with retrieved chunks.
```python
import spacy
from transformers import pipeline


class FactualConsistencyEvaluator:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.nli_pipeline = pipeline("text-classification",
                                     model="roberta-large-mnli")

    def evaluate_consistency(self, answer: str, retrieved_chunks: List[str]) -> float:
        """
        Evaluate factual consistency between answer and retrieved context
        """
        if not retrieved_chunks:
            return 0.0

        # Combine retrieved chunks as the context (premise); use top 3 chunks
        context = " ".join(retrieved_chunks[:3])

        # Use Natural Language Inference: the context is the premise and the
        # answer is the hypothesis. Request scores for all labels.
        results = self.nli_pipeline({"text": context, "text_pair": answer},
                                    top_k=None)

        # Extract consistency score (entailment probability)
        for item in results:
            if item['label'] == 'ENTAILMENT':
                return item['score']

        return 0.5  # Neutral if no entailment score is available
```
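A usage sketch, assuming the `en_core_web_sm` spaCy model and the `roberta-large-mnli` checkpoint can be loaded (both are downloaded on first use). The answer and chunks are invented.

```python
evaluator = FactualConsistencyEvaluator()

chunks = [
    "The 2023 annual report states that revenue grew 12% year over year.",
    "Operating expenses remained flat compared to the prior year.",
]
answer = "Revenue increased by 12% in 2023."

score = evaluator.evaluate_consistency(answer, chunks)
print(f"Entailment-based consistency: {score:.2f}")
```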
Answer Completeness
Measures how completely the answer addresses the user's query.
```python
import re


def evaluate_completeness(answer: str, query: str, reference_answer: str = None) -> float:
    """
    Evaluate answer completeness
    """
    # Extract key entities from the query and the answer
    query_entities = extract_entities(query)
    answer_entities = extract_entities(answer)

    # Calculate entity coverage
    if not query_entities:
        return 0.5  # Neutral if no entities in query

    covered_entities = query_entities & answer_entities
    entity_coverage = len(covered_entities) / len(query_entities)

    # If a reference answer is available, also compare against it
    if reference_answer:
        reference_entities = extract_entities(reference_answer)
        answer_reference_overlap = len(answer_entities & reference_entities) / max(len(reference_entities), 1)
        return (entity_coverage + answer_reference_overlap) / 2

    return entity_coverage


def extract_entities(text: str) -> set:
    """
    Extract named entities from text (simplified)
    """
    # This would use a proper NER model in practice;
    # simple capitalized-phrase extraction serves as a placeholder.
    noun_phrases = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
    return set(noun_phrases)
```
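A quick sketch with invented strings; because the placeholder extractor only picks up capitalized phrases, the coverage score is a rough proxy.

```python
query = "What role does Apache Spark play in the Data Pipeline described in the report?"
answer = "Apache Spark handles distributed transformation inside the Data Pipeline."

# ≈ 0.67: two of the three capitalized phrases in the query appear in the answer.
print(f"Completeness: {evaluate_completeness(answer, query):.2f}")
```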
Response Relevance
Measures how relevant the answer is to the original query.
```python
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


class RelevanceEvaluator:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def evaluate_relevance(self, query: str, answer: str) -> float:
        """
        Evaluate semantic relevance between query and answer
        """
        # Generate embeddings
        query_embedding = self.model.encode([query])
        answer_embedding = self.model.encode([answer])

        # Calculate cosine similarity
        similarity = cosine_similarity(query_embedding, answer_embedding)[0][0]
        return float(similarity)
```
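A usage sketch, assuming the `all-MiniLM-L6-v2` SentenceTransformer model can be downloaded; the query and answer are invented.

```python
relevance = RelevanceEvaluator()

query = "What is the refund policy for annual subscriptions?"
answer = "Annual subscriptions can be refunded within 30 days of purchase."

print(f"Query-answer relevance: {relevance.evaluate_relevance(query, answer):.2f}")
```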
Performance Metrics
Processing Time
```python
import time
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class PerformanceMetrics:
    total_time: float
    chunking_time: float
    embedding_time: float
    search_time: float
    generation_time: float
    throughput: float  # documents per second


class PerformanceProfiler:
    def __init__(self):
        self.timings = {}
        self.start_times = {}

    def start_timer(self, operation: str):
        self.start_times[operation] = time.time()

    def end_timer(self, operation: str):
        if operation in self.start_times:
            duration = time.time() - self.start_times[operation]
            if operation not in self.timings:
                self.timings[operation] = []
            self.timings[operation].append(duration)
            return duration
        return 0.0

    def get_performance_metrics(self, document_count: int) -> PerformanceMetrics:
        total_time = sum(sum(times) for times in self.timings.values())
        return PerformanceMetrics(
            total_time=total_time,
            chunking_time=sum(self.timings.get('chunking', [0])),
            embedding_time=sum(self.timings.get('embedding', [0])),
            search_time=sum(self.timings.get('search', [0])),
            generation_time=sum(self.timings.get('generation', [0])),
            throughput=document_count / total_time if total_time > 0 else 0
        )
```
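A minimal sketch of the intended timer usage; the `time.sleep` calls stand in for real chunking and embedding work.

```python
profiler = PerformanceProfiler()

profiler.start_timer('chunking')
time.sleep(0.1)  # stand-in for a chunking pass
profiler.end_timer('chunking')

profiler.start_timer('embedding')
time.sleep(0.2)  # stand-in for embedding generation
profiler.end_timer('embedding')

metrics = profiler.get_performance_metrics(document_count=5)
print(f"Total: {metrics.total_time:.2f}s, throughput: {metrics.throughput:.1f} docs/s")
```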
Memory Usage
```python
import os
import time
from typing import Dict, List

import psutil


class MemoryProfiler:
    def __init__(self):
        self.process = psutil.Process(os.getpid())
        self.memory_snapshots = []

    def take_memory_snapshot(self, label: str):
        """Take a snapshot of current memory usage"""
        memory_info = self.process.memory_info()
        memory_mb = memory_info.rss / 1024 / 1024  # Convert to MB
        self.memory_snapshots.append({
            'label': label,
            'memory_mb': memory_mb,
            'timestamp': time.time()
        })

    def get_peak_memory_usage(self) -> float:
        """Get peak memory usage in MB"""
        if not self.memory_snapshots:
            return 0.0
        return max(snapshot['memory_mb'] for snapshot in self.memory_snapshots)

    def get_memory_usage_by_operation(self) -> Dict[str, float]:
        """Get memory usage breakdown by operation"""
        if not self.memory_snapshots:
            return {}

        memory_by_op = {}
        for i in range(1, len(self.memory_snapshots)):
            prev_snapshot = self.memory_snapshots[i - 1]
            curr_snapshot = self.memory_snapshots[i]
            operation = curr_snapshot['label']
            memory_delta = curr_snapshot['memory_mb'] - prev_snapshot['memory_mb']

            if operation not in memory_by_op:
                memory_by_op[operation] = []
            memory_by_op[operation].append(memory_delta)

        return {op: sum(deltas) for op, deltas in memory_by_op.items()}
```
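A usage sketch: take a snapshot before and after each labeled step, then read back peak usage and per-operation deltas. The list allocation stands in for a memory-heavy step.

```python
memory = MemoryProfiler()

memory.take_memory_snapshot('start')
buffers = [list(range(10_000)) for _ in range(100)]  # stand-in for loading documents
memory.take_memory_snapshot('loading')

print(f"Peak memory: {memory.get_peak_memory_usage():.1f} MB")
print(memory.get_memory_usage_by_operation())  # delta attributed to the 'loading' snapshot
```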
Evaluation Datasets
Standardized Test Sets
Question-Answer Pairs
```python
from dataclasses import dataclass
from typing import Dict, List, Optional
import json


@dataclass
class EvaluationQuery:
    id: str
    question: str
    reference_answer: Optional[str]
    relevant_chunk_ids: List[str]
    query_type: str  # factoid, analytical, comparative
    difficulty: str  # easy, medium, hard
    domain: str      # finance, medical, legal, technical


class EvaluationDataset:
    def __init__(self, name: str):
        self.name = name
        self.queries: List[EvaluationQuery] = []
        self.documents: Dict[str, str] = {}
        self.chunks: Dict[str, Dict] = {}

    def add_query(self, query: EvaluationQuery):
        self.queries.append(query)

    def add_document(self, doc_id: str, content: str):
        self.documents[doc_id] = content

    def add_chunk(self, chunk_id: str, content: str, doc_id: str, metadata: Dict):
        self.chunks[chunk_id] = {
            'id': chunk_id,
            'content': content,
            'doc_id': doc_id,
            'metadata': metadata
        }

    def save_to_file(self, filepath: str):
        data = {
            'name': self.name,
            'queries': [
                {
                    'id': q.id,
                    'question': q.question,
                    'reference_answer': q.reference_answer,
                    'relevant_chunk_ids': q.relevant_chunk_ids,
                    'query_type': q.query_type,
                    'difficulty': q.difficulty,
                    'domain': q.domain
                }
                for q in self.queries
            ],
            'documents': self.documents,
            'chunks': self.chunks
        }
        with open(filepath, 'w') as f:
            json.dump(data, f, indent=2)

    @classmethod
    def load_from_file(cls, filepath: str):
        with open(filepath, 'r') as f:
            data = json.load(f)

        dataset = cls(data['name'])
        dataset.documents = data['documents']
        dataset.chunks = data['chunks']

        for q_data in data['queries']:
            query = EvaluationQuery(
                id=q_data['id'],
                question=q_data['question'],
                reference_answer=q_data.get('reference_answer'),
                relevant_chunk_ids=q_data['relevant_chunk_ids'],
                query_type=q_data['query_type'],
                difficulty=q_data['difficulty'],
                domain=q_data['domain']
            )
            dataset.add_query(query)

        return dataset
```
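A small end-to-end sketch of building, saving, and reloading a dataset; the document, chunk, and query contents are invented.

```python
dataset = EvaluationDataset("demo_dataset")
dataset.add_document("doc_1", "Quarterly revenue grew 12%. Operating costs were flat.")
dataset.add_chunk("doc_1_chunk_0", "Quarterly revenue grew 12%.", "doc_1", {"position": 0})
dataset.add_query(EvaluationQuery(
    id="q1",
    question="How much did revenue grow?",
    reference_answer="Revenue grew 12%.",
    relevant_chunk_ids=["doc_1_chunk_0"],
    query_type="factoid",
    difficulty="easy",
    domain="finance",
))

dataset.save_to_file("demo_dataset.json")
reloaded = EvaluationDataset.load_from_file("demo_dataset.json")
print(len(reloaded.queries), "queries,", len(reloaded.chunks), "chunks")
```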
Dataset Generation
Synthetic Query Generation
```python
import random
import re
from typing import Dict, List


class SyntheticQueryGenerator:
    def __init__(self):
        self.query_templates = {
            'factoid': [
                "What is {concept}?",
                "When did {event} occur?",
                "Who developed {technology}?",
                "How many {items} are mentioned?",
                "What is the value of {metric}?"
            ],
            'analytical': [
                "Compare and contrast {concept1} and {concept2}.",
                "Analyze the impact of {concept} on {domain}.",
                "What are the advantages and disadvantages of {technology}?",
                "Explain the relationship between {concept1} and {concept2}.",
                "Evaluate the effectiveness of {approach} for {problem}."
            ],
            'comparative': [
                "Which is better: {option1} or {option2}?",
                "How does {method1} differ from {method2}?",
                "Compare the performance of {system1} and {system2}.",
                "What are the key differences between {approach1} and {approach2}?"
            ]
        }

    def generate_queries_from_chunks(self, chunks: List[Dict], num_queries: int = 100) -> List[EvaluationQuery]:
        """Generate synthetic queries from document chunks"""
        queries = []

        # Extract entities and concepts from chunks
        entities = self._extract_entities_from_chunks(chunks)

        for i in range(num_queries):
            query_type = random.choice(['factoid', 'analytical', 'comparative'])
            template = random.choice(self.query_templates[query_type])

            # Fill template with extracted entities
            query_text = self._fill_template(template, entities)

            # Find relevant chunks for this query
            relevant_chunks = self._find_relevant_chunks(query_text, chunks)

            query = EvaluationQuery(
                id=f"synthetic_{i}",
                question=query_text,
                reference_answer=None,  # Would need a generation model
                relevant_chunk_ids=[chunk['id'] for chunk in relevant_chunks],
                query_type=query_type,
                difficulty=random.choice(['easy', 'medium', 'hard']),
                domain='synthetic'
            )
            queries.append(query)

        return queries

    def _extract_entities_from_chunks(self, chunks: List[Dict]) -> Dict[str, List[str]]:
        """Extract entities, concepts, and relationships from chunks"""
        # This would use proper NER in practice
        entities = {
            'concepts': [],
            'technologies': [],
            'methods': [],
            'metrics': [],
            'events': []
        }

        for chunk in chunks:
            content = chunk['content']
            # Simplified entity extraction
            words = content.split()
            entities['concepts'].extend([word for word in words if len(word) > 6])
            entities['technologies'].extend([word for word in words if 'technology' in word.lower()])
            entities['methods'].extend([word for word in words if 'method' in word.lower()])
            entities['metrics'].extend([word for word in words if '%' in word or '$' in word])

        # Remove duplicates and limit
        for key in entities:
            entities[key] = list(set(entities[key]))[:50]

        return entities

    def _fill_template(self, template: str, entities: Dict[str, List[str]]) -> str:
        """Fill query template with random entities"""
        def replace_placeholder(match):
            placeholder = match.group(1)
            # Map placeholders to entity types
            entity_mapping = {
                'concept': 'concepts',
                'concept1': 'concepts',
                'concept2': 'concepts',
                'technology': 'technologies',
                'method': 'methods',
                'method1': 'methods',
                'method2': 'methods',
                'metric': 'metrics',
                'event': 'events',
                'items': 'concepts',
                'option1': 'concepts',
                'option2': 'concepts',
                'approach': 'methods',
                'problem': 'concepts',
                'domain': 'concepts',
                'system1': 'concepts',
                'system2': 'concepts'
            }

            entity_type = entity_mapping.get(placeholder, 'concepts')
            available_entities = entities.get(entity_type, ['something'])

            if available_entities:
                return random.choice(available_entities)
            return 'something'

        return re.sub(r'\{(\w+)\}', replace_placeholder, template)

    def _find_relevant_chunks(self, query: str, chunks: List[Dict], k: int = 3) -> List[Dict]:
        """Find chunks most relevant to the query"""
        # Simple keyword matching for synthetic generation
        query_words = set(query.lower().split())
        chunk_scores = []

        for chunk in chunks:
            chunk_words = set(chunk['content'].lower().split())
            overlap = len(query_words & chunk_words)
            chunk_scores.append((overlap, chunk))

        # Sort by overlap and return top k
        chunk_scores.sort(key=lambda x: x[0], reverse=True)
        return [chunk for _, chunk in chunk_scores[:k]]
```
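A usage sketch, assuming the `EvaluationQuery` class defined earlier is in scope; the chunks are invented, and the generated questions will read as rough templates rather than polished natural language.

```python
chunks = [
    {'id': 'c0', 'content': 'Transformer models revolutionized natural language processing.'},
    {'id': 'c1', 'content': 'Retrieval methods include keyword search and dense embeddings.'},
    {'id': 'c2', 'content': 'The evaluation reported 87% accuracy on the benchmark.'},
]

generator = SyntheticQueryGenerator()
for q in generator.generate_queries_from_chunks(chunks, num_queries=5):
    print(q.query_type, '->', q.question, '| relevant:', q.relevant_chunk_ids)
```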
A/B Testing Framework
Statistical Significance Testing
```python
import numpy as np
from scipy import stats
from typing import Dict, List, Tuple


class ABTestAnalyzer:
    def __init__(self):
        self.significance_level = 0.05

    def compare_metrics(self, control_metrics: List[float],
                        treatment_metrics: List[float],
                        metric_name: str) -> Dict:
        """
        Compare metrics between control and treatment groups
        """
        control_mean = np.mean(control_metrics)
        treatment_mean = np.mean(treatment_metrics)
        # Sample standard deviations (ddof=1) to match the pooled-variance formula below
        control_std = np.std(control_metrics, ddof=1)
        treatment_std = np.std(treatment_metrics, ddof=1)

        # Perform t-test
        t_statistic, p_value = stats.ttest_ind(control_metrics, treatment_metrics)

        # Calculate effect size (Cohen's d)
        pooled_std = np.sqrt(((len(control_metrics) - 1) * control_std**2 +
                              (len(treatment_metrics) - 1) * treatment_std**2) /
                             (len(control_metrics) + len(treatment_metrics) - 2))
        cohens_d = (treatment_mean - control_mean) / pooled_std if pooled_std > 0 else 0

        # Determine significance
        is_significant = p_value < self.significance_level

        return {
            'metric_name': metric_name,
            'control_mean': control_mean,
            'treatment_mean': treatment_mean,
            'absolute_difference': treatment_mean - control_mean,
            'relative_difference': ((treatment_mean - control_mean) / control_mean * 100) if control_mean != 0 else 0,
            'control_std': control_std,
            'treatment_std': treatment_std,
            't_statistic': t_statistic,
            'p_value': p_value,
            'is_significant': is_significant,
            'effect_size': cohens_d,
            'significance_level': self.significance_level
        }

    def analyze_ab_test_results(self,
                                control_results: Dict[str, List[float]],
                                treatment_results: Dict[str, List[float]]) -> Dict:
        """
        Analyze A/B test results across multiple metrics
        """
        analysis_results = {}

        # Only compare metrics present in both groups
        shared_metrics = set(control_results.keys()) & set(treatment_results.keys())

        for metric in shared_metrics:
            analysis_results[metric] = self.compare_metrics(
                control_results[metric],
                treatment_results[metric],
                metric
            )

        # Calculate overall summary
        significant_improvements = sum(1 for result in analysis_results.values()
                                       if result['is_significant'] and result['relative_difference'] > 0)
        significant_degradations = sum(1 for result in analysis_results.values()
                                       if result['is_significant'] and result['relative_difference'] < 0)

        analysis_results['summary'] = {
            'total_metrics_compared': len(shared_metrics),
            'significant_improvements': significant_improvements,
            'significant_degradations': significant_degradations,
            'no_significant_change': len(shared_metrics) - significant_improvements - significant_degradations
        }

        return analysis_results
```
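A usage sketch with invented per-query F1 scores for two chunking strategies; in practice these lists would come from the retrieval evaluation above.

```python
analyzer = ABTestAnalyzer()

control_f1 = [0.61, 0.58, 0.64, 0.59, 0.62, 0.60, 0.63, 0.57]
treatment_f1 = [0.66, 0.63, 0.69, 0.64, 0.67, 0.65, 0.68, 0.62]

result = analyzer.compare_metrics(control_f1, treatment_f1, "f1_score")
print(f"Δ={result['absolute_difference']:.3f}, "
      f"p={result['p_value']:.4f}, significant={result['is_significant']}")
```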
Automated Evaluation Pipeline
End-to-End Evaluation
```python
import random
from typing import Any, Dict, List

import numpy as np


class ChunkingEvaluationPipeline:
    def __init__(self, strategies: Dict[str, Any], dataset: EvaluationDataset):
        self.strategies = strategies
        self.dataset = dataset
        self.results = {}
        self.profiler = PerformanceProfiler()
        self.memory_profiler = MemoryProfiler()

    def run_evaluation(self) -> Dict:
        """Run comprehensive evaluation of all strategies"""
        evaluation_results = {}

        for strategy_name, strategy in self.strategies.items():
            print(f"Evaluating strategy: {strategy_name}")

            # Reset profilers for each strategy
            self.profiler = PerformanceProfiler()
            self.memory_profiler = MemoryProfiler()

            # Evaluate strategy
            strategy_results = self._evaluate_strategy(strategy, strategy_name)
            evaluation_results[strategy_name] = strategy_results

        # Compare strategies
        comparison_results = self._compare_strategies(evaluation_results)

        return {
            'individual_results': evaluation_results,
            'comparison': comparison_results,
            'recommendations': self._generate_recommendations(comparison_results)
        }
    def _evaluate_strategy(self, strategy: Any, strategy_name: str) -> Dict:
        """Evaluate a single chunking strategy"""
        results = {
            'strategy_name': strategy_name,
            'retrieval_metrics': {},
            'quality_metrics': {},
            'performance_metrics': {}
        }

        # Track memory usage
        self.memory_profiler.take_memory_snapshot(f"{strategy_name}_start")

        # Process all documents
        self.profiler.start_timer('total_processing')
        all_chunks = {}
        for doc_id, content in self.dataset.documents.items():
            self.profiler.start_timer('chunking')
            chunks = strategy.chunk(content)
            self.profiler.end_timer('chunking')
            all_chunks[doc_id] = chunks
        self.memory_profiler.take_memory_snapshot(f"{strategy_name}_after_chunking")

        # Generate embeddings for chunks
        self.profiler.start_timer('embedding')
        chunk_embeddings = self._generate_embeddings(all_chunks)
        self.profiler.end_timer('embedding')
        self.memory_profiler.take_memory_snapshot(f"{strategy_name}_after_embedding")

        # Evaluate retrieval performance
        retrieval_results = self._evaluate_retrieval(all_chunks, chunk_embeddings)
        results['retrieval_metrics'] = retrieval_results

        # Evaluate chunk quality
        quality_results = self._evaluate_chunk_quality(all_chunks)
        results['quality_metrics'] = quality_results

        # Get performance metrics
        self.profiler.end_timer('total_processing')
        performance_metrics = self.profiler.get_performance_metrics(len(self.dataset.documents))
        results['performance_metrics'] = performance_metrics.__dict__

        # Get memory metrics
        self.memory_profiler.take_memory_snapshot(f"{strategy_name}_end")
        results['memory_metrics'] = {
            'peak_memory_mb': self.memory_profiler.get_peak_memory_usage(),
            'memory_by_operation': self.memory_profiler.get_memory_usage_by_operation()
        }

        return results
    def _evaluate_retrieval(self, all_chunks: Dict, chunk_embeddings: Dict) -> Dict:
        """Evaluate retrieval performance"""
        retrieval_metrics = {
            'precision': [],
            'recall': [],
            'f1_score': [],
            'mrr': [],
            'map': []
        }

        for query in self.dataset.queries:
            # Perform retrieval
            self.profiler.start_timer('search')
            retrieved_chunks = self._retrieve_chunks(query.question, chunk_embeddings, k=10)
            self.profiler.end_timer('search')

            # Ground-truth relevant chunks come from the dataset, not from the
            # retrieved list; otherwise recall would always be 1.0.
            relevant_chunk_ids = set(query.relevant_chunk_ids)
            relevant_chunks = [self.dataset.chunks[cid] for cid in relevant_chunk_ids
                               if cid in self.dataset.chunks]

            # Calculate set-based metrics
            precision = calculate_precision(retrieved_chunks, relevant_chunks)
            recall = calculate_recall(retrieved_chunks, relevant_chunks)
            f1 = calculate_f1_score(precision, recall)

            retrieval_metrics['precision'].append(precision)
            retrieval_metrics['recall'].append(recall)
            retrieval_metrics['f1_score'].append(f1)

            # Calculate rank-based metrics
            reciprocal_rank = 0.0
            for rank, chunk in enumerate(retrieved_chunks, 1):
                if chunk.get('id') in relevant_chunk_ids:
                    reciprocal_rank = 1.0 / rank
                    break
            retrieval_metrics['mrr'].append(reciprocal_rank)
            retrieval_metrics['map'].append(
                calculate_average_precision(retrieved_chunks, relevant_chunks))

        # Calculate averages
        return {metric: float(np.mean(values)) if values else 0.0
                for metric, values in retrieval_metrics.items()}
    def _evaluate_chunk_quality(self, all_chunks: Dict) -> Dict:
        """Evaluate quality of generated chunks"""
        # ChunkQualityAssessor and DocumentAnalyzer are assumed to be available
        # (defined elsewhere in the chunk-quality assessment utilities).
        quality_assessor = ChunkQualityAssessor()
        quality_scores = []

        for doc_id, chunks in all_chunks.items():
            # Analyze document
            content = self.dataset.documents[doc_id]
            analyzer = DocumentAnalyzer()
            analysis = analyzer.analyze(content)

            # Assess chunk quality
            scores = quality_assessor.assess_chunks(chunks, analysis)
            quality_scores.append(scores)

        # Aggregate quality scores
        if quality_scores:
            avg_scores = {}
            for metric in quality_scores[0].keys():
                avg_scores[metric] = float(np.mean([scores[metric] for scores in quality_scores]))
            return avg_scores
        return {}
    def _compare_strategies(self, evaluation_results: Dict) -> Dict:
        """Compare performance across strategies"""
        ab_analyzer = ABTestAnalyzer()
        comparison = {}

        # Compare each metric across every pair of strategies
        strategy_names = list(evaluation_results.keys())
        for i in range(len(strategy_names)):
            for j in range(i + 1, len(strategy_names)):
                strategy1 = strategy_names[i]
                strategy2 = strategy_names[j]
                comparison_key = f"{strategy1}_vs_{strategy2}"
                comparison[comparison_key] = {}

                # Compare retrieval metrics. Note: significance testing needs
                # per-query values; passing single averaged values leaves the
                # t-statistic undefined, so store per-query lists where possible.
                for metric in ['precision', 'recall', 'f1_score']:
                    if (metric in evaluation_results[strategy1]['retrieval_metrics'] and
                            metric in evaluation_results[strategy2]['retrieval_metrics']):
                        comparison[comparison_key][f"retrieval_{metric}"] = ab_analyzer.compare_metrics(
                            [evaluation_results[strategy1]['retrieval_metrics'][metric]],
                            [evaluation_results[strategy2]['retrieval_metrics'][metric]],
                            f"retrieval_{metric}"
                        )

        return comparison
    def _generate_recommendations(self, comparison_results: Dict) -> Dict:
        """Generate recommendations based on evaluation results"""
        recommendations = {
            'best_overall': None,
            'best_for_precision': None,
            'best_for_recall': None,
            'best_for_performance': None,
            'trade_offs': []
        }

        # This would analyze the comparison results and generate specific
        # recommendations; the implementation depends on use case requirements.
        return recommendations
    def _generate_embeddings(self, all_chunks: Dict) -> Dict:
        """Generate embeddings for all chunks"""
        # This would use the actual embedding model; placeholder implementation.
        embeddings = {}
        for doc_id, chunks in all_chunks.items():
            embeddings[doc_id] = []
            for chunk in chunks:
                # Generate embedding for chunk content
                embedding = np.random.rand(384)  # Placeholder
                embeddings[doc_id].append({
                    'chunk': chunk,
                    'embedding': embedding
                })
        return embeddings

    def _retrieve_chunks(self, query: str, chunk_embeddings: Dict, k: int = 10) -> List[Dict]:
        """Retrieve most relevant chunks for a query"""
        # This would use actual similarity search; placeholder implementation.
        all_chunks = []
        for doc_embeddings in chunk_embeddings.values():
            for chunk_data in doc_embeddings:
                all_chunks.append(chunk_data['chunk'])

        # Simple random selection as placeholder
        return random.sample(all_chunks, min(k, len(all_chunks)))
```
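A usage sketch of wiring the pipeline together. The `FixedSizeStrategy` class below is a hypothetical stand-in for a real chunking strategy (anything exposing `chunk(text)` and returning dicts with `id` and `content` works), it reuses the `demo_dataset.json` file written above, and it assumes `ChunkQualityAssessor` and `DocumentAnalyzer` are importable. Because embedding and retrieval are random placeholders here, the reported numbers are illustrative only.

```python
class FixedSizeStrategy:
    """Hypothetical strategy: fixed-size character windows."""
    def __init__(self, size: int = 200):
        self.size = size

    def chunk(self, text: str):
        return [{'id': f'chunk_{i}', 'content': text[i:i + self.size]}
                for i in range(0, len(text), self.size)]


dataset = EvaluationDataset.load_from_file("demo_dataset.json")
eval_pipeline = ChunkingEvaluationPipeline(
    strategies={'fixed_200': FixedSizeStrategy(200), 'fixed_500': FixedSizeStrategy(500)},
    dataset=dataset,
)
report = eval_pipeline.run_evaluation()
print(report['individual_results']['fixed_200']['retrieval_metrics'])
```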
This evaluation framework provides the tools needed to assess chunking strategies across multiple dimensions: retrieval effectiveness, answer quality, and system performance, with statistical significance testing to support comparisons between strategies. The modular design allows for extension and customization based on specific requirements and use cases.