# Complete Implementation Guidelines

This document provides comprehensive implementation guidance for building effective chunking systems.

## System Architecture

### Core Components

```
Document Processor
├── Ingestion Layer
│   ├── Document Type Detection
│   ├── Format Parsing (PDF, HTML, Markdown, etc.)
│   └── Content Extraction
├── Analysis Layer
│   ├── Structure Analysis
│   ├── Content Type Identification
│   └── Complexity Assessment
├── Strategy Selection Layer
│   ├── Rule-based Selection
│   ├── ML-based Prediction
│   └── Adaptive Configuration
├── Chunking Layer
│   ├── Strategy Implementation
│   ├── Parameter Optimization
│   └── Quality Validation
└── Output Layer
    ├── Chunk Metadata Generation
    ├── Embedding Integration
    └── Storage Preparation
```
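
Read top to bottom, these layers form a linear pipeline. A minimal sketch of how the components defined in the following sections could be composed (the `DocumentProcessor` class itself is illustrative glue code, not part of the architecture above):

```python
class DocumentProcessor:
    """Illustrative end-to-end wiring of the components described in this guide."""

    def __init__(self):
        self.analyzer = DocumentAnalyzer()      # Analysis layer
        self.selector = StrategySelector()      # Strategy selection layer
        self.assessor = ChunkQualityAssessor()  # Quality validation

    def process(self, content: str):
        analysis = self.analyzer.analyze(content)        # structure, complexity, content types
        strategy = self.selector.get_strategy(analysis)  # rule-based strategy choice
        chunks = strategy.chunk(content, analysis)       # chunking layer
        quality = self.assessor.assess_chunks(chunks, analysis)
        return chunks, quality
```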

## Pre-processing Pipeline

### Document Analysis Framework

```python
from dataclasses import dataclass
from typing import List, Dict, Any
import re

@dataclass
class DocumentAnalysis:
    doc_type: str
    structure_score: float   # 0-1, higher means more structured
    complexity_score: float  # 0-1, higher means more complex
    content_types: List[str]
    language: str
    estimated_tokens: int
    has_multimodal: bool

class DocumentAnalyzer:
    def __init__(self):
        self.structure_patterns = {
            'markdown': [r'^#+\s', r'^\*\*.*\*\*$', r'^\* ', r'^\d+\. '],
            'html': [r'<h[1-6]>', r'<p>', r'<div>', r'<table>'],
            'latex': [r'\\section', r'\\subsection', r'\\begin\{', r'\\end\{'],
            'academic': [r'^\d+\.', r'^\d+\.\d+', r'^[A-Z]\.', r'^Figure \d+']
        }

    def analyze(self, content: str) -> DocumentAnalysis:
        doc_type = self.detect_document_type(content)
        structure_score = self.calculate_structure_score(content, doc_type)
        complexity_score = self.calculate_complexity_score(content)
        content_types = self.identify_content_types(content)
        language = self.detect_language(content)
        estimated_tokens = self.estimate_tokens(content)
        has_multimodal = self.detect_multimodal_content(content)

        return DocumentAnalysis(
            doc_type=doc_type,
            structure_score=structure_score,
            complexity_score=complexity_score,
            content_types=content_types,
            language=language,
            estimated_tokens=estimated_tokens,
            has_multimodal=has_multimodal
        )

    def detect_document_type(self, content: str) -> str:
        content_lower = content.lower()

        if '<html' in content_lower or '<body' in content_lower:
            return 'html'
        elif '#' in content and '##' in content:
            return 'markdown'
        elif '\\documentclass' in content_lower or '\\begin{' in content_lower:
            return 'latex'
        elif any(keyword in content_lower for keyword in ['abstract', 'introduction', 'conclusion', 'references']):
            return 'academic'
        elif 'def ' in content or 'class ' in content or 'function ' in content_lower:
            return 'code'
        else:
            return 'plain'

    def calculate_structure_score(self, content: str, doc_type: str) -> float:
        patterns = self.structure_patterns.get(doc_type, [])
        if not patterns:
            return 0.5  # Default for unstructured content

        line_count = len(content.split('\n'))
        structured_lines = 0

        for line in content.split('\n'):
            for pattern in patterns:
                if re.search(pattern, line.strip()):
                    structured_lines += 1
                    break

        return min(structured_lines / max(line_count, 1), 1.0)

    def calculate_complexity_score(self, content: str) -> float:
        # Factors that increase complexity
        avg_sentence_length = self.calculate_avg_sentence_length(content)
        vocabulary_richness = self.calculate_vocabulary_richness(content)
        nested_structure = self.detect_nested_structure(content)

        # Normalize and combine
        complexity = (
            min(avg_sentence_length / 30, 1.0) * 0.3 +
            vocabulary_richness * 0.4 +
            nested_structure * 0.3
        )

        return min(complexity, 1.0)

    def identify_content_types(self, content: str) -> List[str]:
        types = []

        if '```' in content or 'def ' in content or 'function ' in content.lower():
            types.append('code')
        if '|' in content and '\n' in content:
            types.append('tables')
        if re.search(r'!\[.*\]\(.*\)', content):
            types.append('images')
        if re.search(r'http[s]?://', content):
            types.append('links')
        if re.search(r'\d+\.\d+', content) or re.search(r'\$\d', content):
            types.append('numbers')

        return types if types else ['text']

    def detect_language(self, content: str) -> str:
        # Simple language detection - can be enhanced with proper language detection libraries
        if re.search(r'[\u4e00-\u9fff]', content):
            return 'chinese'
        elif re.search(r'[\u0600-\u06ff]', content):
            return 'arabic'
        elif re.search(r'[\u0400-\u04ff]', content):
            return 'russian'
        else:
            return 'english'  # Default assumption

    def estimate_tokens(self, content: str) -> int:
        # Rough estimation - actual tokenization varies by model
        word_count = len(content.split())
        return int(word_count * 1.3)  # ~1.3 tokens per word on average

    def detect_multimodal_content(self, content: str) -> bool:
        multimodal_indicators = [
            r'!\[.*\]\(.*\)',  # Images
            r'<iframe',        # Embedded content
            r'<object',        # Embedded objects
            r'<embed',         # Embedded media
        ]

        return any(re.search(pattern, content) for pattern in multimodal_indicators)

    def calculate_avg_sentence_length(self, content: str) -> float:
        sentences = re.split(r'[.!?]+', content)
        sentences = [s.strip() for s in sentences if s.strip()]
        if not sentences:
            return 0
        return sum(len(s.split()) for s in sentences) / len(sentences)

    def calculate_vocabulary_richness(self, content: str) -> float:
        words = content.lower().split()
        if not words:
            return 0
        unique_words = set(words)
        return len(unique_words) / len(words)

    def detect_nested_structure(self, content: str) -> float:
        # Detect nested lists, indented content, etc.
        lines = content.split('\n')
        indented_lines = 0

        for line in lines:
            if line.strip() and line.startswith(' '):
                indented_lines += 1

        return indented_lines / max(len(lines), 1)
```
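
As a quick usage example, running the analyzer on a small markdown snippet (the sample text is arbitrary; the comments describe what the fields mean rather than exact values):

```python
analyzer = DocumentAnalyzer()

sample = """# Release Notes

## Fixes
* Corrected token estimation
* Improved table parsing
"""

analysis = analyzer.analyze(sample)
print(analysis.doc_type)          # 'markdown' for this snippet
print(analysis.structure_score)   # fraction of lines matching markdown structure patterns
print(analysis.estimated_tokens)  # word count scaled by ~1.3
```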

### Strategy Selection Engine

```python
from abc import ABC, abstractmethod
from typing import Dict, Any, List

class ChunkingStrategy(ABC):
    @abstractmethod
    def chunk(self, content: str, analysis: DocumentAnalysis) -> List[Dict[str, Any]]:
        pass

class StrategySelector:
    def __init__(self):
        # RecursiveStrategy, StructureAwareStrategy, and SemanticStrategy are assumed to
        # implement the same ChunkingStrategy interface; only two examples are shown below.
        self.strategies = {
            'fixed_size': FixedSizeStrategy(),
            'recursive': RecursiveStrategy(),
            'structure_aware': StructureAwareStrategy(),
            'semantic': SemanticStrategy(),
            'adaptive': AdaptiveStrategy()
        }

    def select_strategy(self, analysis: DocumentAnalysis) -> str:
        # Rule-based selection logic
        if analysis.structure_score > 0.8 and analysis.doc_type in ['markdown', 'html', 'latex']:
            return 'structure_aware'
        elif analysis.complexity_score > 0.7 and analysis.estimated_tokens < 10000:
            return 'semantic'
        elif analysis.doc_type == 'code':
            return 'structure_aware'
        elif analysis.structure_score < 0.3:
            return 'fixed_size'
        elif analysis.complexity_score > 0.5:
            return 'recursive'
        else:
            return 'adaptive'

    def get_strategy(self, analysis: DocumentAnalysis) -> ChunkingStrategy:
        strategy_name = self.select_strategy(analysis)
        return self.strategies[strategy_name]

# Example strategy implementations
class FixedSizeStrategy(ChunkingStrategy):
    def __init__(self, default_size=512, default_overlap=50):
        self.default_size = default_size
        self.default_overlap = default_overlap

    def chunk(self, content: str, analysis: DocumentAnalysis) -> List[Dict[str, Any]]:
        # Adjust parameters based on analysis
        if analysis.complexity_score > 0.7:
            chunk_size = 1024
        elif analysis.complexity_score < 0.3:
            chunk_size = 256
        else:
            chunk_size = self.default_size

        overlap = int(chunk_size * 0.1)  # 10% overlap

        # Implementation here...
        return self._fixed_size_chunk(content, chunk_size, overlap)

    def _fixed_size_chunk(self, content: str, chunk_size: int, overlap: int) -> List[Dict[str, Any]]:
        # Implementation using RecursiveCharacterTextSplitter or custom logic
        pass

class AdaptiveStrategy(ChunkingStrategy):
    def chunk(self, content: str, analysis: DocumentAnalysis) -> List[Dict[str, Any]]:
        # Combine multiple strategies based on content characteristics
        structured_chunks: List[Dict[str, Any]] = []
        unstructured_chunks: List[Dict[str, Any]] = []

        if analysis.structure_score > 0.6:
            # Use structure-aware chunking for structured parts
            structured_chunks = self._chunk_structured_parts(content, analysis)
        else:
            # Use fixed-size chunking for unstructured parts
            unstructured_chunks = self._chunk_unstructured_parts(content, analysis)

        # Merge and optimize
        return self._merge_chunks(structured_chunks + unstructured_chunks)

    def _chunk_structured_parts(self, content: str, analysis: DocumentAnalysis) -> List[Dict[str, Any]]:
        # Implementation for structured content
        pass

    def _chunk_unstructured_parts(self, content: str, analysis: DocumentAnalysis) -> List[Dict[str, Any]]:
        # Implementation for unstructured content
        pass

    def _merge_chunks(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        # Implementation for merging and optimizing chunks
        pass
```
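
The `_fixed_size_chunk` helper above is intentionally left as a stub. A minimal sketch of one possible word-based sliding-window implementation (the returned dict keys and the standalone function form are illustrative assumptions, not prescribed by this guide):

```python
from typing import Any, Dict, List

def fixed_size_chunk(content: str, chunk_size: int, overlap: int) -> List[Dict[str, Any]]:
    """Sliding-window chunking over whitespace tokens (illustrative sketch)."""
    words = content.split()
    step = max(chunk_size - overlap, 1)  # advance by chunk_size minus overlap
    chunks: List[Dict[str, Any]] = []

    for start in range(0, len(words), step):
        window = words[start:start + chunk_size]
        if not window:
            break
        chunks.append({
            'content': ' '.join(window),
            'start_word': start,
            'end_word': start + len(window),
        })
        if start + chunk_size >= len(words):
            break  # last window already covers the tail; avoid redundant overlapping chunks

    return chunks
```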

## Quality Assurance Framework

### Chunk Quality Metrics

```python
import re
from typing import List, Dict, Any
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class ChunkQualityAssessor:
    def __init__(self):
        self.quality_weights = {
            'coherence': 0.3,
            'completeness': 0.25,
            'size_appropriateness': 0.2,
            'semantic_similarity': 0.15,
            'boundary_quality': 0.1
        }

    def assess_chunks(self, chunks: List[Dict[str, Any]], analysis: DocumentAnalysis) -> Dict[str, float]:
        scores = {}

        # Coherence: Do chunks make sense on their own?
        scores['coherence'] = self._assess_coherence(chunks)

        # Completeness: Do chunks preserve important information?
        scores['completeness'] = self._assess_completeness(chunks, analysis)

        # Size appropriateness: Are chunks within the optimal size range?
        scores['size_appropriateness'] = self._assess_size(chunks)

        # Semantic similarity: Are chunks thematically consistent?
        scores['semantic_similarity'] = self._assess_semantic_consistency(chunks)

        # Boundary quality: Are chunk boundaries placed well?
        scores['boundary_quality'] = self._assess_boundary_quality(chunks)

        # Calculate overall quality score
        overall_score = sum(
            score * self.quality_weights[metric]
            for metric, score in scores.items()
        )

        scores['overall'] = overall_score
        return scores

    def _assess_coherence(self, chunks: List[Dict[str, Any]]) -> float:
        # Simple heuristic-based coherence assessment
        coherence_scores = []

        for chunk in chunks:
            content = chunk['content']

            # Check for complete sentences
            sentences = re.split(r'[.!?]+', content)
            complete_sentences = sum(1 for s in sentences if s.strip())
            coherence = complete_sentences / max(len(sentences), 1)

            coherence_scores.append(coherence)

        return np.mean(coherence_scores)

    def _assess_completeness(self, chunks: List[Dict[str, Any]], analysis: DocumentAnalysis) -> float:
        # Check if important structural elements are preserved
        if analysis.doc_type in ['markdown', 'html']:
            return self._assess_structure_preservation(chunks, analysis)
        else:
            return self._assess_content_preservation(chunks)

    def _assess_structure_preservation(self, chunks: List[Dict[str, Any]], analysis: DocumentAnalysis) -> float:
        # Check if headings, lists, and other structural elements are preserved
        preserved_elements = 0
        total_elements = 0

        for chunk in chunks:
            content = chunk['content']

            # Count preserved structural elements
            headings = len(re.findall(r'^#+\s', content, re.MULTILINE))
            lists = len(re.findall(r'^\s*[-*+]\s', content, re.MULTILINE))

            preserved_elements += headings + lists
            total_elements += 1  # Simplified count

        return min(preserved_elements / max(total_elements, 1), 1.0)

    def _assess_content_preservation(self, chunks: List[Dict[str, Any]]) -> float:
        # Simple check based on content ratio
        total_content = ''.join(chunk['content'] for chunk in chunks)
        # This would need comparison with the original content
        return 0.8  # Placeholder

    def _assess_size(self, chunks: List[Dict[str, Any]]) -> float:
        optimal_min = 100   # tokens
        optimal_max = 1000  # tokens

        size_scores = []
        for chunk in chunks:
            token_count = self._estimate_tokens(chunk['content'])
            if optimal_min <= token_count <= optimal_max:
                score = 1.0
            elif token_count < optimal_min:
                score = token_count / optimal_min
            else:
                score = max(0, 1 - (token_count - optimal_max) / optimal_max)

            size_scores.append(score)

        return np.mean(size_scores)

    def _assess_semantic_consistency(self, chunks: List[Dict[str, Any]]) -> float:
        # This would require an embedding model for an actual implementation
        # Placeholder implementation
        return 0.7

    def _assess_boundary_quality(self, chunks: List[Dict[str, Any]]) -> float:
        # Check that boundaries don't split important content
        boundary_scores = []

        for i, chunk in enumerate(chunks):
            content = chunk['content']

            # Check for incomplete sentences at boundaries
            if not content.strip().endswith(('.', '!', '?', '>', '}')):
                boundary_scores.append(0.5)
            else:
                boundary_scores.append(1.0)

        return np.mean(boundary_scores)

    def _estimate_tokens(self, content: str) -> int:
        # Simple token estimation (~1.33 tokens per word)
        return len(content.split()) * 4 // 3
```
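
`_assess_semantic_consistency` is left as a placeholder above. One rough sketch of an embedding-based version follows; the `sentence-transformers` dependency and the `all-MiniLM-L6-v2` model name are assumptions for illustration, and the score is simply the mean pairwise similarity of sentences within each chunk:

```python
import re
import numpy as np
from sentence_transformers import SentenceTransformer  # assumed dependency
from sklearn.metrics.pairwise import cosine_similarity

_model = SentenceTransformer('all-MiniLM-L6-v2')  # illustrative model choice

def assess_semantic_consistency(chunks) -> float:
    """Average within-chunk sentence similarity as a consistency proxy (sketch)."""
    chunk_scores = []
    for chunk in chunks:
        sentences = [s.strip() for s in re.split(r'[.!?]+', chunk['content']) if s.strip()]
        if len(sentences) < 2:
            chunk_scores.append(1.0)  # a single sentence is trivially consistent
            continue
        embeddings = _model.encode(sentences)
        sims = cosine_similarity(embeddings)
        n = len(sentences)
        # Mean of the off-diagonal similarities (diagonal entries are all 1.0)
        chunk_scores.append((sims.sum() - n) / (n * (n - 1)))
    return float(np.mean(chunk_scores)) if chunk_scores else 0.0
```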

## Error Handling and Edge Cases

### Robust Error Handling

```python
import logging
from typing import Optional, List, Dict, Any
from dataclasses import dataclass

@dataclass
class ChunkingError:
    error_type: str
    message: str
    chunk_index: Optional[int] = None
    recovery_action: Optional[str] = None

class ChunkingErrorHandler:
    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.error_handlers = {
            'empty_content': self._handle_empty_content,
            'oversized_chunk': self._handle_oversized_chunk,
            'encoding_error': self._handle_encoding_error,
            'memory_error': self._handle_memory_error,
            'structure_parsing_error': self._handle_structure_parsing_error
        }

    def handle_error(self, error: Exception, context: Dict[str, Any]) -> ChunkingError:
        error_type = self._classify_error(error)
        handler = self.error_handlers.get(error_type, self._handle_generic_error)
        return handler(error, context)

    def _classify_error(self, error: Exception) -> str:
        if isinstance(error, ValueError) and 'empty' in str(error).lower():
            return 'empty_content'
        elif isinstance(error, MemoryError):
            return 'memory_error'
        elif isinstance(error, UnicodeError):
            return 'encoding_error'
        elif 'too large' in str(error).lower():
            return 'oversized_chunk'
        elif 'parsing' in str(error).lower():
            return 'structure_parsing_error'
        else:
            return 'generic_error'

    def _handle_empty_content(self, error: Exception, context: Dict[str, Any]) -> ChunkingError:
        self.logger.warning(f"Empty content encountered: {error}")
        return ChunkingError(
            error_type='empty_content',
            message=str(error),
            recovery_action='skip_empty_content'
        )

    def _handle_oversized_chunk(self, error: Exception, context: Dict[str, Any]) -> ChunkingError:
        self.logger.warning(f"Oversized chunk detected: {error}")
        return ChunkingError(
            error_type='oversized_chunk',
            message=str(error),
            chunk_index=context.get('chunk_index'),
            recovery_action='reduce_chunk_size'
        )

    def _handle_encoding_error(self, error: Exception, context: Dict[str, Any]) -> ChunkingError:
        self.logger.error(f"Encoding error: {error}")
        return ChunkingError(
            error_type='encoding_error',
            message=str(error),
            recovery_action='fallback_encoding'
        )

    def _handle_memory_error(self, error: Exception, context: Dict[str, Any]) -> ChunkingError:
        self.logger.error(f"Memory error during chunking: {error}")
        return ChunkingError(
            error_type='memory_error',
            message=str(error),
            recovery_action='process_in_batches'
        )

    def _handle_structure_parsing_error(self, error: Exception, context: Dict[str, Any]) -> ChunkingError:
        self.logger.warning(f"Structure parsing failed: {error}")
        return ChunkingError(
            error_type='structure_parsing_error',
            message=str(error),
            recovery_action='fallback_to_fixed_size'
        )

    def _handle_generic_error(self, error: Exception, context: Dict[str, Any]) -> ChunkingError:
        self.logger.error(f"Unexpected error during chunking: {error}")
        return ChunkingError(
            error_type='generic_error',
            message=str(error),
            recovery_action='skip_and_continue'
        )
```
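
One possible way to wire the handler around a chunking call is shown below; the `chunk_document` and `fallback_fixed_size` callables are illustrative placeholders, and only two recovery actions are dispatched here:

```python
handler = ChunkingErrorHandler()

def safe_chunk(content: str, chunk_document, fallback_fixed_size):
    """Attempt chunking and apply a coarse recovery action on failure (sketch)."""
    try:
        return chunk_document(content)
    except Exception as exc:
        err = handler.handle_error(exc, context={'content_length': len(content)})
        if err.recovery_action == 'fallback_to_fixed_size':
            return fallback_fixed_size(content)
        if err.recovery_action == 'skip_empty_content':
            return []
        # reduce_chunk_size, process_in_batches, etc. would be dispatched here;
        # re-raise anything that cannot be recovered from.
        raise
```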

## Performance Optimization

### Caching and Memoization

```python
import hashlib
import json
import logging
import pickle
from typing import Dict, Any, List, Optional
import redis

class ChunkingCache:
    def __init__(self, redis_url: Optional[str] = None):
        if redis_url:
            self.redis_client = redis.from_url(redis_url)
        else:
            self.redis_client = None
        self.local_cache = {}

    def _generate_cache_key(self, content: str, strategy: str, params: Dict[str, Any]) -> str:
        content_hash = hashlib.md5(content.encode()).hexdigest()
        params_str = json.dumps(params, sort_keys=True)
        params_hash = hashlib.md5(params_str.encode()).hexdigest()
        return f"chunking:{strategy}:{content_hash}:{params_hash}"

    def get(self, content: str, strategy: str, params: Dict[str, Any]) -> Optional[List[Dict[str, Any]]]:
        cache_key = self._generate_cache_key(content, strategy, params)

        # Try the local cache first
        if cache_key in self.local_cache:
            return self.local_cache[cache_key]

        # Try the Redis cache
        if self.redis_client:
            try:
                cached_data = self.redis_client.get(cache_key)
                if cached_data:
                    chunks = pickle.loads(cached_data)
                    self.local_cache[cache_key] = chunks  # Cache locally too
                    return chunks
            except Exception as e:
                logging.warning(f"Redis cache error: {e}")

        return None

    def set(self, content: str, strategy: str, params: Dict[str, Any], chunks: List[Dict[str, Any]]) -> None:
        cache_key = self._generate_cache_key(content, strategy, params)

        # Store in the local cache
        self.local_cache[cache_key] = chunks

        # Store in the Redis cache
        if self.redis_client:
            try:
                cached_data = pickle.dumps(chunks)
                self.redis_client.setex(cache_key, 3600, cached_data)  # 1 hour TTL
            except Exception as e:
                logging.warning(f"Redis cache set error: {e}")

    def clear_local_cache(self):
        self.local_cache.clear()

    def clear_redis_cache(self):
        if self.redis_client:
            pattern = "chunking:*"
            keys = self.redis_client.keys(pattern)
            if keys:
                self.redis_client.delete(*keys)
```
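
A typical check-then-compute flow around the cache looks like the sketch below; the `chunk_with_strategy` callable stands in for whichever chunking entry point is in use:

```python
cache = ChunkingCache(redis_url=None)  # local-only; pass a redis:// URL to share across processes

def chunk_with_cache(content: str, strategy: str, params, chunk_with_strategy):
    """Return cached chunks if available, otherwise chunk and populate the cache (sketch)."""
    cached = cache.get(content, strategy, params)
    if cached is not None:
        return cached

    chunks = chunk_with_strategy(content, strategy, params)
    cache.set(content, strategy, params, chunks)
    return chunks
```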

### Batch Processing

```python
import asyncio
import concurrent.futures
import logging
from typing import List, Callable, Dict, Any

class BatchChunkingProcessor:
    def __init__(self, max_workers: int = 4, batch_size: int = 10):
        self.max_workers = max_workers
        self.batch_size = batch_size

    def process_documents_batch(self, documents: List[str],
                                chunking_function: Callable[[str], List[Dict[str, Any]]]) -> List[List[Dict[str, Any]]]:
        """Process multiple documents in parallel."""
        results = []

        # Process in batches to avoid memory issues
        for i in range(0, len(documents), self.batch_size):
            batch = documents[i:i + self.batch_size]

            with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                future_to_doc = {
                    executor.submit(chunking_function, doc): doc
                    for doc in batch
                }

                batch_results = []
                # Note: as_completed yields in completion order, so results within a
                # batch are not aligned with the input order
                for future in concurrent.futures.as_completed(future_to_doc):
                    try:
                        chunks = future.result()
                        batch_results.append(chunks)
                    except Exception as e:
                        logging.error(f"Error processing document: {e}")
                        batch_results.append([])  # Empty result for failed processing

            results.extend(batch_results)

        return results

    async def process_documents_async(self, documents: List[str],
                                      chunking_function: Callable[[str], List[Dict[str, Any]]]) -> List[List[Dict[str, Any]]]:
        """Process documents asynchronously."""
        semaphore = asyncio.Semaphore(self.max_workers)

        async def process_single_document(doc: str) -> List[Dict[str, Any]]:
            async with semaphore:
                # Run the synchronous chunking function in an executor
                loop = asyncio.get_running_loop()
                return await loop.run_in_executor(None, chunking_function, doc)

        tasks = [process_single_document(doc) for doc in documents]
        return await asyncio.gather(*tasks, return_exceptions=True)
```
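
Both entry points accept any `chunking_function` with the signature `(str) -> List[Dict[str, Any]]`. A minimal driver might look like this; `my_chunker` is a stand-in, not a strategy from this guide:

```python
import asyncio

def my_chunker(doc: str):
    # Stand-in for a real strategy: one chunk per paragraph
    return [{'content': p} for p in doc.split('\n\n') if p.strip()]

processor = BatchChunkingProcessor(max_workers=4, batch_size=10)
documents = ["First document.\n\nSecond paragraph.", "Another document."]

# Thread-pool based processing
all_chunks = processor.process_documents_batch(documents, my_chunker)

# Async processing (exceptions appear in-place because of return_exceptions=True)
all_chunks_async = asyncio.run(processor.process_documents_async(documents, my_chunker))
```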

## Monitoring and Observability

### Metrics Collection

```python
import time
from dataclasses import dataclass
from typing import Dict, Any, List
from collections import defaultdict

@dataclass
class ChunkingMetrics:
    total_documents: int
    total_chunks: int
    avg_chunk_size: float
    processing_time: float
    memory_usage: float
    error_count: int
    strategy_distribution: Dict[str, int]

class MetricsCollector:
    def __init__(self):
        self.metrics = defaultdict(list)
        # Strategy usage is a counter, not a list, so it lives in its own dict
        self.strategy_counts: Dict[str, int] = defaultdict(int)
        self.start_time = None

    def start_timing(self):
        self.start_time = time.time()

    def end_timing(self) -> float:
        if self.start_time:
            duration = time.time() - self.start_time
            self.metrics['processing_time'].append(duration)
            self.start_time = None
            return duration
        return 0.0

    def record_chunk_count(self, count: int):
        self.metrics['chunk_count'].append(count)

    def record_chunk_size(self, size: int):
        self.metrics['chunk_size'].append(size)

    def record_strategy_usage(self, strategy: str):
        self.strategy_counts[strategy] += 1

    def record_error(self, error_type: str):
        self.metrics['errors'].append(error_type)

    def record_memory_usage(self, memory_mb: float):
        self.metrics['memory_usage'].append(memory_mb)

    def get_summary(self) -> ChunkingMetrics:
        return ChunkingMetrics(
            total_documents=len(self.metrics['processing_time']),
            total_chunks=sum(self.metrics['chunk_count']),
            avg_chunk_size=sum(self.metrics['chunk_size']) / max(len(self.metrics['chunk_size']), 1),
            processing_time=sum(self.metrics['processing_time']),
            memory_usage=sum(self.metrics['memory_usage']) / max(len(self.metrics['memory_usage']), 1),
            error_count=len(self.metrics['errors']),
            strategy_distribution=dict(self.strategy_counts)
        )

    def reset(self):
        self.metrics.clear()
        self.strategy_counts.clear()
        self.start_time = None
```
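
Instrumenting a chunking run is then a matter of bracketing the work with the collector. The sketch below reuses the analyzer and selector defined earlier in this guide; the driver loop itself is illustrative:

```python
collector = MetricsCollector()
analyzer = DocumentAnalyzer()
selector = StrategySelector()

documents = ["Example document text."]

for doc in documents:
    collector.start_timing()
    try:
        analysis = analyzer.analyze(doc)
        strategy_name = selector.select_strategy(analysis)
        collector.record_strategy_usage(strategy_name)

        chunks = selector.get_strategy(analysis).chunk(doc, analysis)
        collector.record_chunk_count(len(chunks))
        for chunk in chunks:
            collector.record_chunk_size(len(chunk['content'].split()))
    except Exception as exc:
        collector.record_error(type(exc).__name__)
    finally:
        collector.end_timing()

print(collector.get_summary())
```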
This implementation guide provides a comprehensive foundation for building robust, scalable chunking systems that can handle various document types and use cases while maintaining high quality and performance.