# Advanced Chunking Strategies

This document provides detailed implementations of 11 advanced chunking strategies for comprehensive RAG systems.

## Strategy Overview

| Strategy | Complexity | Use Case | Key Benefit |
|----------|------------|----------|-------------|
| Fixed-Length | Low | Simple documents, baseline | Easy implementation |
| Sentence-Based | Medium | General text processing | Natural language boundaries |
| Paragraph-Based | Medium | Structured documents | Context preservation |
| Sliding Window | Medium | Context-critical queries | Overlap for continuity |
| Semantic | High | Complex documents | Thematic coherence |
| Recursive | Medium | Mixed content types | Hierarchical structure |
| Context-Enriched | High | Technical documents | Enhanced context |
| Modality-Specific | High | Multi-modal content | Specialized handling |
| Agentic | Very High | Dynamic requirements | Adaptive chunking |
| Subdocument | Medium | Large documents | Logical grouping |
| Hybrid | Very High | Complex systems | Best-of-all approaches |

## 1. Fixed-Length Chunking

### Overview
Divide documents into chunks of fixed character/token count regardless of content structure.

### Implementation

```python
from langchain.text_splitter import CharacterTextSplitter
import tiktoken

class FixedLengthChunker:
    def __init__(self, chunk_size=1000, chunk_overlap=200, encoding_name="cl100k_base"):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.encoding = tiktoken.get_encoding(encoding_name)

    def chunk_by_characters(self, text):
        """Chunk by character count"""
        splitter = CharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separator="\n\n"
        )
        return splitter.split_text(text)

    def chunk_by_tokens(self, text):
        """Chunk by token count using tiktoken"""
        tokens = self.encoding.encode(text)
        chunks = []
        start = 0

        while start < len(tokens):
            end = min(start + self.chunk_size, len(tokens))
            chunk_tokens = tokens[start:end]
            chunk_text = self.encoding.decode(chunk_tokens)
            chunks.append(chunk_text)

            # Calculate next start position with overlap
            start = max(0, end - self.chunk_overlap)

            # Prevent infinite loop
            if end >= len(tokens):
                break

        return chunks

    def chunk_optimized(self, text, strategy="balanced"):
        """Optimized chunking based on strategy"""
        strategies = {
            "conservative": {"chunk_size": 500, "overlap": 100},
            "balanced": {"chunk_size": 1000, "overlap": 200},
            "aggressive": {"chunk_size": 2000, "overlap": 400}
        }

        config = strategies.get(strategy, strategies["balanced"])
        self.chunk_size = config["chunk_size"]
        self.chunk_overlap = config["overlap"]

        return self.chunk_by_tokens(text)
```

### Best Practices
- Start with 1000 tokens for general use
- Use 10-20% overlap for context preservation
- Adjust based on embedding model context window
- Consider document type for optimal sizing

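A minimal usage sketch applying the recommendations above (roughly 1000-token chunks with ~20% overlap); the sample text and printed fields are illustrative only:

```python
# Illustrative only: token-based chunking with the recommended defaults.
chunker = FixedLengthChunker(chunk_size=1000, chunk_overlap=200)

sample_text = "Retrieval-augmented generation pipelines benefit from consistent chunk sizes. " * 200
token_chunks = chunker.chunk_by_tokens(sample_text)

print(f"Produced {len(token_chunks)} chunks of up to {chunker.chunk_size} tokens each")
print(token_chunks[0][:80], "...")
```
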
## 2. Sentence-Based Chunking

### Overview
Split documents at sentence boundaries while maintaining target chunk sizes.

### Implementation

```python
import nltk
import spacy
from typing import List

class SentenceChunker:
    def __init__(self, max_sentences=10, overlap_sentences=2, library="spacy"):
        self.max_sentences = max_sentences
        self.overlap_sentences = overlap_sentences
        self.library = library

        if library == "spacy":
            self.nlp = spacy.load("en_core_web_sm")
        elif library == "nltk":
            nltk.download('punkt')

    def extract_sentences_spacy(self, text):
        """Extract sentences using spaCy"""
        doc = self.nlp(text)
        return [sent.text.strip() for sent in doc.sents]

    def extract_sentences_nltk(self, text):
        """Extract sentences using NLTK"""
        sentences = nltk.sent_tokenize(text)
        return [sent.strip() for sent in sentences]

    def chunk_sentences(self, text):
        """Chunk text by sentences"""
        if self.library == "spacy":
            sentences = self.extract_sentences_spacy(text)
        else:
            sentences = self.extract_sentences_nltk(text)

        chunks = []
        for i in range(0, len(sentences), self.max_sentences - self.overlap_sentences):
            end_idx = min(i + self.max_sentences, len(sentences))
            chunk_sentences = sentences[i:end_idx]

            if chunk_sentences:
                chunk = " ".join(chunk_sentences)
                chunks.append(chunk)

        return chunks

    def chunk_with_metadata(self, text):
        """Chunk with sentence count metadata"""
        sentences = self.extract_sentences_spacy(text)
        chunks = []

        for i in range(0, len(sentences), self.max_sentences - self.overlap_sentences):
            end_idx = min(i + self.max_sentences, len(sentences))
            chunk_sentences = sentences[i:end_idx]

            if chunk_sentences:
                chunk = {
                    "text": " ".join(chunk_sentences),
                    "sentence_count": len(chunk_sentences),
                    "start_sentence": i,
                    "end_sentence": end_idx - 1,
                    "overlap": self.overlap_sentences > 0 and i > 0
                }
                chunks.append(chunk)

        return chunks
```

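A brief usage sketch (assumes the spaCy model `en_core_web_sm` is installed; the sample text is illustrative):

```python
# Illustrative usage: 10-sentence windows with a 2-sentence overlap.
chunker = SentenceChunker(max_sentences=10, overlap_sentences=2, library="spacy")

text = "This is a sample sentence about chunking. " * 15
for chunk in chunker.chunk_with_metadata(text):
    print(chunk["sentence_count"], "sentences |", chunk["text"][:60], "...")
```
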
## 3. Paragraph-Based Chunking

### Overview
Split documents at paragraph boundaries while maintaining semantic coherence.

### Implementation

```python
import re
from typing import List, Dict

class ParagraphChunker:
    def __init__(self, max_paragraphs=5, min_length=100, merge_short=True):
        self.max_paragraphs = max_paragraphs
        self.min_length = min_length
        self.merge_short = merge_short

    def extract_paragraphs(self, text):
        """Extract paragraphs from text"""
        # Split on various paragraph separators
        paragraphs = re.split(r'\n\s*\n|\r\n\s*\r\n', text)

        # Clean and filter paragraphs
        cleaned_paragraphs = []
        for para in paragraphs:
            para = para.strip()
            if para and len(para) > self.min_length // 4:  # Allow short paragraphs
                cleaned_paragraphs.append(para)

        return cleaned_paragraphs

    def chunk_paragraphs(self, text):
        """Chunk text by paragraphs"""
        paragraphs = self.extract_paragraphs(text)
        chunks = []
        current_chunk = []
        current_length = 0

        for i, paragraph in enumerate(paragraphs):
            paragraph_length = len(paragraph)

            # If adding this paragraph exceeds reasonable limits, start new chunk
            if (current_chunk and
                (len(current_chunk) >= self.max_paragraphs or
                 current_length + paragraph_length > 3000)):

                # Save current chunk
                if current_chunk:
                    chunks.append("\n\n".join(current_chunk))

                # Start new chunk with overlap
                overlap_count = min(2, len(current_chunk))
                current_chunk = current_chunk[-overlap_count:] if overlap_count > 0 else []
                current_length = sum(len(p) for p in current_chunk)

            current_chunk.append(paragraph)
            current_length += paragraph_length

        # Add final chunk
        if current_chunk:
            chunks.append("\n\n".join(current_chunk))

        return chunks

    def chunk_with_structure(self, text):
        """Chunk while preserving structure information"""
        paragraphs = self.extract_paragraphs(text)
        chunks = []
        current_chunk = []
        current_start = 0

        for i, paragraph in enumerate(paragraphs):
            current_chunk.append(paragraph)

            # Check if we should end the current chunk
            should_end = (
                len(current_chunk) >= self.max_paragraphs or
                (i < len(paragraphs) - 1 and
                 self._is_boundary_paragraph(paragraph, paragraphs[i + 1]))
            )

            if should_end or i == len(paragraphs) - 1:
                chunk_data = {
                    "text": "\n\n".join(current_chunk),
                    "paragraph_count": len(current_chunk),
                    "start_paragraph": current_start,
                    "end_paragraph": i,
                    "structure_type": self._detect_structure_type(current_chunk)
                }
                chunks.append(chunk_data)

                # Prepare for next chunk
                current_start = i + 1
                overlap_count = min(1, len(current_chunk))
                current_chunk = current_chunk[-overlap_count:] if overlap_count > 0 else []

        return chunks

    def _is_boundary_paragraph(self, current, next_para):
        """Check if there's a natural boundary between paragraphs"""
        boundary_indicators = [
            lambda c, n: c.strip().endswith(':'),                # Ends with colon
            lambda c, n: n.strip().startswith(('•', '-', '*')),  # List starts
            lambda c, n: bool(re.match(r'^\d+\.', n.strip())),   # Numbered list
            lambda c, n: len(n.strip()) < 50,                    # Very short paragraph
        ]

        return any(indicator(current, next_para) for indicator in boundary_indicators)

    def _detect_structure_type(self, paragraphs):
        """Detect the type of structure in the chunk"""
        text = " ".join(paragraphs)

        if re.search(r'^#+\s', text, re.MULTILINE):
            return "markdown_headings"
        elif re.search(r'^\s*[-*+]\s', text, re.MULTILINE):
            return "bullet_points"
        elif re.search(r'^\s*\d+\.\s', text, re.MULTILINE):
            return "numbered_list"
        elif any(char.isdigit() for char in text) and ('%' in text or '$' in text):
            return "data_heavy"
        else:
            return "prose"
```

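A short usage sketch (the sample document is illustrative):

```python
# Illustrative usage: chunk a small document and inspect the detected structure types.
chunker = ParagraphChunker(max_paragraphs=5, min_length=100)

doc = (
    "The first paragraph introduces the topic in a few sentences.\n\n"
    "The second paragraph lists requirements:\n\n"
    "- at least one gigabyte of memory\n- a recent Python interpreter\n\n"
    "The closing paragraph summarises the main points."
)
for chunk in chunker.chunk_with_structure(doc):
    print(chunk["structure_type"], "|", chunk["paragraph_count"], "paragraph(s)")
```
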
## 4. Sliding Window Chunking

### Overview
Create overlapping chunks using a sliding window approach for maximum context preservation.

### Implementation

```python
from typing import List, Iterator
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity  # used by the semantic boundary helper below

class SlidingWindowChunker:
    def __init__(self, window_size=1000, step_size=500, unit="tokens"):
        self.window_size = window_size
        self.step_size = step_size
        self.unit = unit

    def sliding_chunk_tokens(self, text, encoding_name="cl100k_base"):
        """Create sliding window chunks by tokens"""
        import tiktoken

        encoding = tiktoken.get_encoding(encoding_name)
        tokens = encoding.encode(text)
        chunks = []

        for start in range(0, len(tokens), self.step_size):
            end = min(start + self.window_size, len(tokens))
            window_tokens = tokens[start:end]
            chunk_text = encoding.decode(window_tokens)

            chunks.append({
                "text": chunk_text,
                "start_token": start,
                "end_token": end - 1,
                "token_count": len(window_tokens),
                "overlap": self.window_size - self.step_size
            })

            if end >= len(tokens):
                break

        return chunks

    def sliding_chunk_characters(self, text):
        """Create sliding window chunks by characters"""
        chunks = []

        for start in range(0, len(text), self.step_size):
            end = min(start + self.window_size, len(text))
            chunk_text = text[start:end]

            chunks.append({
                "text": chunk_text,
                "start_char": start,
                "end_char": end - 1,
                "char_count": len(chunk_text),
                "overlap": self.window_size - self.step_size
            })

            if end >= len(text):
                break

        return chunks

    def adaptive_sliding_window(self, text, min_overlap=0.1, max_overlap=0.5):
        """Adaptive sliding window based on content density"""
        if self.unit == "tokens":
            base_chunks = self.sliding_chunk_tokens(text)
        else:
            base_chunks = self.sliding_chunk_characters(text)

        # Analyze content density
        adaptive_chunks = []
        for i, chunk in enumerate(base_chunks):
            text_content = chunk["text"]
            density = self._calculate_content_density(text_content)

            # Adjust overlap based on density
            if density > 0.8:    # High density - more overlap
                adjusted_overlap = int(self.window_size * max_overlap)
            elif density < 0.3:  # Low density - less overlap
                adjusted_overlap = int(self.window_size * min_overlap)
            else:
                adjusted_overlap = self.window_size - self.step_size

            chunk["content_density"] = density
            chunk["adjusted_overlap"] = adjusted_overlap
            adaptive_chunks.append(chunk)

        return adaptive_chunks

    def _calculate_content_density(self, text):
        """Calculate content density (information per unit)"""
        # Simple heuristic: unique words / total words
        words = text.split()
        if not words:
            return 0.0

        unique_words = set(word.lower().strip('.,!?;:()[]{}"\'') for word in words)
        density = len(unique_words) / len(words)

        # Adjust for punctuation and special characters
        special_chars = sum(1 for char in text if not char.isalnum() and not char.isspace())
        density += special_chars / len(text) * 0.1

        return min(density, 1.0)

    def semantic_sliding_window(self, text, embedding_model, similarity_threshold=0.7):
        """Sliding window with semantic boundary detection"""
        # Split into sentences
        sentences = self._split_into_sentences(text)
        if len(sentences) < 2:
            return [{"text": text, "method": "single_sentence"}]

        # Generate sentence embeddings
        sentence_embeddings = embedding_model.encode(sentences)

        chunks = []
        current_window_sentences = []
        current_window_start = 0

        for i, sentence in enumerate(sentences):
            current_window_sentences.append(sentence)

            # Check if we should create a boundary
            should_create_boundary = (
                len(current_window_sentences) >= 10 or  # Max sentences per window
                (i < len(sentences) - 1 and             # Not the last sentence
                 self._should_create_semantic_boundary(
                     sentence_embeddings, i, similarity_threshold))
            )

            if should_create_boundary:
                chunk_text = " ".join(current_window_sentences)
                chunks.append({
                    "text": chunk_text,
                    "sentence_count": len(current_window_sentences),
                    "start_sentence": current_window_start,
                    "end_sentence": i,
                    "method": "semantic_sliding_window"
                })

                # Start new window with overlap
                overlap_size = min(2, len(current_window_sentences) // 2)
                current_window_sentences = current_window_sentences[-overlap_size:]
                current_window_start = i + 1 - overlap_size

        # Add final chunk
        if current_window_sentences:
            chunk_text = " ".join(current_window_sentences)
            chunks.append({
                "text": chunk_text,
                "sentence_count": len(current_window_sentences),
                "start_sentence": current_window_start,
                "end_sentence": len(sentences) - 1,
                "method": "semantic_sliding_window"
            })

        return chunks

    def _split_into_sentences(self, text):
        """Split text into sentences"""
        import re

        # Simple sentence splitting
        sentences = re.split(r'[.!?]+', text)
        return [s.strip() for s in sentences if s.strip()]

    def _should_create_semantic_boundary(self, embeddings, current_idx, threshold):
        """Determine if semantic boundary should be created"""
        if current_idx >= len(embeddings) - 1:
            return True

        # Calculate similarity with next sentence
        current_embedding = embeddings[current_idx].reshape(1, -1)
        next_embedding = embeddings[current_idx + 1].reshape(1, -1)

        similarity = cosine_similarity(current_embedding, next_embedding)[0][0]
        return similarity < threshold
```

## 5. Semantic Chunking

### Overview
Use semantic similarity to identify natural boundaries in text.

### Implementation

```python
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from typing import List, Dict

class SemanticChunker:
    def __init__(self, model_name="all-MiniLM-L6-v2", similarity_threshold=0.8,
                 min_chunk_size=2, max_chunk_size=10):
        self.model = SentenceTransformer(model_name)
        self.similarity_threshold = similarity_threshold
        self.min_chunk_size = min_chunk_size
        self.max_chunk_size = max_chunk_size

    def semantic_chunk_sentences(self, text):
        """Chunk text based on semantic similarity between sentences"""
        # Split into sentences
        sentences = self._split_into_sentences(text)
        if len(sentences) <= self.min_chunk_size:
            return [{"text": text, "sentence_count": len(sentences), "method": "single_chunk"}]

        # Generate embeddings for all sentences
        sentence_embeddings = self.model.encode(sentences)

        # Find semantic boundaries
        boundaries = self._find_semantic_boundaries(sentence_embeddings)

        # Create chunks based on boundaries
        chunks = []
        start_idx = 0

        for boundary_idx in boundaries:
            if boundary_idx > start_idx:
                chunk_sentences = sentences[start_idx:boundary_idx + 1]
                chunk_text = " ".join(chunk_sentences)
                chunks.append({
                    "text": chunk_text,
                    "sentence_count": len(chunk_sentences),
                    "start_sentence": start_idx,
                    "end_sentence": boundary_idx,
                    "method": "semantic_boundary"
                })
                start_idx = boundary_idx + 1

        # Add remaining sentences
        if start_idx < len(sentences):
            chunk_sentences = sentences[start_idx:]
            chunk_text = " ".join(chunk_sentences)
            chunks.append({
                "text": chunk_text,
                "sentence_count": len(chunk_sentences),
                "start_sentence": start_idx,
                "end_sentence": len(sentences) - 1,
                "method": "semantic_boundary"
            })

        return self._merge_small_chunks(chunks)

    def _find_semantic_boundaries(self, embeddings):
        """Find semantic boundaries based on similarity thresholds"""
        boundaries = []

        for i in range(len(embeddings) - 1):
            # Calculate similarity between consecutive sentences
            similarity = cosine_similarity(
                embeddings[i].reshape(1, -1),
                embeddings[i + 1].reshape(1, -1)
            )[0][0]

            # If similarity is below threshold, create boundary
            if similarity < self.similarity_threshold:
                boundaries.append(i)

        return boundaries

    def _split_into_sentences(self, text):
        """Split text into sentences"""
        import re

        # Enhanced sentence splitting
        sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
        return [s.strip() for s in sentences if s.strip()]

    def _merge_small_chunks(self, chunks):
        """Merge chunks that are too small"""
        if not chunks:
            return chunks

        merged_chunks = []
        current_chunk = chunks[0].copy()

        for next_chunk in chunks[1:]:
            if (current_chunk["sentence_count"] < self.min_chunk_size and
                current_chunk["sentence_count"] + next_chunk["sentence_count"] <= self.max_chunk_size):
                # Merge chunks
                current_chunk["text"] += " " + next_chunk["text"]
                current_chunk["sentence_count"] += next_chunk["sentence_count"]
                current_chunk["end_sentence"] = next_chunk["end_sentence"]
            else:
                merged_chunks.append(current_chunk)
                current_chunk = next_chunk.copy()

        merged_chunks.append(current_chunk)
        return merged_chunks

    def adaptive_semantic_chunking(self, text, content_analyzer=None):
        """Semantic chunking with adaptive threshold"""
        sentences = self._split_into_sentences(text)
        if len(sentences) <= 2:
            return [{"text": text, "method": "too_short"}]

        # Generate embeddings
        embeddings = self.model.encode(sentences)

        # Analyze content complexity
        if content_analyzer:
            complexity = content_analyzer.analyze_complexity(text)
            # Adjust threshold based on complexity
            adaptive_threshold = self.similarity_threshold * (1.0 + complexity * 0.2)
        else:
            adaptive_threshold = self.similarity_threshold

        # Find boundaries with adaptive threshold
        boundaries = self._find_adaptive_boundaries(embeddings, adaptive_threshold)

        # Create chunks
        chunks = []
        start_idx = 0

        for boundary_idx in boundaries:
            if boundary_idx > start_idx:
                chunk_sentences = sentences[start_idx:boundary_idx + 1]
                chunk_text = " ".join(chunk_sentences)
                chunks.append({
                    "text": chunk_text,
                    "sentence_count": len(chunk_sentences),
                    "start_sentence": start_idx,
                    "end_sentence": boundary_idx,
                    "method": "adaptive_semantic",
                    "threshold_used": adaptive_threshold
                })
                start_idx = boundary_idx + 1

        # Add remaining sentences
        if start_idx < len(sentences):
            chunk_sentences = sentences[start_idx:]
            chunk_text = " ".join(chunk_sentences)
            chunks.append({
                "text": chunk_text,
                "sentence_count": len(chunk_sentences),
                "start_sentence": start_idx,
                "end_sentence": len(sentences) - 1,
                "method": "adaptive_semantic",
                "threshold_used": adaptive_threshold
            })

        return chunks

    def _find_adaptive_boundaries(self, embeddings, threshold):
        """Find boundaries with adaptive threshold based on local context"""
        boundaries = []

        for i in range(len(embeddings) - 1):
            # Calculate local similarity
            local_similarities = []

            # Look at local window of similarities
            window_size = min(3, i)
            for j in range(max(0, i - window_size), i + 1):
                if j < len(embeddings) - 1:
                    similarity = cosine_similarity(
                        embeddings[j].reshape(1, -1),
                        embeddings[j + 1].reshape(1, -1)
                    )[0][0]
                    local_similarities.append(similarity)

            # Use local average for comparison
            if local_similarities:
                local_avg = np.mean(local_similarities)
                current_similarity = local_similarities[-1]

                # Create boundary if current similarity is significantly lower than local average
                if current_similarity < local_avg * threshold:
                    boundaries.append(i)
            else:
                # Fallback to global threshold
                similarity = cosine_similarity(
                    embeddings[i].reshape(1, -1),
                    embeddings[i + 1].reshape(1, -1)
                )[0][0]
                if similarity < threshold:
                    boundaries.append(i)

        return boundaries

    def hierarchical_semantic_chunking(self, text, max_levels=3):
        """Multi-level semantic chunking"""
        sentences = self._split_into_sentences(text)
        if len(sentences) <= 4:
            return [{
                "text": text,
                "level": 0,
                "sentence_count": len(sentences),
                "method": "hierarchical_semantic"
            }]

        # Level 0: Original text
        chunks = [{
            "text": text,
            "level": 0,
            "sentence_count": len(sentences),
            "method": "hierarchical_semantic"
        }]

        # Generate embeddings once
        embeddings = self.model.encode(sentences)

        # Create hierarchical chunks
        current_level_sentences = sentences
        current_level_embeddings = embeddings

        for level in range(1, max_levels + 1):
            if len(current_level_sentences) <= 2:
                break

            # Find boundaries at this level
            boundaries = self._find_semantic_boundaries(current_level_embeddings)

            # Create chunks at this level
            level_chunks = []
            start_idx = 0

            for boundary_idx in boundaries:
                if boundary_idx > start_idx:
                    chunk_sentences = current_level_sentences[start_idx:boundary_idx + 1]
                    chunk_text = " ".join(chunk_sentences)
                    level_chunks.append({
                        "text": chunk_text,
                        "level": level,
                        "sentence_count": len(chunk_sentences),
                        "start_sentence": start_idx,
                        "end_sentence": boundary_idx,
                        "method": "hierarchical_semantic"
                    })
                    start_idx = boundary_idx + 1

            # Add remaining sentences
            if start_idx < len(current_level_sentences):
                chunk_sentences = current_level_sentences[start_idx:]
                chunk_text = " ".join(chunk_sentences)
                level_chunks.append({
                    "text": chunk_text,
                    "level": level,
                    "sentence_count": len(chunk_sentences),
                    "start_sentence": start_idx,
                    "end_sentence": len(current_level_sentences) - 1,
                    "method": "hierarchical_semantic"
                })

            chunks.extend(level_chunks)

            # Prepare for next level
            if len(level_chunks) > 1:
                current_level_sentences = [chunk["text"] for chunk in level_chunks]
                current_level_embeddings = self.model.encode(current_level_sentences)
            else:
                break

        return chunks
```

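A short usage sketch (may download the `all-MiniLM-L6-v2` model on first run; the sample text is illustrative):

```python
# Illustrative usage: boundaries appear where consecutive sentences diverge semantically.
chunker = SemanticChunker(similarity_threshold=0.8)

text = (
    "Transformers dominate modern NLP. Attention layers model token interactions. "
    "Meanwhile, the recipe calls for two cups of flour. Bake at 180C for twenty minutes."
)
for chunk in chunker.semantic_chunk_sentences(text):
    print(chunk["method"], "|", chunk["text"])
```
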
## 6. Recursive Chunking

### Overview
Hierarchical splitting using ordered separators to preserve document structure.

### Implementation

```python
from typing import List, Dict, Optional
import re

class RecursiveChunker:
    def __init__(self, chunk_size=1000, chunk_overlap=200, separators=None, length_function=len):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.length_function = length_function

        # Default separators in order of preference
        self.separators = separators or [
            "\n\n\n",  # Triple newlines (section breaks)
            "\n\n",    # Double newlines (paragraph breaks)
            "\n",      # Single newlines (line breaks)
            " ",       # Spaces (word breaks)
            ""         # Character-level (last resort)
        ]

    def recursive_split(self, text, separators=None):
        """Recursively split text using hierarchical separators"""
        separators = separators or self.separators
        final_chunks = []

        # Try each separator in order
        for separator in separators:
            if separator == "":
                # Last resort: split by characters
                return self._split_by_characters(text)

            # Split by current separator
            splits = text.split(separator)

            # Filter out empty splits
            splits = [split for split in splits if split.strip()]

            if len(splits) > 1:
                # Found a good separator
                for split in splits:
                    if self.length_function(split) <= self.chunk_size:
                        final_chunks.append(split)
                    else:
                        # Recursively split this piece
                        sub_chunks = self.recursive_split(split, separators[separators.index(separator) + 1:])
                        final_chunks.extend(sub_chunks)

                return self._merge_chunks(final_chunks)

        # No separator worked, split by characters
        return self._split_by_characters(text)

    def _split_by_characters(self, text):
        """Split text by characters as last resort"""
        chunks = []
        start = 0

        while start < len(text):
            end = min(start + self.chunk_size, len(text))
            chunk = text[start:end]
            chunks.append(chunk)

            # Calculate next start with overlap
            start = max(0, end - self.chunk_overlap)

            if end >= len(text):
                break

        return chunks

    def _merge_chunks(self, chunks):
        """Merge chunks that are too small"""
        if not chunks:
            return chunks

        merged_chunks = []
        current_chunk = chunks[0]

        for next_chunk in chunks[1:]:
            combined_length = self.length_function(current_chunk + next_chunk)

            if combined_length <= self.chunk_size:
                # Merge chunks
                current_chunk += "\n\n" + next_chunk
            else:
                # Add current chunk and start new one
                merged_chunks.append(current_chunk)
                current_chunk = next_chunk

        merged_chunks.append(current_chunk)
        return merged_chunks

    def recursive_split_with_metadata(self, text, separators=None):
        """Recursive split with detailed metadata"""
        separators = separators or self.separators
        chunks = []

        def _recursive_split_with_context(text_chunk, parent_separator=""):
            nonlocal chunks

            for separator in separators:
                if separator == "":
                    sub_chunks = self._split_by_characters(text_chunk)
                    for i, chunk in enumerate(sub_chunks):
                        chunks.append({
                            "text": chunk,
                            "separator": "character",
                            "parent_separator": parent_separator,
                            "level": len(separators) - separators.index(separator),
                            "chunk_index": len(chunks),
                            "size": self.length_function(chunk)
                        })
                    return

                splits = text_chunk.split(separator)
                splits = [split for split in splits if split.strip()]

                if len(splits) > 1:
                    for i, split in enumerate(splits):
                        if self.length_function(split) <= self.chunk_size:
                            chunks.append({
                                "text": split,
                                "separator": separator,
                                "parent_separator": parent_separator,
                                "level": len(separators) - separators.index(separator),
                                "chunk_index": len(chunks),
                                "size": self.length_function(split)
                            })
                        else:
                            # Recursively split this piece
                            _recursive_split_with_context(split, separator)
                    return

            # No separator worked
            sub_chunks = self._split_by_characters(text_chunk)
            for i, chunk in enumerate(sub_chunks):
                chunks.append({
                    "text": chunk,
"separator": "character_fallback", "parent_separator": parent_separator, "level": len(separators), "chunk_index": len(chunks), "size": self.length_function(chunk) }) _recursive_split_with_context(text) return chunks def markdown_aware_recursive_split(self, text): """Recursive splitting optimized for Markdown documents""" markdown_separators = [ "\n# ", # H1 headers "\n## ", # H2 headers "\n### ", # H3 headers "\n#### ", # H4 headers "\n##### ", # H5 headers "\n###### ", # H6 headers "\n\n", # Paragraph breaks "\n", # Line breaks " ", # Spaces "" # Characters ] chunks = [] def _split_markdown(text_chunk, separator_idx=0): if separator_idx >= len(markdown_separators): return self._split_by_characters(text_chunk) separator = markdown_separators[separator_idx] if separator.startswith("\n#"): # Markdown headers pattern = re.escape(separator) splits = re.split(pattern, text_chunk) if len(splits) > 1: # Re-add separator to splits (except first) for i in range(1, len(splits)): splits[i] = separator + splits[i] result_chunks = [] for split in splits: if self.length_function(split) <= self.chunk_size: result_chunks.append(split) else: # Try next level separator sub_chunks = _split_markdown(split, separator_idx + 1) result_chunks.extend(sub_chunks) return result_chunks else: # Regular separators splits = text_chunk.split(separator) splits = [split for split in splits if split.strip()] if len(splits) > 1: result_chunks = [] for split in splits: if self.length_function(split) <= self.chunk_size: result_chunks.append(split) else: # Try next level separator sub_chunks = _split_markdown(split, separator_idx + 1) result_chunks.extend(sub_chunks) return result_chunks # Try next separator return _split_markdown(text_chunk, separator_idx + 1) raw_chunks = _split_markdown(text) # Add metadata for i, chunk in enumerate(raw_chunks): chunks.append({ "text": chunk, "chunk_index": i, "size": self.length_function(chunk), "format": "markdown", "contains_header": bool(re.search(r'^#+\s', chunk, re.MULTILINE)), "contains_code": bool(re.search(r'```', chunk)), "contains_list": bool(re.search(r'^\s*[-*+]\s', chunk, re.MULTILINE)) }) return chunks ``` ## 7-11. Additional Advanced Strategies ### 7. 
## 7-11. Additional Advanced Strategies

### 7. Context-Enriched Chunking

```python
class ContextEnrichedChunker:
    def __init__(self, base_chunker, context_generator=None):
        self.base_chunker = base_chunker
        self.context_generator = context_generator

    def enrich_chunks(self, text, query_context=None):
        """Add contextual information to chunks"""
        # Assumes the base chunker exposes a generic .chunk(text) method
        base_chunks = self.base_chunker.chunk(text)
        enriched_chunks = []

        for i, chunk in enumerate(base_chunks):
            # Generate context for this chunk
            context = self._generate_context(chunk, text, i, query_context)

            enriched_chunk = {
                "original_text": chunk,
                "context": context,
                "enriched_text": f"Context: {context}\n\nContent: {chunk}",
                "chunk_index": i,
                "method": "context_enriched"
            }
            enriched_chunks.append(enriched_chunk)

        return enriched_chunks

    def _generate_context(self, chunk, full_text, chunk_index, query_context):
        """Generate contextual information for a chunk"""
        # Simple context generation: take a window of text around the chunk
        chunk_start = full_text.find(chunk)
        chunk_end = chunk_start + len(chunk)

        # Get preceding and following context
        pre_context = full_text[max(0, chunk_start - 200):chunk_start]
        post_context = full_text[chunk_end:chunk_end + 200]

        context_parts = []
        if pre_context.strip():
            context_parts.append(f"Preceding: {pre_context.strip()}")
        if post_context.strip():
            context_parts.append(f"Following: {post_context.strip()}")

        return " | ".join(context_parts)
```

### 8. Modality-Specific Chunking

```python
import re

class ModalitySpecificChunker:
    def __init__(self):
        # CodeChunker, TableChunker, and ImageChunker are assumed to be defined
        # elsewhere and to expose a .chunk(content) method (see the sketch below).
        self.chunkers = {
            "text": RecursiveChunker(),
            "code": CodeChunker(),
            "table": TableChunker(),
            "image": ImageChunker()
        }

    def chunk_mixed_content(self, document):
        """Chunk document with multiple content types"""
        chunks = []

        # Detect content types
        sections = self._detect_content_types(document)

        for section in sections:
            content_type = section["type"]
            content = section["content"]

            if content_type in self.chunkers:
                section_chunks = self.chunkers[content_type].chunk(content)

                for chunk in section_chunks:
                    chunks.append({
                        "content": chunk,
                        "type": content_type,
                        "metadata": section.get("metadata", {}),
                        "method": f"modality_specific_{content_type}"
                    })

        return chunks

    def _detect_content_types(self, document):
        """Detect different content types in document"""
        sections = []

        # Simple detection logic
        if "```" in document:
            # Code blocks detected
            code_blocks = re.findall(r'```(\w+)?\n(.*?)\n```', document, re.DOTALL)
            for lang, code in code_blocks:
                sections.append({
                    "type": "code",
                    "content": code,
                    "metadata": {"language": lang}
                })

        if "|" in document and "\n" in document:
            # Potential table detected
            sections.append({
                "type": "table",
                "content": document,  # Simplified
                "metadata": {}
            })

        # Default to text
        sections.append({
            "type": "text",
            "content": document,
            "metadata": {}
        })

        return sections
```

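The `ModalitySpecificChunker` above references `CodeChunker`, `TableChunker`, and `ImageChunker` classes that are not defined in this document. A minimal placeholder sketch (hypothetical, just enough to make the example runnable) might look like:

```python
# Hypothetical placeholders: each modality-specific chunker only needs a .chunk(content) method.
class CodeChunker:
    def chunk(self, content):
        # Naive approach: split source code on blank lines between top-level blocks.
        return [block for block in content.split("\n\n") if block.strip()]

class TableChunker:
    def chunk(self, content):
        # Naive approach: keep each pipe-delimited table row as its own chunk.
        return [line for line in content.split("\n") if "|" in line]

class ImageChunker:
    def chunk(self, content):
        # Images (or image references) are not split further.
        return [content]
```
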
### 9. Agentic Chunking

```python
class AgenticChunker:
    def __init__(self, chunking_agents):
        # Each agent is expected to expose .can_handle(), .chunk(), .name and .reasoning
        self.agents = chunking_agents

    def adaptive_chunking(self, text, requirements):
        """Use agents to determine optimal chunking strategy"""
        # Analyze text characteristics
        text_analysis = self._analyze_text(text)

        # Select appropriate agent based on requirements and text
        selected_agent = self._select_agent(text_analysis, requirements)

        # Use selected agent for chunking
        chunks = selected_agent.chunk(text, requirements)

        return {
            "chunks": chunks,
            "selected_agent": selected_agent.name,
            "reasoning": selected_agent.reasoning,
            "text_analysis": text_analysis
        }

    def _analyze_text(self, text):
        """Analyze text characteristics"""
        # The analyzer helpers below are assumed to be implemented elsewhere
        return {
            "length": len(text),
            "complexity": self._calculate_complexity(text),
            "structure": self._detect_structure(text),
            "content_type": self._detect_content_type(text)
        }

    def _select_agent(self, analysis, requirements):
        """Select best chunking agent"""
        for agent in self.agents:
            if agent.can_handle(analysis, requirements):
                return agent

        # Fallback to first agent
        return self.agents[0]
```

### 10. Subdocument Chunking

```python
import re

class SubdocumentChunker:
    def __init__(self, max_size=5000):
        self.max_size = max_size

    def chunk_by_logical_sections(self, document):
        """Chunk document by logical sections"""
        sections = self._identify_logical_sections(document)
        chunks = []

        for section in sections:
            if len(section["content"]) <= self.max_size:
                chunks.append({
                    "content": section["content"],
                    "title": section["title"],
                    "level": section["level"],
                    "method": "subdocument_section"
                })
            else:
                # Further split large sections (see the sketch below for one option)
                sub_chunks = self._split_large_section(section)
                chunks.extend(sub_chunks)

        return chunks

    def _identify_logical_sections(self, document):
        """Identify logical sections in document"""
        sections = []

        # Simple heading detection
        heading_pattern = r'^(#{1,6})\s+(.+)$'
        lines = document.split('\n')

        current_section = {"title": "Introduction", "content": "", "level": 0}

        for line in lines:
            match = re.match(heading_pattern, line)
            if match:
                # Save current section
                if current_section["content"].strip():
                    sections.append(current_section)

                # Start new section
                level = len(match.group(1))
                title = match.group(2)
                current_section = {
                    "title": title,
                    "content": "",
                    "level": level
                }
            else:
                current_section["content"] += line + "\n"

        # Add final section
        if current_section["content"].strip():
            sections.append(current_section)

        return sections
```

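The `_split_large_section` helper referenced above is not defined in this document. One possible sketch (an assumption, not the original implementation), intended to be added as a method on `SubdocumentChunker`, simply falls back to fixed-size splitting within the oversized section:

```python
    # Hypothetical helper: split an oversized section into fixed-size sub-chunks.
    def _split_large_section(self, section):
        content = section["content"]
        sub_chunks = []
        for start in range(0, len(content), self.max_size):
            sub_chunks.append({
                "content": content[start:start + self.max_size],
                "title": section["title"],
                "level": section["level"],
                "method": "subdocument_split"
            })
        return sub_chunks
```
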
### 11. Hybrid Chunking

```python
class HybridChunker:
    def __init__(self, strategies, weights=None):
        # Each strategy is expected to expose a .chunk(text) method and a .name attribute
        self.strategies = strategies
        self.weights = weights or [1.0 / len(strategies)] * len(strategies)

    def hybrid_chunk(self, text, evaluation_criteria=None):
        """Combine multiple chunking strategies"""
        all_chunks = []

        # Apply all strategies
        for i, strategy in enumerate(self.strategies):
            strategy_chunks = strategy.chunk(text)

            for chunk in strategy_chunks:
                all_chunks.append({
                    "content": chunk,
                    "strategy": strategy.name,
                    "strategy_weight": self.weights[i],
                    "method": "hybrid"
                })

        # Evaluate and select best chunks
        if evaluation_criteria:
            evaluated_chunks = self._evaluate_chunks(all_chunks, evaluation_criteria)
        else:
            evaluated_chunks = all_chunks

        # Merge overlapping chunks from different strategies
        merged_chunks = self._merge_overlapping_chunks(evaluated_chunks)

        return merged_chunks

    def _evaluate_chunks(self, chunks, criteria):
        """Evaluate chunks based on criteria"""
        for chunk in chunks:
            score = 0.0

            for criterion, weight in criteria.items():
                # _evaluate_criterion (scoring a single criterion) is assumed to be implemented elsewhere
                criterion_score = self._evaluate_criterion(chunk, criterion)
                score += criterion_score * weight

            chunk["evaluation_score"] = score

        # Sort by evaluation score
        chunks.sort(key=lambda x: x["evaluation_score"], reverse=True)
        return chunks

    def _merge_overlapping_chunks(self, chunks):
        """Merge chunks that overlap significantly"""
        # Simple implementation - could be more sophisticated
        merged = []
        used_indices = set()

        for i, chunk1 in enumerate(chunks):
            if i in used_indices:
                continue

            best_chunk = chunk1.copy()

            for j, chunk2 in enumerate(chunks[i+1:], i+1):
                if j in used_indices:
                    continue

                # Check overlap
                overlap = self._calculate_overlap(chunk1["content"], chunk2["content"])
                if overlap > 0.7:  # High overlap
                    # Merge chunks by keeping the longer text
                    best_chunk["content"] = max(
                        chunk1["content"], chunk2["content"], key=len
                    )
                    best_chunk["merged_strategies"] = [
                        chunk1["strategy"], chunk2["strategy"]
                    ]
                    used_indices.add(j)

            merged.append(best_chunk)
            used_indices.add(i)

        return merged

    def _calculate_overlap(self, text1, text2):
        """Calculate text overlap ratio"""
        words1 = set(text1.lower().split())
        words2 = set(text2.lower().split())

        intersection = words1 & words2
        union = words1 | words2

        return len(intersection) / len(union) if union else 0
```

## Usage Examples

### Basic Usage

```python
# Initialize different chunkers
fixed_chunker = FixedLengthChunker(chunk_size=1000, chunk_overlap=200)
semantic_chunker = SemanticChunker(similarity_threshold=0.8)
# Note: HybridChunker expects each strategy to expose .chunk() and .name (see Section 11)
hybrid_chunker = HybridChunker([fixed_chunker, semantic_chunker])

# Apply chunking
text = "Your long document text here..."
fixed_chunks = fixed_chunker.chunk_optimized(text, strategy="balanced")
semantic_chunks = semantic_chunker.semantic_chunk_sentences(text)
hybrid_chunks = hybrid_chunker.hybrid_chunk(text)

print(f"Fixed chunks: {len(fixed_chunks)}")
print(f"Semantic chunks: {len(semantic_chunks)}")
print(f"Hybrid chunks: {len(hybrid_chunks)}")
```

### Advanced Usage with Evaluation

```python
# Create evaluation criteria
evaluation_criteria = {
    "coherence": 0.4,
    "size_appropriateness": 0.3,
    "content_completeness": 0.3
}

# Apply hybrid chunking with evaluation
results = hybrid_chunker.hybrid_chunk(text, evaluation_criteria)

# Analyze results
for chunk in results[:5]:
    print(f"Strategy: {chunk['strategy']}")
    print(f"Score: {chunk.get('evaluation_score', 'N/A')}")
    print(f"Content preview: {chunk['content'][:100]}...")
    print("-" * 50)
```

These 11 advanced chunking strategies provide comprehensive coverage of different approaches for various document types and use cases, from simple fixed-size chunking to sophisticated hybrid methods that combine multiple strategies.