# Detailed Chunking Strategies
This document provides comprehensive implementation details for all chunking strategies mentioned in the main skill.
## Level 1: Fixed-Size Chunking

### Implementation
```python
from langchain.text_splitter import RecursiveCharacterTextSplitter

class FixedSizeChunker:
    def __init__(self, chunk_size=512, chunk_overlap=50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

    def chunk(self, documents):
        return self.splitter.split_documents(documents)
```
### Parameter Recommendations
| Use Case | Chunk Size (tokens) | Overlap (tokens) | Rationale |
|---|---|---|---|
| Factoid Queries | 256 | 25 | Small chunks for precise answers |
| General Q&A | 512 | 50 | Balanced approach for most cases |
| Analytical Queries | 1024 | 100 | Larger context for complex analysis |
| Code Documentation | 300 | 30 | Preserve code context while maintaining focus |
### Best Practices
- Start with 512 tokens and 10-20% overlap
- Adjust based on embedding model context window
- Use overlap for queries where context might span boundaries
- Note that `length_function=len` counts characters; when your chunk budget is specified in tokens, swap in a tokenizer-based length function (see the sketch below)
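
For example, a token-aware length function keeps `chunk_size` aligned with the embedding model's token budget rather than character counts. A minimal sketch, assuming the `tiktoken` package is available; the `cl100k_base` encoding is an illustrative choice, not a requirement:

```python
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

encoding = tiktoken.get_encoding("cl100k_base")

def token_length(text: str) -> int:
    # Count tokens instead of characters so chunk_size becomes a token budget
    return len(encoding.encode(text))

token_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,          # now interpreted as roughly 512 tokens
    chunk_overlap=50,
    length_function=token_length,
    separators=["\n\n", "\n", " ", ""]
)
```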
## Level 2: Recursive Character Chunking

### Implementation
```python
from langchain.text_splitter import RecursiveCharacterTextSplitter

class RecursiveChunker:
    def __init__(self, chunk_size=512, separators=None):
        self.chunk_size = chunk_size
        self.separators = separators or ["\n\n", "\n", " ", ""]
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=0,
            length_function=len,
            separators=self.separators
        )

    def chunk(self, text):
        return self.splitter.create_documents([text])

# Document-specific configurations
def get_chunker_for_document_type(doc_type):
    configurations = {
        "markdown": ["\n## ", "\n### ", "\n\n", "\n", " ", ""],
        "html": ["</div>", "</p>", "\n\n", "\n", " ", ""],
        "code": ["\n\n", "\n", " ", ""],
        "plain": ["\n\n", "\n", " ", ""]
    }
    return RecursiveChunker(separators=configurations.get(doc_type, ["\n\n", "\n", " ", ""]))
```
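
Illustrative usage of the helper above; the sample text is made up and short, so it may come back as a single chunk, while longer documents split at the heading separators first:

```python
md_chunker = get_chunker_for_document_type("markdown")
docs = md_chunker.chunk("## Setup\n\nInstall the package.\n\n## Usage\n\nImport it and call chunk().")
for doc in docs:
    print(repr(doc.page_content))
```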
### Customization Guidelines
- Markdown: Use headings as primary separators
- HTML: Use block-level tags as separators
- Code: Preserve function and class boundaries
- Academic papers: Prioritize paragraph and section breaks
## Level 3: Structure-Aware Chunking

### Markdown Documents
```python
import markdown
from bs4 import BeautifulSoup

class MarkdownChunker:
    def __init__(self, max_chunk_size=512):
        self.max_chunk_size = max_chunk_size

    def chunk(self, markdown_text):
        html = markdown.markdown(markdown_text)
        soup = BeautifulSoup(html, 'html.parser')

        chunks = []
        current_chunk = ""
        current_heading = "Introduction"

        for element in soup.find_all(['h1', 'h2', 'h3', 'p', 'pre', 'table']):
            if element.name.startswith('h'):
                if current_chunk.strip():
                    chunks.append({
                        "content": current_chunk.strip(),
                        "heading": current_heading
                    })
                current_heading = element.get_text().strip()
                current_chunk = f"{element}\n"
            elif element.name in ['pre', 'table']:
                # Preserve code blocks and tables intact
                if len(current_chunk) + len(str(element)) > self.max_chunk_size:
                    if current_chunk.strip():
                        chunks.append({
                            "content": current_chunk.strip(),
                            "heading": current_heading
                        })
                    current_chunk = f"{element}\n"
                else:
                    current_chunk += f"{element}\n"
            else:
                current_chunk += str(element)

        if current_chunk.strip():
            chunks.append({
                "content": current_chunk.strip(),
                "heading": current_heading
            })
        return chunks
```
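
A quick usage sketch showing the chunk dictionaries this produces; the markdown string is illustrative:

```python
chunker = MarkdownChunker(max_chunk_size=512)
chunks = chunker.chunk("# Guide\n\nIntro paragraph.\n\n## Install\n\nRun the installer and verify the version.")
for c in chunks:
    print(c["heading"], "->", c["content"][:60])
```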
### Code Documents
```python
import ast
import re

class CodeChunker:
    def __init__(self, language='python'):
        self.language = language

    def chunk_python(self, code):
        tree = ast.parse(code)
        chunks = []
        for node in ast.walk(tree):
            if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
                start_line = node.lineno - 1
                end_line = node.end_lineno if hasattr(node, 'end_lineno') else start_line + 10
                lines = code.split('\n')
                chunk_lines = lines[start_line:end_line]
                chunks.append('\n'.join(chunk_lines))
        return chunks

    def chunk_javascript(self, code):
        # Use regex for languages without AST parsers
        function_pattern = r'(function\s+\w+\s*\([^)]*\)\s*\{[^}]*\})'
        class_pattern = r'(class\s+\w+\s*\{[^}]*\})'

        patterns = [function_pattern, class_pattern]
        chunks = []
        for pattern in patterns:
            matches = re.finditer(pattern, code, re.MULTILINE | re.DOTALL)
            for match in matches:
                chunks.append(match.group(1))
        return chunks

    def chunk(self, code):
        if self.language == 'python':
            return self.chunk_python(code)
        elif self.language == 'javascript':
            return self.chunk_javascript(code)
        else:
            # Fallback to line-based chunking
            return self.chunk_by_lines(code)

    def chunk_by_lines(self, code, max_lines=50):
        lines = code.split('\n')
        chunks = []
        for i in range(0, len(lines), max_lines):
            chunk = '\n'.join(lines[i:i+max_lines])
            chunks.append(chunk)
        return chunks
```
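
Example usage on a small Python snippet (the snippet is illustrative); note that nested definitions also appear as separate chunks because `ast.walk` visits them too:

```python
sample_code = (
    "def add(a, b):\n"
    "    return a + b\n"
    "\n"
    "class Greeter:\n"
    "    def hello(self):\n"
    "        return 'hi'\n"
)
chunker = CodeChunker(language="python")
for chunk in chunker.chunk(sample_code):
    print("---")
    print(chunk)
```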
### Tabular Data
```python
import pandas as pd
from io import StringIO

class TableChunker:
    def __init__(self, max_rows=100, summary_rows=5):
        self.max_rows = max_rows
        self.summary_rows = summary_rows

    def chunk(self, table_data):
        if isinstance(table_data, str):
            df = pd.read_csv(StringIO(table_data))
        else:
            df = table_data

        chunks = []
        if len(df) <= self.max_rows:
            # Small table - keep intact
            chunks.append({
                "type": "full_table",
                "content": df.to_string(),
                "metadata": {
                    "rows": len(df),
                    "columns": len(df.columns)
                }
            })
        else:
            # Large table - create summary + chunks
            summary = df.head(self.summary_rows)
            chunks.append({
                "type": "table_summary",
                "content": f"Table Summary ({len(df)} rows, {len(df.columns)} columns):\n{summary.to_string()}",
                "metadata": {
                    "total_rows": len(df),
                    "summary_rows": self.summary_rows,
                    "columns": list(df.columns)
                }
            })

            # Chunk the remaining data
            for i in range(self.summary_rows, len(df), self.max_rows):
                chunk_df = df.iloc[i:i+self.max_rows]
                chunks.append({
                    "type": "table_chunk",
                    "content": f"Rows {i+1}-{min(i+self.max_rows, len(df))}:\n{chunk_df.to_string()}",
                    "metadata": {
                        "start_row": i + 1,
                        "end_row": min(i + self.max_rows, len(df)),
                        "columns": list(df.columns)
                    }
                })
        return chunks
```
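
Example usage with a small in-memory DataFrame (the data is made up); a table this size stays intact as a single `full_table` chunk:

```python
import pandas as pd

df = pd.DataFrame({
    "city": ["Lyon", "Oslo", "Kyoto"],
    "population_millions": [0.5, 0.7, 1.5],
})
chunker = TableChunker(max_rows=100, summary_rows=5)
for chunk in chunker.chunk(df):
    print(chunk["type"], chunk["metadata"])
```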
## Level 4: Semantic Chunking

### Implementation
```python
import re

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

class SemanticChunker:
    def __init__(self, model_name="all-MiniLM-L6-v2", similarity_threshold=0.8, buffer_size=3):
        self.model = SentenceTransformer(model_name)
        self.similarity_threshold = similarity_threshold
        self.buffer_size = buffer_size

    def split_into_sentences(self, text):
        # Simple sentence splitting - can be enhanced with nltk/spacy
        sentences = re.split(r'[.!?]+', text)
        return [s.strip() for s in sentences if s.strip()]

    def chunk(self, text):
        sentences = self.split_into_sentences(text)
        if len(sentences) <= self.buffer_size:
            return [text]

        # Create embeddings
        embeddings = self.model.encode(sentences)

        chunks = []
        current_chunk_sentences = []
        for i in range(len(sentences)):
            current_chunk_sentences.append(sentences[i])

            # Check if we should create a boundary
            if i < len(sentences) - 1:
                similarity = cosine_similarity(
                    [embeddings[i]],
                    [embeddings[i + 1]]
                )[0][0]
                if similarity < self.similarity_threshold and len(current_chunk_sentences) >= 2:
                    chunks.append(' '.join(current_chunk_sentences))
                    current_chunk_sentences = []

        # Add remaining sentences
        if current_chunk_sentences:
            chunks.append(' '.join(current_chunk_sentences))
        return chunks
```
### Parameter Tuning
| Parameter | Range | Effect |
|---|---|---|
| similarity_threshold | 0.5-0.9 | Higher values create more chunks |
| buffer_size | 1-10 | Larger buffers provide more context |
| model_name | Various | Different models for different domains |
### Optimization Tips
- Use domain-specific models for specialized content
- Adjust threshold based on content complexity
- Cache embeddings for repeated processing (see the sketch after this list)
- Consider batch processing for large documents
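
A minimal sketch of the caching tip above, assuming an in-memory dict keyed by sentence text is sufficient (a persistent or hash-keyed cache would work the same way). Because `SemanticChunker` only calls `encode()`, a wrapper like this could stand in for `self.model`:

```python
import numpy as np
from sentence_transformers import SentenceTransformer

class CachedEncoder:
    """Wraps a SentenceTransformer and memoizes sentence embeddings."""
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)
        self.cache = {}

    def encode(self, sentences):
        missing = [s for s in sentences if s not in self.cache]
        if missing:
            # Batch-encode only the sentences not seen before
            for sentence, emb in zip(missing, self.model.encode(missing)):
                self.cache[sentence] = emb
        return np.array([self.cache[s] for s in sentences])
```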
## Level 5: Advanced Contextual Methods

### Late Chunking
```python
import torch
from transformers import AutoTokenizer, AutoModel

class LateChunker:
    def __init__(self, model_name="microsoft/DialoGPT-medium"):
        # Any Hugging Face encoder works here, but the whole document must fit
        # within the model's maximum sequence length; in practice late chunking
        # is paired with a long-context embedding model.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def chunk(self, text, chunk_size=512):
        # Tokenize entire document
        tokens = self.tokenizer(text, return_tensors="pt", truncation=False)

        # Get token-level embeddings
        with torch.no_grad():
            outputs = self.model(**tokens, output_hidden_states=True)
            token_embeddings = outputs.last_hidden_state[0]

        # Create chunk embeddings from token embeddings (mean pooling per window)
        chunks = []
        for i in range(0, len(token_embeddings), chunk_size):
            chunk_tokens = token_embeddings[i:i+chunk_size]
            chunk_embedding = torch.mean(chunk_tokens, dim=0)
            chunks.append({
                "content": self.tokenizer.decode(tokens["input_ids"][0][i:i+chunk_size]),
                "embedding": chunk_embedding.numpy()
            })
        return chunks
```
### Contextual Retrieval
```python
import openai

class ContextualChunker:
    def __init__(self, api_key):
        self.client = openai.OpenAI(api_key=api_key)

    def generate_context(self, chunk, full_document):
        prompt = f"""
        Given the following document and a chunk from it, provide a brief context
        that helps understand the chunk's meaning within the full document.

        Document:
        {full_document[:2000]}...

        Chunk:
        {chunk}

        Context (max 50 words):
        """
        response = self.client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=100,
            temperature=0
        )
        return response.choices[0].message.content.strip()

    def chunk_with_context(self, text, base_chunker):
        # First create base chunks; base_chunker is expected to return objects
        # with a .page_content attribute (e.g., LangChain Documents)
        base_chunks = base_chunker.chunk(text)

        # Then add context to each chunk
        contextualized_chunks = []
        for chunk in base_chunks:
            context = self.generate_context(chunk.page_content, text)
            contextualized_content = f"Context: {context}\n\nContent: {chunk.page_content}"
            contextualized_chunks.append({
                "content": contextualized_content,
                "original_content": chunk.page_content,
                "context": context
            })
        return contextualized_chunks
```
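
An illustrative composition with the Level 2 `RecursiveChunker`, whose chunks expose `.page_content`; the API key and file name are placeholders:

```python
base_chunker = RecursiveChunker(chunk_size=512)
contextual = ContextualChunker(api_key="YOUR_API_KEY")

with open("report.txt") as f:
    document_text = f.read()

enriched = contextual.chunk_with_context(document_text, base_chunker)
print(enriched[0]["context"])
print(enriched[0]["content"][:200])
```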
## Performance Considerations

### Computational Cost Analysis
| Strategy | Time Complexity | Space Complexity | Relative Cost |
|---|---|---|---|
| Fixed-Size | O(n) | O(n) | Low |
| Recursive | O(n) | O(n) | Low |
| Structure-Aware | O(n log n) | O(n) | Medium |
| Semantic | O(n²) | O(n²) | High |
| Late Chunking | O(n) | O(n) | Very High |
| Contextual | O(n²) | O(n²) | Very High |
### Optimization Strategies
- Parallel Processing: Process chunks concurrently when possible (see the sketch after this list)
- Caching: Store embeddings and intermediate results
- Batch Operations: Group similar operations together
- Progressive Loading: Process large documents in streaming fashion
- Model Selection: Choose appropriate models for task complexity
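
A minimal sketch of the parallel-processing idea using the standard library; `process_chunk` is a hypothetical stand-in for any per-chunk step such as embedding or context generation:

```python
from concurrent.futures import ThreadPoolExecutor

def process_chunk(chunk: str) -> dict:
    # Hypothetical per-chunk work (embedding, context generation, etc.)
    return {"content": chunk, "length": len(chunk)}

def process_chunks_concurrently(chunks, max_workers=8):
    # Threads suit I/O-bound work such as API calls; use ProcessPoolExecutor
    # for CPU-bound steps instead
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(process_chunk, chunks))
```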