
# Detailed Chunking Strategies

This document provides comprehensive implementation details for all chunking strategies mentioned in the main skill.

## Level 1: Fixed-Size Chunking

### Implementation

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter

class FixedSizeChunker:
    def __init__(self, chunk_size=512, chunk_overlap=50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

    def chunk(self, documents):
        return self.splitter.split_documents(documents)
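```

Note that `split_documents` expects LangChain `Document` objects, so raw strings need wrapping first. A minimal usage sketch (the input file name is hypothetical):

```python
from langchain.docstore.document import Document

raw_text = open("handbook.txt").read()  # hypothetical input file
docs = [Document(page_content=raw_text)]

chunker = FixedSizeChunker(chunk_size=512, chunk_overlap=50)
chunks = chunker.chunk(docs)
print(f"Produced {len(chunks)} chunks")
```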

### Parameter Recommendations

| Use Case | Chunk Size (tokens) | Overlap (tokens) | Rationale |
|---|---|---|---|
| Factoid queries | 256 | 25 | Small chunks for precise answers |
| General Q&A | 512 | 50 | Balanced approach for most cases |
| Analytical queries | 1024 | 100 | Larger context for complex analysis |
| Code documentation | 300 | 30 | Preserve code context while maintaining focus |

### Best Practices

- Start with 512 tokens and 10-20% overlap
- Keep chunks within the embedding model's context window
- Use overlap when answers are likely to span chunk boundaries
- Check whether the splitter counts characters or tokens: `length_function=len` measures characters, while most model limits are in tokens (see the sketch below)
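
If the model limit is in tokens, a token-based length function keeps chunk sizes honest. A minimal sketch using `tiktoken`; the `cl100k_base` encoding is an assumption, so pick the encoding that matches your embedding model:

```python
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

encoding = tiktoken.get_encoding("cl100k_base")  # assumed encoding

def token_len(text: str) -> int:
    # Measure length in tokens instead of characters
    return len(encoding.encode(text))

splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,      # now interpreted as 512 tokens
    chunk_overlap=50,
    length_function=token_len,
)
```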

## Level 2: Recursive Character Chunking

### Implementation

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter

class RecursiveChunker:
    def __init__(self, chunk_size=512, separators=None):
        self.chunk_size = chunk_size
        self.separators = separators or ["\n\n", "\n", " ", ""]
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=0,
            length_function=len,
            separators=self.separators
        )

    def chunk(self, text):
        return self.splitter.create_documents([text])

# Document-specific configurations
def get_chunker_for_document_type(doc_type):
    configurations = {
        "markdown": ["\n## ", "\n### ", "\n\n", "\n", " ", ""],
        "html": ["</div>", "</p>", "\n\n", "\n", " ", ""],
        "code": ["\n\n", "\n", " ", ""],
        "plain": ["\n\n", "\n", " ", ""]
    }
    return RecursiveChunker(separators=configurations.get(doc_type, ["\n\n", "\n", " ", ""]))
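```

Usage is then one call per document type (the input file is hypothetical):

```python
chunker = get_chunker_for_document_type("markdown")
docs = chunker.chunk(open("README.md").read())  # hypothetical input file
print(f"{len(docs)} chunks; first starts with {docs[0].page_content[:60]!r}")
```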

### Customization Guidelines

- **Markdown**: Use headings as primary separators
- **HTML**: Use block-level tags as separators
- **Code**: Preserve function and class boundaries
- **Academic papers**: Prioritize paragraph and section breaks

## Level 3: Structure-Aware Chunking

### Markdown Documents

```python
import markdown
from bs4 import BeautifulSoup

class MarkdownChunker:
    def __init__(self, max_chunk_size=512):
        self.max_chunk_size = max_chunk_size

    def chunk(self, markdown_text):
        # Enable the built-in extensions so <pre> and <table> elements exist
        html = markdown.markdown(markdown_text, extensions=['tables', 'fenced_code'])
        soup = BeautifulSoup(html, 'html.parser')

        chunks = []
        current_chunk = ""
        current_heading = "Introduction"

        for element in soup.find_all(['h1', 'h2', 'h3', 'p', 'pre', 'table']):
            if element.name.startswith('h'):
                # A new heading closes the previous chunk
                if current_chunk.strip():
                    chunks.append({
                        "content": current_chunk.strip(),
                        "heading": current_heading
                    })
                current_heading = element.get_text().strip()
                current_chunk = f"{element}\n"
            elif element.name in ['pre', 'table']:
                # Preserve code blocks and tables intact
                if len(current_chunk) + len(str(element)) > self.max_chunk_size:
                    if current_chunk.strip():
                        chunks.append({
                            "content": current_chunk.strip(),
                            "heading": current_heading
                        })
                    current_chunk = f"{element}\n"
                else:
                    current_chunk += f"{element}\n"
            else:
                # Enforce the size limit for regular paragraphs as well
                if len(current_chunk) + len(str(element)) > self.max_chunk_size and current_chunk.strip():
                    chunks.append({
                        "content": current_chunk.strip(),
                        "heading": current_heading
                    })
                    current_chunk = ""
                current_chunk += str(element)

        if current_chunk.strip():
            chunks.append({
                "content": current_chunk.strip(),
                "heading": current_heading
            })

        return chunks
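```

A quick check with a small document:

```python
sample = """# Setup

Install the package with pip.

## Usage

Run the CLI against a config file.
"""

for c in MarkdownChunker(max_chunk_size=256).chunk(sample):
    print(c["heading"], "->", c["content"][:40])
```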

### Code Documents

```python
import ast
import re

class CodeChunker:
    def __init__(self, language='python'):
        self.language = language

    def chunk_python(self, code):
        tree = ast.parse(code)
        lines = code.split('\n')
        chunks = []

        # Iterate top-level definitions only, so methods are not emitted
        # twice (once inside their class and once on their own)
        for node in tree.body:
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
                start_line = node.lineno - 1
                # end_lineno is available on Python 3.8+
                end_line = node.end_lineno if hasattr(node, 'end_lineno') else start_line + 10
                chunks.append('\n'.join(lines[start_line:end_line]))

        return chunks

    def chunk_javascript(self, code):
        # Use regex for languages without AST parsers.
        # Naive: [^}]* stops at the first closing brace, so bodies with
        # nested blocks are cut short; a real parser is preferable.
        function_pattern = r'(function\s+\w+\s*\([^)]*\)\s*\{[^}]*\})'
        class_pattern = r'(class\s+\w+\s*\{[^}]*\})'

        patterns = [function_pattern, class_pattern]
        chunks = []

        for pattern in patterns:
            matches = re.finditer(pattern, code, re.MULTILINE | re.DOTALL)
            for match in matches:
                chunks.append(match.group(1))

        return chunks

    def chunk(self, code):
        if self.language == 'python':
            return self.chunk_python(code)
        elif self.language == 'javascript':
            return self.chunk_javascript(code)
        else:
            # Fallback to line-based chunking
            return self.chunk_by_lines(code)

    def chunk_by_lines(self, code, max_lines=50):
        lines = code.split('\n')
        chunks = []

        for i in range(0, len(lines), max_lines):
            chunk = '\n'.join(lines[i:i+max_lines])
            chunks.append(chunk)

        return chunks
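```

For example, listing the definitions extracted from a Python source file (the file name is hypothetical):

```python
source = open("service.py").read()  # hypothetical input file
for chunk in CodeChunker(language='python').chunk(source):
    print(chunk.splitlines()[0])  # first line of each function/class
```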

### Tabular Data

```python
from io import StringIO

import pandas as pd

class TableChunker:
    def __init__(self, max_rows=100, summary_rows=5):
        self.max_rows = max_rows
        self.summary_rows = summary_rows

    def chunk(self, table_data):
        # Accept either a CSV string or a ready-made DataFrame
        if isinstance(table_data, str):
            df = pd.read_csv(StringIO(table_data))
        else:
            df = table_data

        chunks = []

        if len(df) <= self.max_rows:
            # Small table - keep intact
            chunks.append({
                "type": "full_table",
                "content": df.to_string(),
                "metadata": {
                    "rows": len(df),
                    "columns": len(df.columns)
                }
            })
        else:
            # Large table - create summary + chunks
            summary = df.head(self.summary_rows)
            chunks.append({
                "type": "table_summary",
                "content": f"Table Summary ({len(df)} rows, {len(df.columns)} columns):\n{summary.to_string()}",
                "metadata": {
                    "total_rows": len(df),
                    "summary_rows": self.summary_rows,
                    "columns": list(df.columns)
                }
            })

            # Chunk the remaining data
            for i in range(self.summary_rows, len(df), self.max_rows):
                chunk_df = df.iloc[i:i+self.max_rows]
                chunks.append({
                    "type": "table_chunk",
                    "content": f"Rows {i+1}-{min(i+self.max_rows, len(df))}:\n{chunk_df.to_string()}",
                    "metadata": {
                        "start_row": i + 1,
                        "end_row": min(i + self.max_rows, len(df)),
                        "columns": list(df.columns)
                    }
                })

        return chunks
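```

With an inline CSV string:

```python
csv_data = "name,score\nada,10\ngrace,9\nalan,8\n"
for part in TableChunker(max_rows=2, summary_rows=1).chunk(csv_data):
    print(part["type"], part["metadata"])
```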

## Level 4: Semantic Chunking

### Implementation

```python
import re

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

class SemanticChunker:
    def __init__(self, model_name="all-MiniLM-L6-v2", similarity_threshold=0.8, buffer_size=3):
        self.model = SentenceTransformer(model_name)
        self.similarity_threshold = similarity_threshold
        # Texts with at most buffer_size sentences are returned unsplit
        self.buffer_size = buffer_size

    def split_into_sentences(self, text):
        # Simple sentence splitting - can be enhanced with nltk/spacy
        sentences = re.split(r'[.!?]+', text)
        return [s.strip() for s in sentences if s.strip()]

    def chunk(self, text):
        sentences = self.split_into_sentences(text)

        if len(sentences) <= self.buffer_size:
            return [text]

        # Create embeddings
        embeddings = self.model.encode(sentences)

        chunks = []
        current_chunk_sentences = []

        for i in range(len(sentences)):
            current_chunk_sentences.append(sentences[i])

            # A drop in similarity between adjacent sentences marks a boundary
            if i < len(sentences) - 1:
                similarity = cosine_similarity(
                    [embeddings[i]],
                    [embeddings[i + 1]]
                )[0][0]

                if similarity < self.similarity_threshold and len(current_chunk_sentences) >= 2:
                    chunks.append(' '.join(current_chunk_sentences))
                    current_chunk_sentences = []

        # Add remaining sentences
        if current_chunk_sentences:
            chunks.append(' '.join(current_chunk_sentences))

        return chunks
```

### Parameter Tuning

| Parameter | Range | Effect |
|---|---|---|
| `similarity_threshold` | 0.5-0.9 | Higher values create more chunks |
| `buffer_size` | 1-10 | Larger buffers provide more context |
| `model_name` | Various | Different models for different domains |

### Optimization Tips

- Use domain-specific models for specialized content
- Adjust the threshold based on content complexity
- Cache embeddings for repeated processing (see the sketch below)
- Consider batch processing for large documents
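
A minimal embedding cache for the third tip; the hashing scheme is an assumption, and any stable key would work:

```python
import hashlib

class CachedEncoder:
    """Wraps a SentenceTransformer so repeated sentences are encoded once."""

    def __init__(self, model):
        self.model = model
        self._cache = {}

    def encode(self, sentences):
        # Encode only the sentences we have not seen before
        missing = [s for s in sentences if self._key(s) not in self._cache]
        if missing:
            for sent, emb in zip(missing, self.model.encode(missing)):
                self._cache[self._key(sent)] = emb
        return [self._cache[self._key(s)] for s in sentences]

    @staticmethod
    def _key(sentence):
        return hashlib.sha256(sentence.encode("utf-8")).hexdigest()
```

Since it exposes the same `encode` interface, dropping this in for `self.model` in `SemanticChunker` requires no other changes.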

## Level 5: Advanced Contextual Methods

### Late Chunking

```python
import torch
from transformers import AutoTokenizer, AutoModel

class LateChunker:
    # NOTE: DialoGPT serves only as a readily available stand-in here;
    # late chunking is normally paired with a long-context embedding model
    def __init__(self, model_name="microsoft/DialoGPT-medium"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def chunk(self, text, chunk_size=512):
        # Tokenize the entire document; with truncation off, inputs longer
        # than the model's maximum sequence length will raise an error
        tokens = self.tokenizer(text, return_tensors="pt", truncation=False)

        # Get token-level embeddings for the whole document in one pass
        with torch.no_grad():
            outputs = self.model(**tokens, output_hidden_states=True)
            token_embeddings = outputs.last_hidden_state[0]

        # Mean-pool token embeddings per window to form chunk embeddings
        chunks = []
        for i in range(0, len(token_embeddings), chunk_size):
            chunk_tokens = token_embeddings[i:i+chunk_size]
            chunk_embedding = torch.mean(chunk_tokens, dim=0)
            chunks.append({
                "content": self.tokenizer.decode(tokens["input_ids"][0][i:i+chunk_size]),
                "embedding": chunk_embedding.numpy()
            })

        return chunks
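```

The point of pooling after the full-document forward pass is that every token vector was computed with attention over the entire document, so each chunk embedding carries document-level context that independently embedded chunks would lose. This only pays off when the model's maximum sequence length actually covers the document.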

### Contextual Retrieval

```python
import openai

class ContextualChunker:
    def __init__(self, api_key):
        self.client = openai.OpenAI(api_key=api_key)

    def generate_context(self, chunk, full_document):
        # Truncate the document to keep the prompt small
        prompt = f"""
        Given the following document and a chunk from it, provide a brief context
        that helps understand the chunk's meaning within the full document.

        Document:
        {full_document[:2000]}...

        Chunk:
        {chunk}

        Context (max 50 words):
        """

        response = self.client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=100,
            temperature=0
        )

        return response.choices[0].message.content.strip()

    def chunk_with_context(self, text, base_chunker):
        # First create base chunks; the base chunker must return objects
        # with a page_content attribute (e.g., LangChain Documents)
        base_chunks = base_chunker.chunk(text)

        # Then prepend a generated context to each chunk
        contextualized_chunks = []
        for chunk in base_chunks:
            context = self.generate_context(chunk.page_content, text)
            contextualized_content = f"Context: {context}\n\nContent: {chunk.page_content}"

            contextualized_chunks.append({
                "content": contextualized_content,
                "original_content": chunk.page_content,
                "context": context
            })

        return contextualized_chunks
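```

Pairing it with the Level 2 chunker, whose `chunk` method already returns `Document` objects with `page_content` (the API key and file name are placeholders):

```python
base = RecursiveChunker(chunk_size=512)
contextual = ContextualChunker(api_key="sk-...")  # placeholder key

document_text = open("policy.md").read()  # hypothetical input file
enriched = contextual.chunk_with_context(document_text, base)
print(enriched[0]["context"])
```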

## Performance Considerations

### Computational Cost Analysis

| Strategy | Time Complexity | Space Complexity | Relative Cost |
|---|---|---|---|
| Fixed-Size | O(n) | O(n) | Low |
| Recursive | O(n) | O(n) | Low |
| Structure-Aware | O(n log n) | O(n) | Medium |
| Semantic | O(n²) | O(n²) | High |
| Late Chunking | O(n) | O(n) | Very High |
| Contextual | O(n²) | O(n²) | Very High |

### Optimization Strategies

1. **Parallel Processing**: Process chunks concurrently when possible (see the sketch below)
2. **Caching**: Store embeddings and intermediate results
3. **Batch Operations**: Group similar operations together
4. **Progressive Loading**: Process large documents in streaming fashion
5. **Model Selection**: Choose appropriate models for task complexity
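
A sketch of the first strategy using only the standard library; the chunker and file list are hypothetical:

```python
from concurrent.futures import ProcessPoolExecutor

def chunk_file(path):
    # Each worker builds its own chunker and processes one file
    chunker = RecursiveChunker(chunk_size=512)
    return path, chunker.chunk(open(path).read())

paths = ["a.md", "b.md", "c.md"]  # hypothetical inputs
with ProcessPoolExecutor() as pool:
    for path, docs in pool.map(chunk_file, paths):
        print(path, len(docs))
```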