48 KiB
48 KiB
Visualization and Evaluation Tools
This document covers tools and methodologies for visualizing chunking strategies and evaluating their effectiveness.
Overview of Visualization Tools
| Tool | Purpose | Complexity | Integration |
|---|---|---|---|
| ChunkViz | Visualize chunk boundaries and overlaps | Low | Standalone |
| Plotly | Interactive visualizations | Medium | Library |
| TensorBoard | ML experiment tracking | High | Framework |
| Streamlit | Web-based dashboards | Medium | Web App |
| D3.js | Custom visualizations | High | Web Library |
1. ChunkViz - Chunking Visualization Tool
Overview
ChunkViz is a specialized tool for visualizing how different chunking strategies behave with various parameters.
Installation and Setup
# Clone ChunkViz repository
git clone https://github.com/gkamradt/ChunkViz.git
cd ChunkViz
# Install dependencies
pip install -r requirements.txt
Basic Usage
from chunkviz import visualize_chunking
import matplotlib.pyplot as plt
# Sample text
sample_text = """
Natural language processing has evolved significantly over the past decade.
Modern transformer models have revolutionized how we approach text understanding.
These models use attention mechanisms to process input text efficiently.
The attention mechanism allows models to focus on relevant input parts.
Retrieval-Augmented Generation (RAG) combines LLMs with external knowledge.
This approach enables models to access current information beyond training data.
RAG systems typically have three main components: retriever, knowledge base, generator.
The retriever finds relevant documents based on user queries.
The generator uses retrieved documents to produce informed responses.
"""

# One strategy per subplot: three fixed-size variants and one semantic split.
strategies = [
    {"name": "Fixed Size 100", "chunk_size": 100, "overlap": 0},
    {"name": "Fixed Size 200", "chunk_size": 200, "overlap": 0},
    {"name": "Fixed Size 100 + 20% Overlap", "chunk_size": 100, "overlap": 20},
    {"name": "Semantic", "method": "semantic", "threshold": 0.7}
]

# 2x2 grid, flattened so axes can be paired with strategies in order.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.flatten()

# zip stops at the shorter sequence, so surplus strategies are never drawn.
for ax, strategy in zip(axes, strategies):
    visualize_chunking(
        text=sample_text,
        strategy=strategy,
        ax=ax,
        title=strategy["name"]
    )

plt.tight_layout()
plt.show()
Advanced ChunkViz Implementation
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
from typing import List, Dict, Tuple
import re
class AdvancedChunkViz:
    """Visualize how chunking strategies split a text.

    A strategy is a plain dict. ``{"method": "semantic"}`` selects the
    sentence-grouping simulation; every other dict is treated as fixed-size
    chunking controlled by ``chunk_size`` (characters) and ``overlap``
    (percentage of ``chunk_size``). Chunks are dicts with ``text``, ``start``
    and ``end`` keys, where ``start``/``end`` are offsets into the input text.
    """

    def __init__(self, figsize=(12, 8)):
        """Store the preferred figure size and a 12-color Set3 palette."""
        self.figsize = figsize
        self.colors = plt.cm.Set3(np.linspace(0, 1, 12))

    def visualize_multiple_strategies(self, text, strategies, save_path=None):
        """Draw one subplot per strategy in a grid of at most three columns.

        Args:
            text: The text to chunk.
            strategies: Non-empty sequence of strategy dicts.
            save_path: Optional path; when truthy the figure is saved there
                at 300 dpi before being shown.

        Raises:
            ValueError: If ``strategies`` is empty.
        """
        n_strategies = len(strategies)
        if n_strategies == 0:
            raise ValueError("strategies must contain at least one strategy")

        n_cols = min(3, n_strategies)
        n_rows = (n_strategies + n_cols - 1) // n_cols

        # squeeze=False always returns a 2-D array, so one flat, row-major
        # view works for every grid shape (1x1, 1xN and MxN alike).
        fig, axes = plt.subplots(
            n_rows, n_cols, figsize=(n_cols * 4, n_rows * 3), squeeze=False
        )
        flat_axes = axes.ravel()

        for ax, strategy in zip(flat_axes, strategies):
            self._visualize_single_strategy(text, strategy, ax)

        # Hide unused subplots
        for ax in flat_axes[n_strategies:]:
            ax.set_visible(False)

        plt.tight_layout()
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.show()

    def _visualize_single_strategy(self, text, strategy, ax):
        """Draw every chunk of one strategy as a labelled rectangle on ``ax``."""
        chunks = self._apply_strategy(text, strategy)

        # Set up the plot; keep a non-zero height even when there are no chunks.
        ax.set_xlim(0, len(text))
        ax.set_ylim(0, max(len(chunks), 1))
        ax.set_xlabel('Character Position')
        ax.set_ylabel('Chunk Index')
        ax.set_title(self._get_strategy_title(strategy))

        # Color code chunks
        for i, chunk in enumerate(chunks):
            start_pos = chunk['start']
            end_pos = chunk['end']
            color = self.colors[i % len(self.colors)]

            # Draw chunk rectangle
            rect = patches.Rectangle(
                (start_pos, i), end_pos - start_pos, 0.8,
                linewidth=1, edgecolor='black', facecolor=color, alpha=0.7
            )
            ax.add_patch(rect)

            # Add chunk text (truncated)
            chunk_text = chunk['text']
            if len(chunk_text) > 20:
                chunk_text = chunk_text[:20] + "..."
            ax.text(
                start_pos + (end_pos - start_pos) / 2, i + 0.4,
                chunk_text, ha='center', va='center', fontsize=8
            )

            # Add chunk size info
            ax.text(
                start_pos, i - 0.1,
                f"{len(chunk['text'])} chars", ha='left', va='top', fontsize=6
            )

        # Add overlap indicators between consecutive chunks
        for i in range(1, len(chunks)):
            prev_end = chunks[i - 1]['end']
            curr_start = chunks[i]['start']
            if curr_start < prev_end:  # Overlap exists
                overlap_len = prev_end - curr_start
                ax.axvspan(
                    curr_start, prev_end, alpha=0.3, color='red',
                    ymin=(i - 0.5) / len(chunks), ymax=(i + 0.5) / len(chunks)
                )
                ax.text(
                    (curr_start + prev_end) / 2, i + 0.9,
                    f"Overlap: {overlap_len}", ha='center', va='bottom',
                    fontsize=6, color='red', weight='bold'
                )

    def _apply_strategy(self, text, strategy):
        """Dispatch to semantic or fixed-size chunking based on ``method``."""
        if strategy.get('method') == 'semantic':
            return self._semantic_chunking(text, strategy)
        return self._fixed_size_chunking(text, strategy)

    def _fixed_size_chunking(self, text, strategy):
        """Split ``text`` into windows of ``chunk_size`` characters.

        ``overlap`` is a percentage of ``chunk_size`` (int or float). It is
        clamped so that every window advances by at least one character,
        which guarantees termination even for overlaps of 100% or more.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        chunk_size = int(strategy.get('chunk_size', 100))
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")

        overlap = strategy.get('overlap', 0)
        overlap_chars = int(chunk_size * overlap / 100)
        overlap_chars = max(0, min(overlap_chars, chunk_size - 1))

        chunks = []
        start = 0
        while start < len(text):
            end = min(start + chunk_size, len(text))
            chunks.append({
                'text': text[start:end],
                'start': start,
                'end': end
            })
            if end >= len(text):
                break
            # Step back by the overlap so the next window re-reads it.
            start = end - overlap_chars
        return chunks

    def _semantic_chunking(self, text, strategy):
        """Simulate semantic chunking by grouping consecutive sentences.

        Sentences are runs of text ending in ``.``, ``!`` or ``?``. They are
        grouped ``sentences_per_chunk`` at a time (default 3). ``start`` and
        ``end`` are real offsets into ``text`` and ``chunk['text']`` equals
        ``text[start:end]``. The ``threshold`` key is not used by this
        simulation; it only appears in the plot title.
        """
        group_size = max(1, int(strategy.get('sentences_per_chunk', 3)))

        # Locate each sentence's span in the original text.
        spans = []
        for match in re.finditer(r'([^.!?]+)[.!?]*', text):
            body = match.group(1)
            if not body.strip():
                continue  # whitespace between or after terminators
            leading = len(body) - len(body.lstrip())
            start = match.start(1) + leading
            end = match.start() + len(match.group().rstrip())
            spans.append((start, end))

        chunks = []
        for first in range(0, len(spans), group_size):
            group = spans[first:first + group_size]
            start, end = group[0][0], group[-1][1]
            chunks.append({
                'text': text[start:end],
                'start': start,
                'end': end
            })
        return chunks

    def _get_strategy_title(self, strategy):
        """Return a short human-readable title describing ``strategy``."""
        if strategy.get('method') == 'semantic':
            return f"Semantic (threshold={strategy.get('threshold', 0.7)})"
        chunk_size = strategy.get('chunk_size', 100)
        overlap = strategy.get('overlap', 0)
        return f"Fixed Size ({chunk_size} chars, {overlap}% overlap)"

    def create_interactive_visualization(self, text, strategies):
        """Build a Plotly figure with one subplot row per strategy.

        Each chunk is drawn as a thick horizontal segment from its start to
        its end offset; hovering shows the chunk label, start, end and length.
        Returns the ``plotly.graph_objects.Figure``.
        """
        import plotly.graph_objects as go
        from plotly.subplots import make_subplots

        n_strategies = len(strategies)
        # Plotly rejects vertical_spacing above 1 / (rows - 1); shrink the gap
        # for tall grids so at least half the height is left for the plots.
        if n_strategies < 2:
            spacing = 0.1
        else:
            spacing = min(0.1, 0.5 / (n_strategies - 1))

        fig = make_subplots(
            rows=n_strategies, cols=1,
            subplot_titles=[self._get_strategy_title(s) for s in strategies],
            vertical_spacing=spacing
        )

        for row, strategy in enumerate(strategies, start=1):
            chunks = self._apply_strategy(text, strategy)
            for j, chunk in enumerate(chunks):
                label = f"Chunk {j+1}"
                # Same details on both endpoints so hover is consistent.
                details = [chunk['start'], chunk['end'], len(chunk['text'])]
                fig.add_trace(
                    go.Scatter(
                        x=[chunk['start'], chunk['end']],
                        y=[j, j],
                        mode='lines+markers',
                        line=dict(width=20),
                        marker=dict(size=6),
                        hovertemplate='<b>%{text}</b><br>' +
                                      'Start: %{customdata[0]}<br>' +
                                      'End: %{customdata[1]}<br>' +
                                      'Length: %{customdata[2]}<extra></extra>',
                        text=[label, label],
                        customdata=[details, details],
                        name=label
                    ),
                    row=row, col=1
                )

        fig.update_layout(
            height=300 * n_strategies,
            title_text="Interactive Chunking Visualization",
            showlegend=False
        )
        return fig

    def compare_chunk_size_distributions(self, text, chunk_sizes):
        """Plot a box plot and histograms of chunk lengths per chunk size.

        Each entry of ``chunk_sizes`` is used as a fixed-size strategy with no
        overlap. Returns the matplotlib figure.
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Collect chunk sizes for each strategy
        all_chunk_sizes = []
        labels = []
        for size in chunk_sizes:
            chunks = self._apply_strategy(text, {"chunk_size": size, "overlap": 0})
            all_chunk_sizes.append([len(chunk['text']) for chunk in chunks])
            labels.append(f"Size {size}")

        # Box plot
        ax1.boxplot(all_chunk_sizes, labels=labels)
        ax1.set_title('Chunk Size Distribution')
        ax1.set_ylabel('Chunk Size (characters)')
        ax1.grid(True, alpha=0.3)

        # Histogram
        for sizes, label in zip(all_chunk_sizes, labels):
            ax2.hist(sizes, alpha=0.7, label=label, bins=20)
        ax2.set_title('Chunk Size Histograms')
        ax2.set_xlabel('Chunk Size (characters)')
        ax2.set_ylabel('Frequency')
        ax2.legend()
        ax2.grid(True, alpha=0.3)

        plt.tight_layout()
        return fig

    def visualize_overlap_effects(self, text, base_chunk_size, overlap_percentages):
        """Stack one chunk plot per overlap percentage, annotated with totals.

        The annotation is the summed overlap between consecutive chunks as a
        percentage of ``len(text)``. Returns the matplotlib figure.
        """
        n_plots = len(overlap_percentages)
        # squeeze=False keeps a 2-D array even for a single subplot.
        fig, axes = plt.subplots(
            n_plots, 1, figsize=(12, 3 * n_plots), squeeze=False
        )
        column = axes[:, 0]

        total_text = len(text)
        for ax, overlap in zip(column, overlap_percentages):
            strategy = {"chunk_size": base_chunk_size, "overlap": overlap}
            chunks = self._apply_strategy(text, strategy)
            self._visualize_single_strategy(text, strategy, ax)

            # Add overlap statistics
            total_overlap = sum(
                max(0, chunks[j - 1]['end'] - chunks[j]['start'])
                for j in range(1, len(chunks))
            )
            overlap_percentage = (total_overlap / total_text) * 100 if total_text > 0 else 0
            ax.text(
                0.02, 0.98, f"Total Overlap: {overlap_percentage:.1f}%",
                transform=ax.transAxes, ha='left', va='top',
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8)
            )

        plt.tight_layout()
        return fig
2. Plotly Interactive Visualizations
Advanced Interactive Dashboard
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import pandas as pd
import numpy as np
from typing import List, Dict
class InteractiveChunkingDashboard:
    """Build a multi-panel Plotly figure comparing chunking strategies.

    Strategies are dicts with optional ``name``, ``chunk_size`` (characters)
    and ``overlap`` (percentage of ``chunk_size``). Only fixed-size chunking
    is simulated here.
    """

    def __init__(self):
        self.figures = {}

    def create_strategy_comparison_dashboard(self, text, strategies):
        """Return a 3x2 figure: boundaries, sizes, overlap, coverage, metrics, table.

        Strategies without a ``name`` key are labelled ``"Strategy N"`` by
        their 1-based position.
        """
        # Create subplots
        fig = make_subplots(
            rows=3, cols=2,
            subplot_titles=[
                'Chunk Boundaries', 'Chunk Size Distribution',
                'Overlap Analysis', 'Coverage Analysis',
                'Performance Metrics', 'Interactive Chunk Explorer'
            ],
            specs=[
                [{"type": "scatter"}, {"type": "histogram"}],
                [{"type": "bar"}, {"type": "scatter"}],
                [{"type": "bar"}, {"type": "table"}]
            ]
        )

        # Process all strategies
        strategy_data = {}
        for position, strategy in enumerate(strategies, start=1):
            label = strategy.get('name', f"Strategy {position}")
            strategy_data[label] = {
                'chunks': self._apply_chunking_strategy(text, strategy),
                'strategy': strategy
            }

        self._add_chunk_boundaries(fig, strategy_data, row=1, col=1)
        self._add_size_distribution(fig, strategy_data, row=1, col=2)
        self._add_overlap_analysis(fig, strategy_data, row=2, col=1)
        self._add_coverage_analysis(fig, strategy_data, text, row=2, col=2)
        self._add_performance_metrics(fig, strategy_data, row=3, col=1)
        self._add_chunk_explorer(fig, strategy_data, row=3, col=2)

        # Update layout
        fig.update_layout(
            height=1200,
            title_text="Comprehensive Chunking Strategy Analysis",
            showlegend=True
        )
        return fig

    def _apply_chunking_strategy(self, text, strategy):
        """Simulate fixed-size chunking and return chunk dicts.

        Each chunk has ``id``, ``text``, ``start``, ``end`` and ``size``. The
        overlap is clamped so each window advances by at least one character,
        which guarantees termination for overlaps of 100% or more.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        chunk_size = int(strategy.get('chunk_size', 100))
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        overlap = strategy.get('overlap', 0)
        overlap_chars = max(0, min(int(chunk_size * overlap / 100), chunk_size - 1))

        chunks = []
        start = 0
        while start < len(text):
            end = min(start + chunk_size, len(text))
            chunks.append({
                'id': len(chunks),
                'text': text[start:end],
                'start': start,
                'end': end,
                'size': end - start
            })
            if end >= len(text):
                break
            # Calculate next start with overlap
            start = end - overlap_chars
        return chunks

    @staticmethod
    def _total_overlap(chunks):
        """Sum the characters shared by each pair of consecutive chunks."""
        return sum(
            max(0, previous['end'] - current['start'])
            for previous, current in zip(chunks, chunks[1:])
        )

    @staticmethod
    def _covered_length(chunks):
        """Count distinct character positions covered by at least one chunk."""
        covered = 0
        reach = 0
        for chunk in sorted(chunks, key=lambda c: c['start']):
            start = max(chunk['start'], reach)
            if chunk['end'] > start:
                covered += chunk['end'] - start
                reach = chunk['end']
        return covered

    def _add_chunk_boundaries(self, fig, strategy_data, row, col):
        """Draw each chunk as a horizontal segment on its strategy's row."""
        colors = px.colors.qualitative.Set3
        for i, (strategy_name, data) in enumerate(strategy_data.items()):
            color = colors[i % len(colors)]
            for j, chunk in enumerate(data['chunks']):
                # Only the first three chunks per strategy get legend entries.
                in_legend = j < 3
                fig.add_trace(
                    go.Scatter(
                        x=[chunk['start'], chunk['end']],
                        y=[i, i],
                        mode='lines+markers',
                        line=dict(width=15, color=color),
                        hovertemplate=f'<b>{strategy_name}</b><br>' +
                                      f'Chunk {j+1}<br>' +
                                      f'Start: {chunk["start"]}<br>' +
                                      f'End: {chunk["end"]}<br>' +
                                      f'Size: {chunk["size"]}<extra></extra>',
                        name=f'{strategy_name} - Chunk {j+1}' if in_legend else None,
                        showlegend=in_legend
                    ),
                    row=row, col=col
                )
        fig.update_xaxes(title_text="Character Position", row=row, col=col)
        fig.update_yaxes(title_text="Strategy", row=row, col=col)

    def _add_size_distribution(self, fig, strategy_data, row, col):
        """Add one chunk-size histogram per strategy."""
        for strategy_name, data in strategy_data.items():
            sizes = [chunk['size'] for chunk in data['chunks']]
            fig.add_trace(
                go.Histogram(
                    x=sizes,
                    name=strategy_name,
                    opacity=0.7,
                    nbinsx=20
                ),
                row=row, col=col
            )
        fig.update_xaxes(title_text="Chunk Size (characters)", row=row, col=col)
        fig.update_yaxes(title_text="Frequency", row=row, col=col)

    def _add_overlap_analysis(self, fig, strategy_data, row, col):
        """Add a bar chart of measured overlap as a percentage of the chunked span.

        The denominator is the largest chunk end offset, i.e. the length of
        text the chunks span, not the length of the first chunk.
        """
        strategy_names = []
        overlap_percentages = []
        for strategy_name, data in strategy_data.items():
            chunks = data['chunks']
            span = max((chunk['end'] for chunk in chunks), default=0)
            if span > 0:
                overlap_pct = (self._total_overlap(chunks) / span) * 100
            else:
                overlap_pct = 0
            strategy_names.append(strategy_name)
            overlap_percentages.append(overlap_pct)

        fig.add_trace(
            go.Bar(
                x=strategy_names,
                y=overlap_percentages,
                name="Overlap %",
                marker_color='lightblue'
            ),
            row=row, col=col
        )
        fig.update_xaxes(title_text="Strategy", row=row, col=col)
        fig.update_yaxes(title_text="Overlap Percentage", row=row, col=col)

    def _add_coverage_analysis(self, fig, strategy_data, text, row, col):
        """Scatter chunk count against average size, colored by coverage.

        Coverage is the share of distinct characters of ``text`` that fall in
        at least one chunk, so overlapping chunks are not counted twice.
        """
        strategy_names = []
        chunk_counts = []
        avg_sizes = []
        coverage_scores = []
        for strategy_name, data in strategy_data.items():
            chunks = data['chunks']
            coverage = (self._covered_length(chunks) / len(text)) * 100 if text else 0
            strategy_names.append(strategy_name)
            chunk_counts.append(len(chunks))
            avg_sizes.append(np.mean([chunk['size'] for chunk in chunks]) if chunks else 0)
            coverage_scores.append(coverage)

        fig.add_trace(
            go.Scatter(
                x=chunk_counts,
                y=avg_sizes,
                mode='markers+text',
                text=strategy_names,
                textposition="top center",
                marker=dict(
                    size=coverage_scores,
                    color=coverage_scores,
                    colorscale='Viridis',
                    showscale=True,
                    colorbar=dict(title="Coverage %", x=1.02)
                ),
                name="Strategies"
            ),
            row=row, col=col
        )
        fig.update_xaxes(title_text="Number of Chunks", row=row, col=col)
        fig.update_yaxes(title_text="Average Chunk Size", row=row, col=col)

    def _add_performance_metrics(self, fig, strategy_data, row, col):
        """Add grouped bars of simulated (hard-coded) scores per strategy.

        The placeholder scores repeat cyclically so every strategy gets a
        value even when more than four strategies are compared.
        """
        metrics = ['Coherence', 'Speed', 'Memory', 'Accuracy']
        strategies = list(strategy_data.keys())
        # Simulated metrics (in real implementation, these would be calculated)
        metric_values = {
            'Coherence': [0.8, 0.7, 0.9, 0.6],
            'Speed': [0.9, 0.95, 0.7, 0.85],
            'Memory': [0.7, 0.8, 0.6, 0.9],
            'Accuracy': [0.75, 0.8, 0.85, 0.7]
        }
        for metric in metrics:
            samples = metric_values[metric]
            fig.add_trace(
                go.Bar(
                    x=strategies,
                    y=[samples[k % len(samples)] for k in range(len(strategies))],
                    name=metric,
                    opacity=0.8
                ),
                row=row, col=col
            )
        fig.update_xaxes(title_text="Strategy", row=row, col=col)
        fig.update_yaxes(title_text="Score (0-1)", row=row, col=col)

    def _add_chunk_explorer(self, fig, strategy_data, row, col):
        """Add a table previewing up to ten chunks of the first strategy.

        Does nothing when ``strategy_data`` is empty.
        """
        if not strategy_data:
            return
        # Select first strategy for table display
        chunks = next(iter(strategy_data.values()))['chunks']

        # Create table data
        table_data = []
        for i, chunk in enumerate(chunks[:10]):  # Limit to 10 chunks
            preview = chunk['text']
            if len(preview) > 50:
                preview = preview[:50] + "..."
            table_data.append([
                f"Chunk {i+1}",
                f"{chunk['size']}",
                f"{chunk['start']}-{chunk['end']}",
                preview
            ])

        fig.add_trace(
            go.Table(
                header=dict(
                    values=['Chunk ID', 'Size', 'Position', 'Preview'],
                    fill_color='lightblue',
                    align='left'
                ),
                cells=dict(
                    # Transpose rows into the column-wise layout Table expects.
                    values=list(zip(*table_data)),
                    fill_color='white',
                    align='left'
                ),
                name="Chunk Details"
            ),
            row=row, col=col
        )

    def create_real_time_analysis(self, text_input_callback):
        """Return a placeholder figure with Play/Pause buttons.

        The trace is static sample data and ``text_input_callback`` is
        currently not used; a real data source still has to be wired in.
        """
        fig = go.Figure()
        # This would be connected to real-time data
        fig.add_trace(
            go.Scatter(
                x=[1, 2, 3, 4],
                y=[10, 11, 12, 13],
                mode='lines+markers',
                name='Real-time Performance'
            )
        )
        fig.update_layout(
            title="Real-time Chunking Analysis",
            xaxis_title="Time",
            yaxis_title="Performance Metric",
            updatemenus=[{
                'buttons': [
                    {
                        'label': 'Play',
                        'method': 'animate',
                        'args': [None]
                    },
                    {
                        'label': 'Pause',
                        'method': 'animate',
                        'args': [[None], {'frame': {'duration': 0, 'redraw': False}}]
                    }
                ],
                'direction': 'left',
                'pad': {'r': 10, 't': 87},
                'showactive': False,
                'x': 0.011,
                'xanchor': 'right',
                'y': 0,
                'yanchor': 'top'
            }]
        )
        return fig
3. Streamlit Web Dashboard
Complete Web Application
import streamlit as st
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import numpy as np
from typing import List, Dict
import time
class StreamlitChunkingDashboard:
def __init__(self):
self.chunking_strategies = {
"Fixed Size - Small": {"chunk_size": 100, "overlap": 0},
"Fixed Size - Medium": {"chunk_size": 250, "overlap": 0},
"Fixed Size - Large": {"chunk_size": 500, "overlap": 0},
"With Overlap - 10%": {"chunk_size": 250, "overlap": 10},
"With Overlap - 25%": {"chunk_size": 250, "overlap": 25},
"Semantic": {"method": "semantic", "threshold": 0.7}
}
def run(self):
    """Configure the page, render the sidebar, then render the four tabs."""
    st.set_page_config(
        page_title="Chunking Strategy Analyzer",
        page_icon="📊",
        layout="wide"
    )
    st.title("🔪 Chunking Strategy Analyzer")

    intro = (
        "\n"
        "Analyze and visualize different chunking strategies for RAG systems.\n"
        "Compare performance metrics and find the optimal approach for your use case.\n"
    )
    st.markdown(intro)

    # Sidebar first: it populates the session state the tabs read.
    self._render_sidebar()

    # Main content area: each tab is paired with its renderer, in order.
    tabs = st.tabs(["📊 Analysis", "🔍 Comparison", "📈 Metrics", "⚙️ Settings"])
    renderers = (
        self._render_analysis_tab,
        self._render_comparison_tab,
        self._render_metrics_tab,
        self._render_settings_tab,
    )
    for tab, render in zip(tabs, renderers):
        with tab:
            render()
def _render_sidebar(self):
    """Render input, strategy and parameter controls; store choices in session state."""
    sidebar = st.sidebar
    sidebar.header("Configuration")

    # Text input: anything other than the first two options means manual entry.
    input_method = sidebar.radio(
        "Input Method",
        ["Sample Text", "Upload File", "Enter Text"]
    )
    loaders = {
        "Sample Text": self._load_sample_text,
        "Upload File": self._handle_file_upload,
    }
    loaders.get(input_method, self._handle_text_input)()

    # Strategy selection: one checkbox per preset, all ticked by default.
    sidebar.subheader("Select Strategies")
    st.session_state.selected_strategies = [
        name
        for name in self.chunking_strategies
        if sidebar.checkbox(name, value=True)
    ]

    # Analysis parameters
    sidebar.subheader("Analysis Parameters")
    st.session_state.min_chunk_size = sidebar.slider(
        "Min Chunk Size", 50, 500, 100
    )
    st.session_state.max_chunk_size = sidebar.slider(
        "Max Chunk Size", 100, 2000, 500
    )
def _load_sample_text(self):
    """Let the user pick a bundled sample and store it as the input text."""

    def as_block(sentences):
        # One sentence per line, with a leading and a trailing newline.
        return "\n" + "\n".join(sentences) + "\n"

    samples = {
        "Technical Document": as_block([
            "Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed.",
            "Deep learning, a subset of machine learning, uses neural networks with multiple layers to progressively extract higher-level features from raw input.",
            "Natural Language Processing (NLP) combines computational linguistics with statistical and machine learning models to enable computers to process human language.",
            "Transformer models have revolutionized NLP by using attention mechanisms to weigh the importance of different words in the input text.",
            "Retrieval-Augmented Generation (RAG) enhances language models by retrieving relevant information from external knowledge bases.",
        ]),
        "Legal Document": as_block([
            "This agreement is entered into on this date between the parties herein mentioned.",
            "The terms and conditions outlined herein shall govern the relationship between the parties.",
            "Any breach of contract shall result in immediate termination and possible legal action.",
            "The jurisdiction for any disputes shall be the courts of the specified region.",
            "All amendments must be made in writing and signed by both parties.",
        ]),
        "Scientific Paper": as_block([
            "Our research demonstrates significant improvements in chunking strategies for RAG systems.",
            "We evaluated multiple approaches across different document types and query patterns.",
            "Experimental results show that semantic chunking outperforms fixed-size methods by 23% on average.",
            "Contextual retrieval provides additional benefits but requires increased computational resources.",
            "Future work will explore hybrid approaches that combine multiple strategies.",
        ]),
    }

    choice = st.sidebar.selectbox("Select Sample Text", list(samples))
    st.session_state.input_text = samples[choice]
def _handle_file_upload(self):
    """Read an uploaded text file as UTF-8 and store it as the input text.

    Shows a sidebar error, and leaves the stored text untouched, when the
    file is not valid UTF-8.
    """
    uploaded_file = st.sidebar.file_uploader(
        "Choose a file",
        type=['txt', 'md', 'csv']
    )
    if uploaded_file is None:
        return

    try:
        # getvalue() returns the full buffer regardless of the current read
        # position, so repeated script runs see the same content.
        content = uploaded_file.getvalue().decode('utf-8')
    except UnicodeDecodeError as e:
        st.sidebar.error(f"Error reading file: {e}")
    else:
        st.session_state.input_text = content
        st.sidebar.success(f"Loaded {uploaded_file.name}")
def _handle_text_input(self):
    """Show a sidebar text area and store its content when it is non-empty."""
    entered = st.sidebar.text_area(
        "Enter your text:",
        height=200
    )
    if not entered:
        # Nothing typed: keep whatever text is already stored.
        return
    st.session_state.input_text = entered
def _render_analysis_tab(self):
    """Render text statistics plus charts and a table for the selected strategies."""
    state = st.session_state
    has_text = 'input_text' in state and state.input_text
    if not has_text:
        st.warning("Please input text using the sidebar configuration.")
        return

    st.header("📊 Chunk Analysis")

    # Display text statistics
    self._display_text_statistics()

    # Without a strategy selection there is nothing further to draw.
    has_strategies = 'selected_strategies' in state and state.selected_strategies
    if not has_strategies:
        return

    results = self._apply_strategies()

    # Side-by-side charts
    left, right = st.columns(2)
    with left:
        self._create_chunk_boundary_chart(results)
    with right:
        self._create_size_distribution_chart(results)

    # Detailed analysis
    st.subheader("Detailed Strategy Analysis")
    self._create_detailed_analysis_table(results)
def _render_comparison_tab(self):
    """Render the radar chart, efficiency chart and comparison table.

    Shows a warning instead when no strategy is selected or no input text
    has been provided yet.
    """
    if 'selected_strategies' not in st.session_state or not st.session_state.selected_strategies:
        st.warning("Please select strategies in the sidebar.")
        return
    # _apply_strategies reads st.session_state.input_text, so it must exist
    # (e.g. "Upload File" chosen but no file uploaded yet leaves it unset).
    if 'input_text' not in st.session_state or not st.session_state.input_text:
        st.warning("Please input text using the sidebar configuration.")
        return

    st.header("🔍 Strategy Comparison")
    strategy_results = self._apply_strategies()

    # Comparison metrics
    col1, col2 = st.columns(2)
    with col1:
        self._create_performance_radar_chart(strategy_results)
    with col2:
        self._create_efficiency_comparison(strategy_results)

    # Detailed comparison table
    st.subheader("Detailed Comparison")
    self._create_comparison_table(strategy_results)
def _render_metrics_tab(self):
    """Render one chart per metric the user selects.

    Shows a warning instead when the input text is missing or empty, or
    when no strategy is selected.
    """
    st.header("📈 Performance Metrics")

    # Same guard as the analysis tab: an empty string counts as "no text".
    if 'input_text' not in st.session_state or not st.session_state.input_text:
        st.warning("Please input text first.")
        return
    if 'selected_strategies' not in st.session_state or not st.session_state.selected_strategies:
        st.warning("Please select strategies in the sidebar.")
        return

    # Metric selection
    selected_metrics = st.multiselect(
        "Select Metrics to Display",
        ["Coverage", "Overlap", "Coherence", "Speed", "Memory Usage"],
        default=["Coverage", "Overlap", "Coherence"]
    )
    if selected_metrics:
        strategy_results = self._apply_strategies()
        self._create_metrics_dashboard(strategy_results, selected_metrics)
def _render_settings_tab(self):
    """Render the settings widgets and the export button.

    The widget values are not stored anywhere, and the export button only
    shows a confirmation message.
    """
    st.header("⚙️ Advanced Settings")

    # Chunking parameters
    st.subheader("Chunking Parameters")
    left, right = st.columns(2)
    with left:
        st.number_input("Default Chunk Size", value=250, min_value=50, max_value=2000)
        st.number_input("Default Overlap %", value=10, min_value=0, max_value=50)
    with right:
        st.selectbox("Default Strategy", ["Fixed Size", "Semantic", "Hybrid"])
        st.slider("Similarity Threshold", 0.0, 1.0, 0.7)

    # Visualization settings: three toggles, all enabled by default.
    st.subheader("Visualization Settings")
    for toggle_label in (
        "Show Overlap Regions",
        "Show Chunk Statistics",
        "Color Code by Strategy",
    ):
        st.checkbox(toggle_label, value=True)

    # Export settings
    st.subheader("Export Options")
    chosen_format = st.selectbox("Export Format", ["PNG", "SVG", "PDF", "CSV"])
    export_clicked = st.button("Export Analysis")
    if export_clicked:
        st.success(f"Analysis exported as {chosen_format}")
def _display_text_statistics(self):
    """Show character, word and sentence counts and the average word length.

    Sentences are approximated by counting ``.``, ``!`` and ``?``. The
    average word length is the mean length of the whitespace-separated
    words, and is 0.0 when the text contains no words.
    """
    text = st.session_state.input_text
    words = text.split()
    word_count = len(words)
    char_count = len(text)
    sentence_count = text.count('.') + text.count('!') + text.count('?')
    # Guard against whitespace-only input, which has no words to average.
    avg_word_length = sum(map(len, words)) / word_count if word_count else 0.0

    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Characters", f"{char_count:,}")
    with col2:
        st.metric("Words", f"{word_count:,}")
    with col3:
        st.metric("Sentences", f"{sentence_count:,}")
    with col4:
        st.metric("Avg Word Length", f"{avg_word_length:.1f}")
def _apply_strategies(self):
    """Chunk the stored input text with every selected strategy.

    Returns a dict keyed by strategy name; each value holds the ``chunks``,
    the ``strategy`` config and the computed ``metrics``.
    """
    source_text = st.session_state.input_text

    def analyse(name):
        config = self.chunking_strategies[name]
        pieces = self._chunk_text(source_text, config)
        return {
            'chunks': pieces,
            'strategy': config,
            'metrics': self._calculate_metrics(pieces, source_text)
        }

    return {name: analyse(name) for name in st.session_state.selected_strategies}
def _chunk_text(self, text, strategy):
"""Apply chunking strategy to text"""
# Simplified chunking implementation
if strategy.get('method') == 'semantic':
# Simulate semantic chunking
sentences = text.split('. ')
chunks = []
current_chunk = []
for sentence in sentences:
current_chunk.append(sentence)
if len(' '.join(current_chunk)) > strategy.get('chunk_size', 250):
chunks.append({
'text': '. '.join(current_chunk),
'size': len('. '.join(current_chunk))
})
current_chunk = []
if current_chunk:
chunks.append({
'text': '. '.join(current_chunk),
'size': len('. '.join(current_chunk))
})
else:
# Fixed-size chunking
chunk_size = strategy.get('chunk_size', 250)
overlap = strategy.get('overlap', 0)
overlap_chars = int(chunk_size * overlap / 100)
chunks = []
start = 0
while start < len(text):
end = min(start + chunk_size, len(text))
chunks.append({
'text': text[start:end],
'size': end - start
})
start = max(0, end - overlap_chars)
if end >= len(text):
break
return chunks
def _calculate_metrics(self, chunks, original_text):
"""Calculate metrics for chunks"""
if not chunks:
return {}
total_chars = sum(chunk['size'] for chunk in chunks)
coverage = (total_chars / len(original_text)) * 100 if original_text else 0
chunk_sizes = [chunk['size'] for chunk in chunks]
avg_size = np.mean(chunk_sizes) if chunk_sizes else 0
size_variance = np.var(chunk_sizes) if chunk_sizes else 0
# Simulated metrics
coherence_score = np.random.uniform(0.6, 0.9)
speed_score = 1.0 - (len(chunks) / 100) # More chunks = slower
memory_score = 1.0 - (total_chars / 10000) # More text = more memory
return {
'num_chunks': len(chunks),
'avg_chunk_size': avg_size,
'coverage': coverage,
'size_variance': size_variance,
'coherence': coherence_score,
'speed': speed_score,
'memory': memory_score
}
def _create_chunk_boundary_chart(self, strategy_results):
    """Plot every chunk as a unit-long segment; line width scales with chunk size.

    The x axis is the chunk index and each strategy gets its own y level.
    """
    palette = px.colors.qualitative.Set3
    fig = go.Figure()

    for level, (strategy_name, data) in enumerate(strategy_results.items()):
        strategy_color = palette[level % len(palette)]
        for index, chunk in enumerate(data['chunks']):
            # Only the first segment of a strategy appears in the legend.
            is_first = index == 0
            hover = (
                f'<b>{strategy_name}</b><br>'
                + f'Chunk {index+1}<br>'
                + f'Size: {chunk["size"]}<extra></extra>'
            )
            segment = go.Scatter(
                x=[index, index + 1],
                y=[level, level],
                mode='lines',
                line=dict(width=chunk['size']/10, color=strategy_color),
                hovertemplate=hover,
                name=f'{strategy_name}' if is_first else None,
                showlegend=is_first
            )
            fig.add_trace(segment)

    fig.update_layout(
        title="Chunk Boundaries Visualization",
        xaxis_title="Chunk Index",
        yaxis_title="Strategy",
        height=400
    )
    st.plotly_chart(fig, use_container_width=True)
def _create_size_distribution_chart(self, strategy_results):
    """Plot one histogram of chunk sizes per strategy."""
    fig = go.Figure()
    for strategy_name, data in strategy_results.items():
        sizes = [piece['size'] for piece in data['chunks']]
        histogram = go.Histogram(
            x=sizes,
            name=strategy_name,
            opacity=0.7,
            nbinsx=20
        )
        fig.add_trace(histogram)

    layout = dict(
        title="Chunk Size Distribution",
        xaxis_title="Chunk Size (characters)",
        yaxis_title="Frequency",
        height=400
    )
    fig.update_layout(**layout)
    st.plotly_chart(fig, use_container_width=True)
def _create_performance_radar_chart(self, strategy_results):
    """Draw one radar polygon per strategy over five 0-1 scores.

    Coverage is rescaled from percent to a fraction, and consistency is
    ``1 - size_variance / 10000``. Every value is clamped to [0, 1] so it
    stays inside the fixed radial axis range.
    """
    metrics = ['Coherence', 'Speed', 'Memory', 'Coverage', 'Consistency']

    def clamp_unit(score):
        return min(1.0, max(0.0, score))

    fig = go.Figure()
    for strategy_name, data in strategy_results.items():
        metrics_data = data['metrics']
        values = [
            clamp_unit(metrics_data.get('coherence', 0.5)),
            clamp_unit(metrics_data.get('speed', 0.5)),
            clamp_unit(metrics_data.get('memory', 0.5)),
            clamp_unit(metrics_data.get('coverage', 0.5) / 100),
            # Inverse variance: large size spread would otherwise go negative.
            clamp_unit(1.0 - (metrics_data.get('size_variance', 0) / 10000))
        ]
        fig.add_trace(
            go.Scatterpolar(
                r=values,
                theta=metrics,
                fill='toself',
                name=strategy_name
            )
        )

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 1]
            )
        ),
        title="Performance Comparison",
        height=400
    )
    st.plotly_chart(fig, use_container_width=True)
def _create_efficiency_comparison(self, strategy_results):
    """Plot simulated processing time and memory usage on two y axes.

    Both series are random placeholder values, not measurements.
    """
    names = list(strategy_results.keys())
    # Simulated values: all times are drawn first, then all memory figures.
    simulated_seconds = [np.random.uniform(0.5, 2.0) for _ in names]
    simulated_megabytes = [np.random.uniform(10, 50) for _ in names]

    def series(values, label, **extra):
        # Shared styling for both lines; extra kwargs select the y axis.
        return go.Scatter(
            x=names,
            y=values,
            mode='markers+lines',
            name=label,
            marker=dict(size=10),
            line=dict(width=2),
            **extra
        )

    fig = go.Figure()
    fig.add_trace(series(simulated_seconds, 'Processing Time (s)'))
    fig.add_trace(series(simulated_megabytes, 'Memory Usage (MB)', yaxis='y2'))

    fig.update_layout(
        title="Efficiency Comparison",
        xaxis_title="Strategy",
        yaxis=dict(title="Processing Time (s)"),
        yaxis2=dict(title="Memory Usage (MB)", overlaying="y", side="right"),
        height=400
    )
    st.plotly_chart(fig, use_container_width=True)
def _create_detailed_analysis_table(self, strategy_results):
    """Show one formatted table row of metrics per strategy."""

    def as_row(strategy_name, stats):
        return {
            'Strategy': strategy_name,
            'Chunks': stats['num_chunks'],
            'Avg Size': f"{stats['avg_chunk_size']:.0f}",
            'Coverage (%)': f"{stats['coverage']:.1f}",
            'Coherence': f"{stats['coherence']:.2f}",
            'Speed': f"{stats['speed']:.2f}",
            'Memory': f"{stats['memory']:.2f}"
        }

    rows = [
        as_row(strategy_name, data['metrics'])
        for strategy_name, data in strategy_results.items()
    ]
    st.dataframe(pd.DataFrame(rows), use_container_width=True)
def _create_comparison_table(self, strategy_results):
    """Show a metric-by-strategy table: one row per metric, one column per strategy.

    Coverage is shown as a percentage with one decimal, integer values
    (such as the chunk count) without decimals, and everything else with
    three decimals. Missing metrics are shown as 0.
    """
    labelled_metrics = [
        ('num_chunks', 'Chunks'),
        ('avg_chunk_size', 'Avg Size'),
        ('coverage', 'Coverage %'),
        ('coherence', 'Coherence'),
        ('speed', 'Speed'),
        ('memory', 'Memory'),
    ]

    comparison_data = []
    for metric, metric_name in labelled_metrics:
        row = {'Metric': metric_name}
        for strategy_name, data in strategy_results.items():
            value = data['metrics'].get(metric, 0)
            if metric == 'coverage':
                row[strategy_name] = f"{value:.1f}%"
            elif isinstance(value, (int, np.integer)):
                # Counts are whole numbers; "3.000" would be misleading.
                row[strategy_name] = f"{value}"
            else:
                row[strategy_name] = f"{value:.3f}"
        comparison_data.append(row)

    df = pd.DataFrame(comparison_data)
    st.dataframe(df, use_container_width=True)
def _create_metrics_dashboard(self, strategy_results, selected_metrics):
    """Render a subheader and the matching chart for each selected metric.

    Metric names without a registered chart only get the subheader.
    """
    # Chart methods are looked up by name only when needed, so a metric that
    # is not selected never touches its method.
    chart_method_names = {
        "Coverage": "_create_coverage_chart",
        "Overlap": "_create_overlap_chart",
        "Coherence": "_create_coherence_chart",
        "Speed": "_create_speed_chart",
        "Memory Usage": "_create_memory_chart",
    }
    for metric in selected_metrics:
        st.subheader(f"{metric} Analysis")
        method_name = chart_method_names.get(metric)
        if method_name is not None:
            getattr(self, method_name)(strategy_results)
def _create_coverage_chart(self, strategy_results):
    """Plot each strategy's ``coverage`` metric as a bar chart.

    Strategy names go on the x-axis; the y-axis is labelled as a percentage.
    """
    names = list(strategy_results)
    coverage = [
        entry['metrics']['coverage'] for entry in strategy_results.values()
    ]

    bars = go.Bar(x=names, y=coverage, marker_color='lightblue')
    figure = go.Figure(data=[bars])
    figure.update_layout(
        title="Text Coverage by Strategy",
        xaxis_title="Strategy",
        yaxis_title="Coverage (%)",
        height=300
    )
    st.plotly_chart(figure, use_container_width=True)
def _create_overlap_chart(self, strategy_results):
    """Plot the configured overlap of each strategy as a bar chart.

    The value comes from the ``'overlap'`` key of each entry's
    ``'strategy'`` dict; strategies without that key are plotted as 0.
    """
    names = list(strategy_results)
    overlaps = [
        entry['strategy'].get('overlap', 0)
        for entry in strategy_results.values()
    ]

    bars = go.Bar(x=names, y=overlaps, marker_color='lightcoral')
    figure = go.Figure(data=[bars])
    figure.update_layout(
        title="Overlap Percentage by Strategy",
        xaxis_title="Strategy",
        yaxis_title="Overlap (%)",
        height=300
    )
    st.plotly_chart(figure, use_container_width=True)
def _create_coherence_chart(self, strategy_results):
    """Plot each strategy's ``coherence`` metric as a bar chart.

    The y-axis range is pinned to [0, 1] instead of being auto-scaled.
    """
    names = list(strategy_results)
    scores = [
        entry['metrics']['coherence'] for entry in strategy_results.values()
    ]

    bars = go.Bar(x=names, y=scores, marker_color='lightgreen')
    figure = go.Figure(data=[bars])
    figure.update_layout(
        title="Coherence Score by Strategy",
        xaxis_title="Strategy",
        yaxis_title="Coherence Score",
        yaxis={'range': [0, 1]},
        height=300
    )
    st.plotly_chart(figure, use_container_width=True)
def _create_speed_chart(self, strategy_results):
    """Plot each strategy's ``speed`` metric as a bar chart."""
    names = list(strategy_results)
    speeds = [
        entry['metrics']['speed'] for entry in strategy_results.values()
    ]

    bars = go.Bar(x=names, y=speeds, marker_color='gold')
    figure = go.Figure(data=[bars])
    figure.update_layout(
        title="Processing Speed by Strategy",
        xaxis_title="Strategy",
        yaxis_title="Speed Score",
        height=300
    )
    st.plotly_chart(figure, use_container_width=True)
def _create_memory_chart(self, strategy_results):
    """Plot each strategy's ``memory`` metric as a bar chart."""
    names = list(strategy_results)
    memory = [
        entry['metrics']['memory'] for entry in strategy_results.values()
    ]

    bars = go.Bar(x=names, y=memory, marker_color='plum')
    figure = go.Figure(data=[bars])
    figure.update_layout(
        title="Memory Usage by Strategy",
        xaxis_title="Strategy",
        yaxis_title="Memory Score",
        height=300
    )
    st.plotly_chart(figure, use_container_width=True)
# Launch the dashboard when this file is executed as a script
if __name__ == "__main__":
    StreamlitChunkingDashboard().run()
4. Command Line Usage Examples
Basic Visualization
# Example 1: Basic ChunkViz usage
from chunkviz.advanced import AdvancedChunkViz

text = "Your long document text here..."
viz = AdvancedChunkViz()

# Strategies to compare, given as (name, chunk_size, overlap)
strategies = [
    {"name": name, "chunk_size": chunk_size, "overlap": overlap}
    for name, chunk_size, overlap in (
        ("Small Chunks", 100, 0),
        ("Medium Chunks", 250, 0),
        ("Large Chunks", 500, 0),
        ("With Overlap", 250, 20),
    )
]
viz.visualize_multiple_strategies(
    text,
    strategies,
    save_path="chunking_comparison.png",
)

# Same chunk size, increasing overlap settings
viz.visualize_overlap_effects(
    text,
    250,
    [0, 10, 20, 30],
    save_path="overlap_analysis.png",
)
Interactive Dashboard
# Example 2: Interactive dashboard
from chunkviz.dashboard import InteractiveChunkingDashboard

# Defined here so this example runs on its own; replace with your document.
text = "Your long document text here..."

dashboard = InteractiveChunkingDashboard()

# Create comprehensive analysis
strategies = [
    {"name": "Fixed-100", "chunk_size": 100, "overlap": 0},
    {"name": "Fixed-250", "chunk_size": 250, "overlap": 0},
    {"name": "Semantic", "method": "semantic"}
]
fig = dashboard.create_strategy_comparison_dashboard(text, strategies)
fig.show()

# Create size distribution comparison
fig = dashboard.compare_chunk_size_distributions(text, [100, 200, 300, 400, 500])
fig.show()
Streamlit Web Application
# Example 3: Run Streamlit dashboard
# Save this file as app.py, then launch it with: streamlit run app.py
from chunkviz.streamlit_app import StreamlitChunkingDashboard

StreamlitChunkingDashboard().run()
These visualization and evaluation tools provide comprehensive capabilities for analyzing, comparing, and optimizing chunking strategies across different use cases and requirements.