Initial commit

Zhongwei Li
2025-11-29 18:16:40 +08:00
commit f125e90b9f
370 changed files with 67769 additions and 0 deletions


@@ -0,0 +1,634 @@
#!/usr/bin/env python3
"""
Conversation Processor for Claude Code Insights
Parses JSONL conversation files from ~/.claude/projects/, extracts metadata,
and stores in SQLite for fast querying. Supports incremental processing.
"""
import json
import sqlite3
import hashlib
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, asdict
import click
import re
@dataclass
class ConversationMetadata:
"""Structured conversation metadata"""
id: str
project_path: str
timestamp: datetime
message_count: int
user_messages: int
assistant_messages: int
files_read: List[str]
files_written: List[str]
files_edited: List[str]
tools_used: List[str]
topics: List[str]
first_user_message: str
last_assistant_message: str
conversation_hash: str
file_size_bytes: int
processed_at: datetime
class ConversationProcessor:
"""Processes Claude Code conversation JSONL files"""
def __init__(self, db_path: Path, verbose: bool = False):
self.db_path = db_path
self.verbose = verbose
self.conn = None
self._init_database()
def _init_database(self):
"""Initialize SQLite database with schema"""
self.db_path.parent.mkdir(parents=True, exist_ok=True)
self.conn = sqlite3.connect(str(self.db_path))
self.conn.row_factory = sqlite3.Row
# Create tables
self.conn.executescript("""
CREATE TABLE IF NOT EXISTS conversations (
id TEXT PRIMARY KEY,
project_path TEXT NOT NULL,
timestamp TEXT NOT NULL,
message_count INTEGER NOT NULL,
user_messages INTEGER NOT NULL,
assistant_messages INTEGER NOT NULL,
files_read TEXT, -- JSON array
files_written TEXT, -- JSON array
files_edited TEXT, -- JSON array
tools_used TEXT, -- JSON array
topics TEXT, -- JSON array
first_user_message TEXT,
last_assistant_message TEXT,
conversation_hash TEXT UNIQUE NOT NULL,
file_size_bytes INTEGER NOT NULL,
processed_at TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_timestamp ON conversations(timestamp);
CREATE INDEX IF NOT EXISTS idx_project ON conversations(project_path);
CREATE INDEX IF NOT EXISTS idx_processed ON conversations(processed_at);
CREATE TABLE IF NOT EXISTS file_interactions (
id INTEGER PRIMARY KEY AUTOINCREMENT,
conversation_id TEXT NOT NULL,
file_path TEXT NOT NULL,
interaction_type TEXT NOT NULL, -- read, write, edit
FOREIGN KEY (conversation_id) REFERENCES conversations(id)
);
CREATE INDEX IF NOT EXISTS idx_file_path ON file_interactions(file_path);
CREATE INDEX IF NOT EXISTS idx_conversation ON file_interactions(conversation_id);
CREATE TABLE IF NOT EXISTS tool_usage (
id INTEGER PRIMARY KEY AUTOINCREMENT,
conversation_id TEXT NOT NULL,
tool_name TEXT NOT NULL,
usage_count INTEGER NOT NULL,
FOREIGN KEY (conversation_id) REFERENCES conversations(id)
);
CREATE INDEX IF NOT EXISTS idx_tool_name ON tool_usage(tool_name);
CREATE TABLE IF NOT EXISTS processing_state (
file_path TEXT PRIMARY KEY,
last_modified TEXT NOT NULL,
last_processed TEXT NOT NULL,
file_hash TEXT NOT NULL
);
""")
self.conn.commit()
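# The schema above can also be queried ad hoc with the sqlite3 CLI; for
# example (illustrative), the five most-touched files:
#   SELECT file_path, COUNT(*) AS n
#   FROM file_interactions
#   GROUP BY file_path
#   ORDER BY n DESC
#   LIMIT 5;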
def _log(self, message: str):
"""Log if verbose mode is enabled"""
if self.verbose:
print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")
def _compute_file_hash(self, file_path: Path) -> str:
"""Compute SHA256 hash of file for change detection"""
sha256 = hashlib.sha256()
with open(file_path, 'rb') as f:
for chunk in iter(lambda: f.read(8192), b''):
sha256.update(chunk)
return sha256.hexdigest()
def _needs_processing(self, file_path: Path, reindex: bool = False) -> bool:
"""Check if file needs (re)processing"""
if reindex:
return True
file_hash = self._compute_file_hash(file_path)
cursor = self.conn.execute(
"SELECT last_modified, file_hash FROM processing_state WHERE file_path = ?",
(str(file_path),)
)
row = cursor.fetchone()
if not row:
return True # Never processed
last_modified, stored_hash = row
return stored_hash != file_hash # File changed
def _update_processing_state(self, file_path: Path):
"""Update processing state for file"""
file_hash = self._compute_file_hash(file_path)
last_modified = datetime.fromtimestamp(file_path.stat().st_mtime).isoformat()
self.conn.execute("""
INSERT OR REPLACE INTO processing_state (file_path, last_modified, last_processed, file_hash)
VALUES (?, ?, ?, ?)
""", (str(file_path), last_modified, datetime.now().isoformat(), file_hash))
def _parse_jsonl_file(self, file_path: Path) -> List[Dict[str, Any]]:
"""Parse JSONL file with base64-encoded content"""
messages = []
with open(file_path, 'r', encoding='utf-8') as f:
for line_num, line in enumerate(f, 1):
try:
if line.strip():
data = json.loads(line)
messages.append(data)
except json.JSONDecodeError as e:
self._log(f"Warning: Failed to parse line {line_num} in {file_path.name}: {e}")
return messages
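# The parser above and the extractors below assume event-stream records of
# roughly this shape (a hypothetical example, not taken from a real log):
#   {"type": "user", "message": {"content": "Fix the login bug"}}
#   {"type": "assistant", "message": {"content": [
#     {"type": "text", "text": "Looking into it..."},
#     {"type": "tool_use", "name": "Read", "input": {"file_path": "auth.py"}}]}}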
def _extract_tool_uses(self, content: str) -> List[str]:
"""Extract tool names from assistant messages"""
tools = []
# Look for tool use patterns in content
tool_patterns = [
r'"name":\s*"([A-Z][a-zA-Z]+)"', # JSON tool calls
r'<tool>([A-Z][a-zA-Z]+)</tool>', # XML tool calls
]
for pattern in tool_patterns:
matches = re.findall(pattern, content)
tools.extend(matches)
return list(set(tools)) # Unique tools
def _extract_file_paths(self, content: str) -> Dict[str, List[str]]:
"""Extract file paths and their interaction types from content"""
files = {
'read': [],
'written': [],
'edited': []
}
# Patterns for file operations
read_patterns = [
r'Reading\s+(.+\.(?:py|js|ts|tsx|jsx|md|json|yaml|yml))',
r'Read\s+file:\s*(.+)',
r'"file_path":\s*"([^"]+)"', # Tool parameters
]
write_patterns = [
r'Writing\s+(.+\.(?:py|js|ts|tsx|jsx|md|json|yaml|yml))',
r'Created\s+file:\s*(.+)',
r'Write\s+(.+)',
]
edit_patterns = [
r'Editing\s+(.+\.(?:py|js|ts|tsx|jsx|md|json|yaml|yml))',
r'Modified\s+file:\s*(.+)',
r'Edit\s+(.+)',
]
for pattern in read_patterns:
files['read'].extend(re.findall(pattern, content, re.IGNORECASE))
for pattern in write_patterns:
files['written'].extend(re.findall(pattern, content, re.IGNORECASE))
for pattern in edit_patterns:
files['edited'].extend(re.findall(pattern, content, re.IGNORECASE))
# Deduplicate and clean
for key in files:
files[key] = list(set(path.strip() for path in files[key]))
return files
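# For example, content containing "Reading src/app.py" yields
# {'read': ['src/app.py'], 'written': [], 'edited': []} (path is illustrative).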
def _extract_topics(self, messages: List[Dict[str, Any]]) -> List[str]:
"""Extract topic keywords from conversation"""
# Combine first user message and some assistant responses
text = ""
user_count = 0
for msg in messages:
msg_type = msg.get('type', '')
# Handle event-stream format
if msg_type == 'user':
message_dict = msg.get('message', {})
content = message_dict.get('content', '') if isinstance(message_dict, dict) else ''
# Handle content that's a list (content blocks)
if isinstance(content, list):
message_content = ' '.join(
block.get('text', '') if isinstance(block, dict) and block.get('type') == 'text' else ''
for block in content
)
else:
message_content = content
if message_content:
text += message_content + " "
user_count += 1
if user_count >= 3: # Only use first few user messages
break
elif msg_type == 'assistant' and user_count < 3:
# Also include some assistant responses for context
message_dict = msg.get('message', {})
content = message_dict.get('content', '') if isinstance(message_dict, dict) else ''
# Handle content that's a list (content blocks)
if isinstance(content, list):
message_content = ' '.join(
block.get('text', '') if isinstance(block, dict) and block.get('type') == 'text' else ''
for block in content
)
else:
message_content = content
if message_content:
text += message_content[:200] + " " # Just a snippet
# Extract common programming keywords
keywords = []
common_topics = [
'authentication', 'auth', 'login', 'jwt', 'oauth',
'testing', 'test', 'unit test', 'integration test',
'bug', 'fix', 'error', 'issue', 'debug',
'performance', 'optimization', 'optimize', 'slow',
'refactor', 'refactoring', 'cleanup',
'feature', 'implement', 'add', 'create',
'database', 'sql', 'query', 'schema',
'api', 'endpoint', 'rest', 'graphql',
'typescript', 'javascript', 'react', 'node',
'css', 'style', 'styling', 'tailwind',
'security', 'vulnerability', 'xss', 'csrf',
'deploy', 'deployment', 'ci/cd', 'docker',
]
text_lower = text.lower()
for topic in common_topics:
if topic in text_lower:
keywords.append(topic)
return list(set(keywords))[:10] # Max 10 topics
def _process_conversation(self, file_path: Path, messages: List[Dict[str, Any]]) -> ConversationMetadata:
"""Extract metadata from parsed conversation"""
# Generate conversation ID from filename
conv_id = file_path.stem
# Count messages by role
user_messages = 0
assistant_messages = 0
first_user_msg = ""
last_assistant_msg = ""
all_tools = []
all_files = {'read': [], 'written': [], 'edited': []}
for msg in messages:
msg_type = msg.get('type', '')
# Handle event-stream format
if msg_type == 'user':
user_messages += 1
message_dict = msg.get('message', {})
content = message_dict.get('content', '') if isinstance(message_dict, dict) else ''
# Handle content that's a list (content blocks)
if isinstance(content, list):
message_content = ' '.join(
block.get('text', '') if isinstance(block, dict) and block.get('type') == 'text' else ''
for block in content
)
else:
message_content = content
if not first_user_msg and message_content:
first_user_msg = message_content[:500] # First 500 chars
elif msg_type == 'assistant':
assistant_messages += 1
message_dict = msg.get('message', {})
content = message_dict.get('content', '') if isinstance(message_dict, dict) else ''
# Handle content that's a list (content blocks)
if isinstance(content, list):
message_content = ' '.join(
block.get('text', '') if isinstance(block, dict) and block.get('type') == 'text' else ''
for block in content
)
# Also extract tools from content blocks
for block in content:
if isinstance(block, dict) and block.get('type') == 'tool_use':
tool_name = block.get('name', '')
if tool_name:
all_tools.append(tool_name)
else:
message_content = content
if message_content:
last_assistant_msg = message_content[:500]
# Extract tools and files from assistant messages
tools = self._extract_tool_uses(message_content)
all_tools.extend(tools)
files = self._extract_file_paths(message_content)
for key in all_files:
all_files[key].extend(files[key])
# Deduplicate
all_tools = list(set(all_tools))
for key in all_files:
all_files[key] = list(set(all_files[key]))
# Extract topics
topics = self._extract_topics(messages)
# File stats
file_stat = file_path.stat()
# Compute conversation hash
conv_hash = self._compute_file_hash(file_path)
# Extract timestamp (from filename or file mtime)
try:
# Try to get timestamp from file modification time
timestamp = datetime.fromtimestamp(file_stat.st_mtime)
except Exception:
timestamp = datetime.now()
return ConversationMetadata(
id=conv_id,
project_path=str(file_path.parent),
timestamp=timestamp,
message_count=len(messages),
user_messages=user_messages,
assistant_messages=assistant_messages,
files_read=all_files['read'],
files_written=all_files['written'],
files_edited=all_files['edited'],
tools_used=all_tools,
topics=topics,
first_user_message=first_user_msg,
last_assistant_message=last_assistant_msg,
conversation_hash=conv_hash,
file_size_bytes=file_stat.st_size,
processed_at=datetime.now()
)
def _store_conversation(self, metadata: ConversationMetadata):
"""Store conversation metadata in database"""
# Store main conversation record
self.conn.execute("""
INSERT OR REPLACE INTO conversations
(id, project_path, timestamp, message_count, user_messages, assistant_messages,
files_read, files_written, files_edited, tools_used, topics,
first_user_message, last_assistant_message, conversation_hash,
file_size_bytes, processed_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
metadata.id,
metadata.project_path,
metadata.timestamp.isoformat(),
metadata.message_count,
metadata.user_messages,
metadata.assistant_messages,
json.dumps(metadata.files_read),
json.dumps(metadata.files_written),
json.dumps(metadata.files_edited),
json.dumps(metadata.tools_used),
json.dumps(metadata.topics),
metadata.first_user_message,
metadata.last_assistant_message,
metadata.conversation_hash,
metadata.file_size_bytes,
metadata.processed_at.isoformat()
))
# Store file interactions
self.conn.execute(
"DELETE FROM file_interactions WHERE conversation_id = ?",
(metadata.id,)
)
for file_path in metadata.files_read:
self.conn.execute(
"INSERT INTO file_interactions (conversation_id, file_path, interaction_type) VALUES (?, ?, ?)",
(metadata.id, file_path, 'read')
)
for file_path in metadata.files_written:
self.conn.execute(
"INSERT INTO file_interactions (conversation_id, file_path, interaction_type) VALUES (?, ?, ?)",
(metadata.id, file_path, 'write')
)
for file_path in metadata.files_edited:
self.conn.execute(
"INSERT INTO file_interactions (conversation_id, file_path, interaction_type) VALUES (?, ?, ?)",
(metadata.id, file_path, 'edit')
)
# Store tool usage
self.conn.execute(
"DELETE FROM tool_usage WHERE conversation_id = ?",
(metadata.id,)
)
for tool_name in metadata.tools_used:
self.conn.execute(
"INSERT INTO tool_usage (conversation_id, tool_name, usage_count) VALUES (?, ?, ?)",
(metadata.id, tool_name, 1)
)
def process_file(self, file_path: Path, reindex: bool = False) -> bool:
"""Process a single conversation file"""
if not self._needs_processing(file_path, reindex):
self._log(f"Skipping {file_path.name} (already processed)")
return False
self._log(f"Processing {file_path.name}...")
try:
# Parse JSONL
messages = self._parse_jsonl_file(file_path)
if not messages:
self._log(f"Warning: No messages found in {file_path.name}")
return False
# Extract metadata
metadata = self._process_conversation(file_path, messages)
# Store in database
self._store_conversation(metadata)
# Update processing state
self._update_processing_state(file_path)
self.conn.commit()
self._log(f"✓ Processed {file_path.name}: {metadata.message_count} messages, "
f"{metadata.user_messages} user, {metadata.assistant_messages} assistant")
return True
except Exception as e:
self._log(f"Error processing {file_path.name}: {e}")
import traceback
if self.verbose:
traceback.print_exc()
return False
def process_project(self, project_name: str, reindex: bool = False) -> int:
"""Process all conversations for a project"""
# Find conversation files
claude_projects = Path.home() / ".claude" / "projects"
if not claude_projects.exists():
self._log(f"Error: {claude_projects} does not exist")
return 0
# Find project directory (may be encoded)
project_dirs = list(claude_projects.glob(f"*{project_name}*"))
if not project_dirs:
self._log(f"Error: No project directory found matching '{project_name}'")
return 0
if len(project_dirs) > 1:
self._log(f"Warning: Multiple project directories found, using {project_dirs[0].name}")
project_dir = project_dirs[0]
self._log(f"Processing conversations from {project_dir}")
# Find all JSONL files
jsonl_files = list(project_dir.glob("*.jsonl"))
if not jsonl_files:
self._log(f"No conversation files found in {project_dir}")
return 0
self._log(f"Found {len(jsonl_files)} conversation files")
# Process each file
processed_count = 0
for jsonl_file in jsonl_files:
if self.process_file(jsonl_file, reindex):
processed_count += 1
self._log(f"\nProcessed {processed_count}/{len(jsonl_files)} conversations")
return processed_count
def get_stats(self) -> Dict[str, Any]:
"""Get processing statistics"""
cursor = self.conn.execute("""
SELECT
COUNT(*) as total_conversations,
SUM(message_count) as total_messages,
SUM(user_messages) as total_user_messages,
SUM(assistant_messages) as total_assistant_messages,
MIN(timestamp) as earliest_conversation,
MAX(timestamp) as latest_conversation
FROM conversations
""")
row = cursor.fetchone()
stats = {
'total_conversations': row['total_conversations'],
'total_messages': row['total_messages'],
'total_user_messages': row['total_user_messages'],
'total_assistant_messages': row['total_assistant_messages'],
'earliest_conversation': row['earliest_conversation'],
'latest_conversation': row['latest_conversation']
}
# Top files
cursor = self.conn.execute("""
SELECT file_path, COUNT(*) as interaction_count
FROM file_interactions
GROUP BY file_path
ORDER BY interaction_count DESC
LIMIT 10
""")
stats['top_files'] = [
{'file': row['file_path'], 'count': row['interaction_count']}
for row in cursor.fetchall()
]
# Top tools
cursor = self.conn.execute("""
SELECT tool_name, SUM(usage_count) as total_usage
FROM tool_usage
GROUP BY tool_name
ORDER BY total_usage DESC
LIMIT 10
""")
stats['top_tools'] = [
{'tool': row['tool_name'], 'count': row['total_usage']}
for row in cursor.fetchall()
]
return stats
def close(self):
"""Close database connection"""
if self.conn:
self.conn.close()
@click.command()
@click.option('--project-name', default='annex', help='Project name to process')
@click.option('--db-path', type=click.Path(), default='.claude/skills/cc-insights/.processed/conversations.db',
help='SQLite database path')
@click.option('--reindex', is_flag=True, help='Reprocess all conversations (ignore cache)')
@click.option('--verbose', is_flag=True, help='Show detailed processing logs')
@click.option('--stats', is_flag=True, help='Show statistics after processing')
def main(project_name: str, db_path: str, reindex: bool, verbose: bool, stats: bool):
"""Process Claude Code conversations and store metadata"""
db_path = Path(db_path)
processor = ConversationProcessor(db_path, verbose=verbose)
try:
# Process conversations
count = processor.process_project(project_name, reindex=reindex)
print(f"\n✓ Processed {count} conversations")
if stats:
print("\n=== Statistics ===")
stats_data = processor.get_stats()
print(f"Total conversations: {stats_data['total_conversations']}")
print(f"Total messages: {stats_data['total_messages']}")
print(f"User messages: {stats_data['total_user_messages']}")
print(f"Assistant messages: {stats_data['total_assistant_messages']}")
print(f"Date range: {stats_data['earliest_conversation']} to {stats_data['latest_conversation']}")
print("\nTop 10 Files:")
for item in stats_data['top_files']:
print(f" {item['file']}: {item['count']} interactions")
print("\nTop 10 Tools:")
for item in stats_data['top_tools']:
print(f" {item['tool']}: {item['count']} uses")
finally:
processor.close()
if __name__ == '__main__':
main()


@@ -0,0 +1,509 @@
#!/usr/bin/env python3
"""
Insight Generator for Claude Code Insights
Analyzes conversation patterns and generates insight reports with
visualizations, metrics, and actionable recommendations.
"""
import sqlite3
import json
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
from datetime import datetime, timedelta
from collections import Counter, defaultdict
import click
try:
from jinja2 import Template, Environment, FileSystemLoader
except ImportError:
print("Error: jinja2 not installed. Run: pip install jinja2")
exit(1)
class PatternDetector:
"""Detects patterns in conversation data"""
def __init__(self, db_path: Path, verbose: bool = False):
self.db_path = db_path
self.verbose = verbose
self.conn = sqlite3.connect(str(db_path))
self.conn.row_factory = sqlite3.Row
def _log(self, message: str):
"""Log if verbose mode is enabled"""
if self.verbose:
print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")
def get_date_range_filter(self, date_from: Optional[str] = None, date_to: Optional[str] = None) -> Tuple[str, List]:
"""Build date range SQL filter"""
conditions = []
params = []
if date_from:
conditions.append("timestamp >= ?")
params.append(date_from)
if date_to:
conditions.append("timestamp <= ?")
params.append(date_to)
where_clause = " AND ".join(conditions) if conditions else "1=1"
return where_clause, params
def get_overview_metrics(self, date_from: Optional[str] = None, date_to: Optional[str] = None) -> Dict[str, Any]:
"""Get high-level overview metrics"""
where_clause, params = self.get_date_range_filter(date_from, date_to)
cursor = self.conn.execute(f"""
SELECT
COUNT(*) as total_conversations,
SUM(message_count) as total_messages,
SUM(user_messages) as total_user_messages,
SUM(assistant_messages) as total_assistant_messages,
AVG(message_count) as avg_messages_per_conversation,
MIN(timestamp) as earliest_conversation,
MAX(timestamp) as latest_conversation,
COUNT(DISTINCT DATE(timestamp)) as active_days
FROM conversations
WHERE {where_clause}
""", params)
row = cursor.fetchone()
return {
'total_conversations': row['total_conversations'] or 0,
'total_messages': row['total_messages'] or 0,
'total_user_messages': row['total_user_messages'] or 0,
'total_assistant_messages': row['total_assistant_messages'] or 0,
'avg_messages_per_conversation': round(row['avg_messages_per_conversation'] or 0, 1),
'earliest_conversation': row['earliest_conversation'],
'latest_conversation': row['latest_conversation'],
'active_days': row['active_days'] or 0
}
def get_file_hotspots(self, date_from: Optional[str] = None, date_to: Optional[str] = None, limit: int = 20) -> List[Dict[str, Any]]:
"""Get most frequently modified files"""
where_clause, params = self.get_date_range_filter(date_from, date_to)
cursor = self.conn.execute(f"""
SELECT
fi.file_path,
COUNT(DISTINCT fi.conversation_id) as conversation_count,
SUM(CASE WHEN fi.interaction_type = 'read' THEN 1 ELSE 0 END) as read_count,
SUM(CASE WHEN fi.interaction_type = 'write' THEN 1 ELSE 0 END) as write_count,
SUM(CASE WHEN fi.interaction_type = 'edit' THEN 1 ELSE 0 END) as edit_count
FROM file_interactions fi
JOIN conversations c ON fi.conversation_id = c.id
WHERE {where_clause}
GROUP BY fi.file_path
ORDER BY conversation_count DESC
LIMIT ?
""", params + [limit])
return [
{
'file_path': row['file_path'],
'conversation_count': row['conversation_count'],
'read_count': row['read_count'],
'write_count': row['write_count'],
'edit_count': row['edit_count'],
'total_interactions': row['read_count'] + row['write_count'] + row['edit_count']
}
for row in cursor.fetchall()
]
def get_tool_usage(self, date_from: Optional[str] = None, date_to: Optional[str] = None) -> List[Dict[str, Any]]:
"""Get tool usage statistics"""
where_clause, params = self.get_date_range_filter(date_from, date_to)
cursor = self.conn.execute(f"""
SELECT
tu.tool_name,
COUNT(DISTINCT tu.conversation_id) as conversation_count,
SUM(tu.usage_count) as total_uses
FROM tool_usage tu
JOIN conversations c ON tu.conversation_id = c.id
WHERE {where_clause}
GROUP BY tu.tool_name
ORDER BY total_uses DESC
""", params)
return [
{
'tool_name': row['tool_name'],
'conversation_count': row['conversation_count'],
'total_uses': row['total_uses']
}
for row in cursor.fetchall()
]
def get_topic_clusters(self, date_from: Optional[str] = None, date_to: Optional[str] = None) -> List[Dict[str, Any]]:
"""Get most common topics"""
where_clause, params = self.get_date_range_filter(date_from, date_to)
cursor = self.conn.execute(f"""
SELECT topics FROM conversations
WHERE {where_clause} AND topics IS NOT NULL
""", params)
topic_counter = Counter()
for row in cursor.fetchall():
topics = json.loads(row['topics'])
topic_counter.update(topics)
return [
{'topic': topic, 'count': count}
for topic, count in topic_counter.most_common(20)
]
def get_activity_timeline(self, date_from: Optional[str] = None, date_to: Optional[str] = None) -> Dict[str, int]:
"""Get conversation count by date"""
where_clause, params = self.get_date_range_filter(date_from, date_to)
cursor = self.conn.execute(f"""
SELECT DATE(timestamp) as date, COUNT(*) as count
FROM conversations
WHERE {where_clause}
GROUP BY DATE(timestamp)
ORDER BY date
""", params)
return {row['date']: row['count'] for row in cursor.fetchall()}
def get_hourly_distribution(self, date_from: Optional[str] = None, date_to: Optional[str] = None) -> Dict[int, int]:
"""Get conversation distribution by hour of day"""
where_clause, params = self.get_date_range_filter(date_from, date_to)
cursor = self.conn.execute(f"""
SELECT
CAST(strftime('%H', timestamp) AS INTEGER) as hour,
COUNT(*) as count
FROM conversations
WHERE {where_clause}
GROUP BY hour
ORDER BY hour
""", params)
return {row['hour']: row['count'] for row in cursor.fetchall()}
def get_weekday_distribution(self, date_from: Optional[str] = None, date_to: Optional[str] = None) -> Dict[str, int]:
"""Get conversation distribution by day of week"""
where_clause, params = self.get_date_range_filter(date_from, date_to)
cursor = self.conn.execute(f"""
SELECT
CASE CAST(strftime('%w', timestamp) AS INTEGER)
WHEN 0 THEN 'Sunday'
WHEN 1 THEN 'Monday'
WHEN 2 THEN 'Tuesday'
WHEN 3 THEN 'Wednesday'
WHEN 4 THEN 'Thursday'
WHEN 5 THEN 'Friday'
WHEN 6 THEN 'Saturday'
END as weekday,
COUNT(*) as count
FROM conversations
WHERE {where_clause}
GROUP BY weekday
""", params)
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
result = {day: 0 for day in weekday_order}
for row in cursor.fetchall():
result[row['weekday']] = row['count']
return result
def close(self):
"""Close database connection"""
if self.conn:
self.conn.close()
class InsightGenerator:
"""Generates insight reports from pattern data"""
def __init__(self, db_path: Path, templates_dir: Path, verbose: bool = False):
self.db_path = db_path
self.templates_dir = templates_dir
self.verbose = verbose
self.detector = PatternDetector(db_path, verbose=verbose)
# Setup Jinja2 environment
if templates_dir.exists():
self.jinja_env = Environment(loader=FileSystemLoader(str(templates_dir)))
else:
self.jinja_env = None
def _log(self, message: str):
"""Log if verbose mode is enabled"""
if self.verbose:
print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")
def _create_ascii_bar_chart(self, data: Dict[str, int], max_width: int = 50) -> str:
"""Create ASCII bar chart"""
if not data:
return "No data"
max_value = max(data.values())
lines = []
for label, value in data.items():
bar_length = int((value / max_value) * max_width) if max_value > 0 else 0
bar = "█" * bar_length
lines.append(f"{label:15} {bar} {value}")
return "\n".join(lines)
def _create_sparkline(self, values: List[int]) -> str:
"""Create sparkline chart"""
if not values:
return ""
chars = "▁▂▃▄▅▆▇█"
min_val = min(values)
max_val = max(values)
if max_val == min_val:
return chars[0] * len(values)
normalized = [(v - min_val) / (max_val - min_val) for v in values]
return "".join(chars[int(n * (len(chars) - 1))] for n in normalized)
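# For example, _create_sparkline([1, 3, 5, 2]) returns "▁▄█▂"; the bar chart
# above scales each bar against the largest value in the data.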
def generate_weekly_report(self, date_from: Optional[str] = None, date_to: Optional[str] = None) -> str:
"""Generate weekly activity report"""
self._log("Generating weekly report...")
# Auto-calculate date range if not provided
if not date_from:
date_from = (datetime.now() - timedelta(days=7)).date().isoformat()
if not date_to:
date_to = datetime.now().date().isoformat()
# Gather data
overview = self.detector.get_overview_metrics(date_from, date_to)
file_hotspots = self.detector.get_file_hotspots(date_from, date_to, limit=10)
tool_usage = self.detector.get_tool_usage(date_from, date_to)
topics = self.detector.get_topic_clusters(date_from, date_to)
timeline = self.detector.get_activity_timeline(date_from, date_to)
weekday_dist = self.detector.get_weekday_distribution(date_from, date_to)
# Build report
report_lines = [
f"# Weekly Insights Report",
f"**Period:** {date_from} to {date_to}",
f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')}",
"",
"## Overview",
f"- **Total Conversations:** {overview['total_conversations']}",
f"- **Active Days:** {overview['active_days']}",
f"- **Total Messages:** {overview['total_messages']}",
f"- **Avg Messages/Conversation:** {overview['avg_messages_per_conversation']}",
"",
"## Activity Timeline",
"```",
self._create_ascii_bar_chart(timeline, max_width=40),
"```",
"",
"## Weekday Distribution",
"```",
self._create_ascii_bar_chart(weekday_dist, max_width=40),
"```",
""
]
if file_hotspots:
report_lines.extend([
"## File Hotspots (Top 10)",
""
])
for i, file in enumerate(file_hotspots, 1):
heat = "🔥" * min(3, (file['conversation_count'] + 2) // 3)
report_lines.append(
f"{i}. {heat} **{file['file_path']}** "
f"({file['conversation_count']} conversations, "
f"R:{file['read_count']} W:{file['write_count']} E:{file['edit_count']})"
)
report_lines.append("")
if tool_usage:
report_lines.extend([
"## Tool Usage",
""
])
tool_dict = {t['tool_name']: t['total_uses'] for t in tool_usage[:10]}
report_lines.append("```")
report_lines.append(self._create_ascii_bar_chart(tool_dict, max_width=40))
report_lines.append("```")
report_lines.append("")
if topics:
report_lines.extend([
"## Top Topics",
""
])
topic_dict = {t['topic']: t['count'] for t in topics[:15]}
report_lines.append("```")
report_lines.append(self._create_ascii_bar_chart(topic_dict, max_width=40))
report_lines.append("```")
report_lines.append("")
# Insights and recommendations
report_lines.extend([
"## Insights & Recommendations",
""
])
# File hotspot insights
if file_hotspots and file_hotspots[0]['conversation_count'] >= 5:
top_file = file_hotspots[0]
report_lines.append(
f"- 🔥 **High Activity File:** `{top_file['file_path']}` was modified in "
f"{top_file['conversation_count']} conversations. Consider reviewing for refactoring opportunities."
)
# Topic insights
if topics and topics[0]['count'] >= 3:
top_topic = topics[0]
report_lines.append(
f"- 📌 **Trending Topic:** '{top_topic['topic']}' appeared in {top_topic['count']} conversations. "
f"This might warrant documentation or team knowledge sharing."
)
# Activity pattern insights
if overview['active_days'] < 3:
report_lines.append(
f"- 📅 **Low Activity:** Only {overview['active_days']} active days this week. "
f"Consider scheduling regular development sessions."
)
if not report_lines[-1]:  # Last line is still blank, so no insights were appended above
report_lines.append("- No significant patterns detected this period.")
return "\n".join(report_lines)
def generate_file_heatmap_report(self, date_from: Optional[str] = None, date_to: Optional[str] = None) -> str:
"""Generate detailed file interaction heatmap"""
self._log("Generating file heatmap report...")
file_hotspots = self.detector.get_file_hotspots(date_from, date_to, limit=50)
report_lines = [
"# File Interaction Heatmap",
f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')}",
"",
"## File Hotspots",
""
]
if not file_hotspots:
report_lines.append("No file interactions found in the specified period.")
return "\n".join(report_lines)
for i, file in enumerate(file_hotspots, 1):
heat_level = min(5, (file['conversation_count'] + 1) // 2)
heat_emoji = "🔥" * heat_level
report_lines.extend([
f"### {i}. {heat_emoji} {file['file_path']}",
f"- **Conversations:** {file['conversation_count']}",
f"- **Reads:** {file['read_count']}",
f"- **Writes:** {file['write_count']}",
f"- **Edits:** {file['edit_count']}",
f"- **Total Interactions:** {file['total_interactions']}",
""
])
return "\n".join(report_lines)
def generate_tool_usage_report(self, date_from: Optional[str] = None, date_to: Optional[str] = None) -> str:
"""Generate tool usage analytics report"""
self._log("Generating tool usage report...")
tool_usage = self.detector.get_tool_usage(date_from, date_to)
report_lines = [
"# Tool Usage Analytics",
f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')}",
"",
"## Tool Statistics",
""
]
if not tool_usage:
report_lines.append("No tool usage data found.")
return "\n".join(report_lines)
total_uses = sum(t['total_uses'] for t in tool_usage)
for i, tool in enumerate(tool_usage, 1):
percentage = (tool['total_uses'] / total_uses * 100) if total_uses > 0 else 0
report_lines.extend([
f"### {i}. {tool['tool_name']}",
f"- **Total Uses:** {tool['total_uses']}",
f"- **Used in Conversations:** {tool['conversation_count']}",
f"- **Percentage of Total:** {percentage:.1f}%",
""
])
return "\n".join(report_lines)
def close(self):
"""Close connections"""
self.detector.close()
@click.command()
@click.argument('report_type', type=click.Choice(['weekly', 'file-heatmap', 'tool-usage', 'custom']))
@click.option('--db-path', type=click.Path(), default='.claude/skills/cc-insights/.processed/conversations.db',
help='SQLite database path')
@click.option('--templates-dir', type=click.Path(), default='.claude/skills/cc-insights/templates',
help='Templates directory')
@click.option('--date-from', type=str, help='Start date (ISO format)')
@click.option('--date-to', type=str, help='End date (ISO format)')
@click.option('--output', type=click.Path(), help='Save to file (default: stdout)')
@click.option('--verbose', is_flag=True, help='Show detailed logs')
def main(report_type: str, db_path: str, templates_dir: str, date_from: Optional[str],
date_to: Optional[str], output: Optional[str], verbose: bool):
"""Generate insight reports from conversation data
Report types:
weekly - Weekly activity summary with metrics
file-heatmap - File modification heatmap
tool-usage - Tool usage analytics
custom - Custom report from template
"""
db_path = Path(db_path)
templates_dir = Path(templates_dir)
if not db_path.exists():
print(f"Error: Database not found at {db_path}")
exit(1)
generator = InsightGenerator(db_path, templates_dir, verbose=verbose)
try:
# Generate report based on type
if report_type == 'weekly':
report = generator.generate_weekly_report(date_from, date_to)
elif report_type == 'file-heatmap':
report = generator.generate_file_heatmap_report(date_from, date_to)
elif report_type == 'tool-usage':
report = generator.generate_tool_usage_report(date_from, date_to)
else:
print("Custom templates not yet implemented")
exit(1)
# Output report
if output:
Path(output).write_text(report)
print(f"✓ Report saved to {output}")
else:
print(report)
finally:
generator.close()
if __name__ == '__main__':
main()


@@ -0,0 +1,298 @@
#!/usr/bin/env python3
"""
RAG Indexer for Claude Code Insights
Builds vector embeddings for semantic search using sentence-transformers
and ChromaDB. Supports incremental indexing and efficient similarity search.
"""
import sqlite3
import json
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
import click
try:
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
except ImportError as e:
print(f"Error: Required packages not installed. Run: pip install sentence-transformers chromadb")
print(f"Missing: {e}")
exit(1)
class RAGIndexer:
"""Builds and manages vector embeddings for conversations"""
def __init__(self, db_path: Path, embeddings_dir: Path, model_name: str = "all-MiniLM-L6-v2", verbose: bool = False):
self.db_path = db_path
self.embeddings_dir = embeddings_dir
self.model_name = model_name
self.verbose = verbose
# Initialize sentence transformer model
self._log("Loading embedding model...")
self.model = SentenceTransformer(model_name)
self._log(f"✓ Loaded {model_name}")
# Initialize ChromaDB
self.embeddings_dir.mkdir(parents=True, exist_ok=True)
self.chroma_client = chromadb.PersistentClient(
path=str(self.embeddings_dir),
settings=Settings(anonymized_telemetry=False)
)
# Get or create collection
self.collection = self.chroma_client.get_or_create_collection(
name="conversations",
metadata={"hnsw:space": "cosine"} # Use cosine similarity
)
# Connect to SQLite
self.conn = sqlite3.connect(str(self.db_path))
self.conn.row_factory = sqlite3.Row
def _log(self, message: str):
"""Log if verbose mode is enabled"""
if self.verbose:
print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")
def _get_indexed_conversation_ids(self) -> set:
"""Get set of conversation IDs already indexed"""
try:
results = self.collection.get(include=[])
return set(results['ids'])
except Exception:
return set()
def _fetch_conversations_to_index(self, rebuild: bool = False) -> List[Dict[str, Any]]:
"""Fetch conversations that need indexing"""
if rebuild:
# Rebuild: get all conversations
cursor = self.conn.execute("""
SELECT id, first_user_message, last_assistant_message, topics,
files_read, files_written, files_edited, timestamp
FROM conversations
ORDER BY timestamp DESC
""")
else:
# Incremental: only get conversations not yet indexed
indexed_ids = self._get_indexed_conversation_ids()
if not indexed_ids:
# Nothing indexed yet, get all
cursor = self.conn.execute("""
SELECT id, first_user_message, last_assistant_message, topics,
files_read, files_written, files_edited, timestamp
FROM conversations
ORDER BY timestamp DESC
""")
else:
# Get conversations not in indexed set
placeholders = ','.join('?' * len(indexed_ids))
cursor = self.conn.execute(f"""
SELECT id, first_user_message, last_assistant_message, topics,
files_read, files_written, files_edited, timestamp
FROM conversations
WHERE id NOT IN ({placeholders})
ORDER BY timestamp DESC
""", tuple(indexed_ids))
conversations = []
for row in cursor.fetchall():
conversations.append({
'id': row['id'],
'first_user_message': row['first_user_message'] or "",
'last_assistant_message': row['last_assistant_message'] or "",
'topics': json.loads(row['topics']) if row['topics'] else [],
'files_read': json.loads(row['files_read']) if row['files_read'] else [],
'files_written': json.loads(row['files_written']) if row['files_written'] else [],
'files_edited': json.loads(row['files_edited']) if row['files_edited'] else [],
'timestamp': row['timestamp']
})
return conversations
def _create_document_text(self, conversation: Dict[str, Any]) -> str:
"""Create text document for embedding"""
# Combine relevant fields into searchable text
parts = []
if conversation['first_user_message']:
parts.append(f"User: {conversation['first_user_message']}")
if conversation['last_assistant_message']:
parts.append(f"Assistant: {conversation['last_assistant_message']}")
if conversation['topics']:
parts.append(f"Topics: {', '.join(conversation['topics'])}")
all_files = conversation['files_read'] + conversation['files_written'] + conversation['files_edited']
if all_files:
parts.append(f"Files: {', '.join(all_files)}")
return "\n\n".join(parts)
def _create_metadata(self, conversation: Dict[str, Any]) -> Dict[str, Any]:
"""Create metadata for ChromaDB"""
return {
'timestamp': conversation['timestamp'],
'topics': json.dumps(conversation['topics']),
'files_read': json.dumps(conversation['files_read']),
'files_written': json.dumps(conversation['files_written']),
'files_edited': json.dumps(conversation['files_edited']),
}
def index_conversations(self, rebuild: bool = False, batch_size: int = 32) -> int:
"""Index conversations for semantic search"""
if rebuild:
self._log("Rebuilding entire index...")
# Clear existing collection
self.chroma_client.delete_collection("conversations")
self.collection = self.chroma_client.create_collection(
name="conversations",
metadata={"hnsw:space": "cosine"}
)
else:
self._log("Incremental indexing...")
# Fetch conversations to index
conversations = self._fetch_conversations_to_index(rebuild)
if not conversations:
self._log("No conversations to index")
return 0
self._log(f"Indexing {len(conversations)} conversations...")
# Process in batches
indexed_count = 0
for i in range(0, len(conversations), batch_size):
batch = conversations[i:i + batch_size]
# Prepare batch data
ids = []
documents = []
metadatas = []
for conv in batch:
ids.append(conv['id'])
documents.append(self._create_document_text(conv))
metadatas.append(self._create_metadata(conv))
# Generate embeddings
embeddings = self.model.encode(documents, show_progress_bar=self.verbose)
# Add to ChromaDB
self.collection.add(
ids=ids,
documents=documents,
embeddings=embeddings.tolist(),
metadatas=metadatas
)
indexed_count += len(batch)
self._log(f"Indexed {indexed_count}/{len(conversations)} conversations")
self._log(f"✓ Indexing complete: {indexed_count} conversations")
return indexed_count
def search(self, query: str, n_results: int = 10, filters: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
"""Search conversations by semantic similarity"""
# Generate query embedding
query_embedding = self.model.encode([query])[0]
# Search in ChromaDB
results = self.collection.query(
query_embeddings=[query_embedding.tolist()],
n_results=n_results,
where=filters if filters else None
)
# Format results
formatted_results = []
for i in range(len(results['ids'][0])):
formatted_results.append({
'id': results['ids'][0][i],
'distance': results['distances'][0][i],
'similarity': 1 - results['distances'][0][i], # Convert distance to similarity
'document': results['documents'][0][i],
'metadata': results['metadatas'][0][i] if results['metadatas'] else {}
})
return formatted_results
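# Each result dict carries 'id', 'distance', 'similarity', 'document', and
# 'metadata' keys, e.g. (values illustrative):
#   {'id': 'abc123', 'similarity': 0.81, 'document': 'User: Fix the login bug ...', ...}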
def get_stats(self) -> Dict[str, Any]:
"""Get indexing statistics"""
try:
count = self.collection.count()
return {
'total_indexed': count,
'model': self.model_name,
'collection_name': self.collection.name,
'embedding_dimension': self.model.get_sentence_embedding_dimension()
}
except Exception as e:
return {
'error': str(e)
}
def close(self):
"""Close connections"""
if self.conn:
self.conn.close()
@click.command()
@click.option('--db-path', type=click.Path(), default='.claude/skills/cc-insights/.processed/conversations.db',
help='SQLite database path')
@click.option('--embeddings-dir', type=click.Path(), default='.claude/skills/cc-insights/.processed/embeddings',
help='ChromaDB embeddings directory')
@click.option('--model', default='all-MiniLM-L6-v2', help='Sentence transformer model name')
@click.option('--rebuild', is_flag=True, help='Rebuild entire index (delete and recreate)')
@click.option('--batch-size', default=32, help='Batch size for embedding generation')
@click.option('--verbose', is_flag=True, help='Show detailed logs')
@click.option('--stats', is_flag=True, help='Show statistics after indexing')
@click.option('--test-search', type=str, help='Test search with query')
def main(db_path: str, embeddings_dir: str, model: str, rebuild: bool, batch_size: int, verbose: bool, stats: bool, test_search: Optional[str]):
"""Build vector embeddings for semantic search"""
db_path = Path(db_path)
embeddings_dir = Path(embeddings_dir)
if not db_path.exists():
print(f"Error: Database not found at {db_path}")
print("Run conversation-processor.py first to process conversations")
exit(1)
indexer = RAGIndexer(db_path, embeddings_dir, model, verbose=verbose)
try:
# Index conversations
count = indexer.index_conversations(rebuild=rebuild, batch_size=batch_size)
print(f"\n✓ Indexed {count} conversations")
if stats:
print("\n=== Indexing Statistics ===")
stats_data = indexer.get_stats()
for key, value in stats_data.items():
print(f"{key}: {value}")
if test_search:
print(f"\n=== Test Search: '{test_search}' ===")
results = indexer.search(test_search, n_results=5)
if not results:
print("No results found")
else:
for i, result in enumerate(results, 1):
print(f"\n{i}. [Similarity: {result['similarity']:.3f}] {result['id']}")
print(f" {result['document'][:200]}...")
finally:
indexer.close()
if __name__ == '__main__':
main()


@@ -0,0 +1,384 @@
#!/usr/bin/env python3
"""
Search Interface for Claude Code Insights
Provides unified search across conversations using semantic (RAG) and keyword search.
Supports filtering by dates, files, and output formatting.
"""
import sqlite3
import json
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
import click
try:
from rag_indexer import RAGIndexer
except ImportError:
print("Error: Cannot import rag_indexer. Ensure it's in the same directory.")
exit(1)
class ConversationSearch:
"""Unified search interface for conversations"""
def __init__(self, db_path: Path, embeddings_dir: Path, verbose: bool = False):
self.db_path = db_path
self.embeddings_dir = embeddings_dir
self.verbose = verbose
# Initialize RAG indexer for semantic search
self.indexer = RAGIndexer(db_path, embeddings_dir, verbose=verbose)
# Separate SQLite connection for metadata queries
self.conn = sqlite3.connect(str(db_path))
self.conn.row_factory = sqlite3.Row
def _log(self, message: str):
"""Log if verbose mode is enabled"""
if self.verbose:
print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")
def _get_conversation_details(self, conversation_id: str) -> Optional[Dict[str, Any]]:
"""Get full conversation details from SQLite"""
cursor = self.conn.execute("""
SELECT * FROM conversations WHERE id = ?
""", (conversation_id,))
row = cursor.fetchone()
if not row:
return None
return {
'id': row['id'],
'timestamp': row['timestamp'],
'message_count': row['message_count'],
'user_messages': row['user_messages'],
'assistant_messages': row['assistant_messages'],
'files_read': json.loads(row['files_read']) if row['files_read'] else [],
'files_written': json.loads(row['files_written']) if row['files_written'] else [],
'files_edited': json.loads(row['files_edited']) if row['files_edited'] else [],
'tools_used': json.loads(row['tools_used']) if row['tools_used'] else [],
'topics': json.loads(row['topics']) if row['topics'] else [],
'first_user_message': row['first_user_message'],
'last_assistant_message': row['last_assistant_message']
}
def semantic_search(
self,
query: str,
limit: int = 10,
date_from: Optional[str] = None,
date_to: Optional[str] = None,
file_pattern: Optional[str] = None
) -> List[Dict[str, Any]]:
"""Perform RAG-based semantic search"""
self._log(f"Semantic search: '{query}'")
# TODO: Add ChromaDB filters for dates/files when supported
results = self.indexer.search(query, n_results=limit * 2) # Get extra for filtering
# Enrich with full conversation details
enriched_results = []
for result in results:
details = self._get_conversation_details(result['id'])
if details:
# Apply post-search filters
if date_from and details['timestamp'] < date_from:
continue
if date_to and details['timestamp'] > date_to:
continue
if file_pattern:
all_files = details['files_read'] + details['files_written'] + details['files_edited']
if not any(file_pattern in f for f in all_files):
continue
enriched_results.append({
**result,
**details
})
if len(enriched_results) >= limit:
break
return enriched_results
def keyword_search(
self,
query: str,
limit: int = 10,
date_from: Optional[str] = None,
date_to: Optional[str] = None,
file_pattern: Optional[str] = None
) -> List[Dict[str, Any]]:
"""Perform SQL-based keyword search"""
self._log(f"Keyword search: '{query}'")
# Build SQL query
conditions = [
"(first_user_message LIKE ? OR last_assistant_message LIKE ? OR topics LIKE ?)"
]
params = [f"%{query}%", f"%{query}%", f"%{query}%"]
if date_from:
conditions.append("timestamp >= ?")
params.append(date_from)
if date_to:
conditions.append("timestamp <= ?")
params.append(date_to)
if file_pattern:
conditions.append(
"(files_read LIKE ? OR files_written LIKE ? OR files_edited LIKE ?)"
)
params.extend([f"%{file_pattern}%"] * 3)
where_clause = " AND ".join(conditions)
cursor = self.conn.execute(f"""
SELECT * FROM conversations
WHERE {where_clause}
ORDER BY timestamp DESC
LIMIT ?
""", params + [limit])
results = []
for row in cursor.fetchall():
results.append({
'id': row['id'],
'timestamp': row['timestamp'],
'message_count': row['message_count'],
'user_messages': row['user_messages'],
'assistant_messages': row['assistant_messages'],
'files_read': json.loads(row['files_read']) if row['files_read'] else [],
'files_written': json.loads(row['files_written']) if row['files_written'] else [],
'files_edited': json.loads(row['files_edited']) if row['files_edited'] else [],
'tools_used': json.loads(row['tools_used']) if row['tools_used'] else [],
'topics': json.loads(row['topics']) if row['topics'] else [],
'first_user_message': row['first_user_message'],
'last_assistant_message': row['last_assistant_message']
})
return results
def search_by_file(self, file_pattern: str, limit: int = 10) -> List[Dict[str, Any]]:
"""Find all conversations that touched specific files"""
self._log(f"File search: '{file_pattern}'")
cursor = self.conn.execute("""
SELECT DISTINCT c.*
FROM conversations c
JOIN file_interactions fi ON c.id = fi.conversation_id
WHERE fi.file_path LIKE ?
ORDER BY c.timestamp DESC
LIMIT ?
""", (f"%{file_pattern}%", limit))
results = []
for row in cursor.fetchall():
results.append({
'id': row['id'],
'timestamp': row['timestamp'],
'message_count': row['message_count'],
'files_read': json.loads(row['files_read']) if row['files_read'] else [],
'files_written': json.loads(row['files_written']) if row['files_written'] else [],
'files_edited': json.loads(row['files_edited']) if row['files_edited'] else [],
'tools_used': json.loads(row['tools_used']) if row['tools_used'] else [],
'topics': json.loads(row['topics']) if row['topics'] else [],
'first_user_message': row['first_user_message']
})
return results
def search_by_tool(self, tool_name: str, limit: int = 10) -> List[Dict[str, Any]]:
"""Find conversations using specific tools"""
self._log(f"Tool search: '{tool_name}'")
cursor = self.conn.execute("""
SELECT DISTINCT c.*
FROM conversations c
JOIN tool_usage tu ON c.id = tu.conversation_id
WHERE tu.tool_name LIKE ?
ORDER BY c.timestamp DESC
LIMIT ?
""", (f"%{tool_name}%", limit))
results = []
for row in cursor.fetchall():
results.append({
'id': row['id'],
'timestamp': row['timestamp'],
'message_count': row['message_count'],
'tools_used': json.loads(row['tools_used']) if row['tools_used'] else [],
'topics': json.loads(row['topics']) if row['topics'] else [],
'first_user_message': row['first_user_message']
})
return results
def format_results(self, results: List[Dict[str, Any]], format: str = 'text') -> str:
"""Format search results"""
if format == 'json':
return json.dumps(results, indent=2)
elif format == 'markdown':
output = [f"# Search Results ({len(results)} found)\n"]
for i, result in enumerate(results, 1):
timestamp = datetime.fromisoformat(result['timestamp']).strftime('%b %d, %Y %H:%M')
similarity = f"[Similarity: {result['similarity']:.3f}] " if 'similarity' in result else ""
output.append(f"## {i}. {similarity}{result['id']}")
output.append(f"**Date:** {timestamp}")
output.append(f"**Messages:** {result.get('message_count', 'N/A')}")
if result.get('topics'):
output.append(f"**Topics:** {', '.join(result['topics'])}")
all_files = (result.get('files_read', []) +
result.get('files_written', []) +
result.get('files_edited', []))
if all_files:
output.append(f"**Files:** {', '.join(all_files[:5])}")
if len(all_files) > 5:
output.append(f" _(and {len(all_files) - 5} more)_")
if result.get('tools_used'):
output.append(f"**Tools:** {', '.join(result['tools_used'][:5])}")
if result.get('first_user_message'):
msg = result['first_user_message'][:200]
output.append(f"\n**Snippet:** {msg}...")
output.append("")
return "\n".join(output)
else: # text format
output = [f"\nFound {len(results)} conversations:\n"]
for i, result in enumerate(results, 1):
timestamp = datetime.fromisoformat(result['timestamp']).strftime('%b %d, %Y %H:%M')
similarity = f"[Similarity: {result['similarity']:.3f}] " if 'similarity' in result else ""
output.append(f"{i}. {similarity}{result['id']}")
output.append(f" Date: {timestamp}")
output.append(f" Messages: {result.get('message_count', 'N/A')}")
if result.get('topics'):
output.append(f" Topics: {', '.join(result['topics'][:3])}")
all_files = (result.get('files_read', []) +
result.get('files_written', []) +
result.get('files_edited', []))
if all_files:
output.append(f" Files: {', '.join(all_files[:3])}")
if result.get('first_user_message'):
msg = result['first_user_message'][:150].replace('\n', ' ')
output.append(f" Preview: {msg}...")
output.append("")
return "\n".join(output)
def close(self):
"""Close connections"""
self.indexer.close()
if self.conn:
self.conn.close()
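# A minimal programmatic sketch (paths illustrative; see main() below for the
# CLI equivalents):
#   searcher = ConversationSearch(Path("conversations.db"), Path("embeddings"))
#   hits = searcher.semantic_search("authentication bugs", limit=5)
#   print(searcher.format_results(hits, format='markdown'))
#   searcher.close()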
@click.command()
@click.argument('query', required=False)
@click.option('--db-path', type=click.Path(), default='.claude/skills/cc-insights/.processed/conversations.db',
help='SQLite database path')
@click.option('--embeddings-dir', type=click.Path(), default='.claude/skills/cc-insights/.processed/embeddings',
help='ChromaDB embeddings directory')
@click.option('--semantic/--keyword', default=True, help='Use semantic (RAG) or keyword search')
@click.option('--file', type=str, help='Filter by file pattern')
@click.option('--tool', type=str, help='Search by tool name')
@click.option('--date-from', type=str, help='Start date (ISO format)')
@click.option('--date-to', type=str, help='End date (ISO format)')
@click.option('--limit', default=10, help='Maximum results')
@click.option('--format', type=click.Choice(['text', 'json', 'markdown']), default='text', help='Output format')
@click.option('--verbose', is_flag=True, help='Show detailed logs')
def main(query: Optional[str], db_path: str, embeddings_dir: str, semantic: bool, file: Optional[str],
tool: Optional[str], date_from: Optional[str], date_to: Optional[str], limit: int, format: str, verbose: bool):
"""Search Claude Code conversations
Examples:
# Semantic search
python search-conversations.py "authentication bugs"
# Keyword search
python search-conversations.py "React optimization" --keyword
# Filter by file
python search-conversations.py "testing" --file "src/components"
# Search by tool
python search-conversations.py --tool "Write"
# Date range
python search-conversations.py "refactoring" --date-from 2025-10-01
# JSON output
python search-conversations.py "deployment" --format json
"""
db_path = Path(db_path)
embeddings_dir = Path(embeddings_dir)
if not db_path.exists():
print(f"Error: Database not found at {db_path}")
print("Run conversation-processor.py first")
exit(1)
searcher = ConversationSearch(db_path, embeddings_dir, verbose=verbose)
try:
results = []
if tool:
# Search by tool
results = searcher.search_by_tool(tool, limit=limit)
elif file:
# Search by file
results = searcher.search_by_file(file, limit=limit)
elif query:
# Text search
if semantic:
results = searcher.semantic_search(
query,
limit=limit,
date_from=date_from,
date_to=date_to,
file_pattern=file
)
else:
results = searcher.keyword_search(
query,
limit=limit,
date_from=date_from,
date_to=date_to,
file_pattern=file
)
else:
print("Error: Provide a query, --file, or --tool option")
exit(1)
# Format and output
output = searcher.format_results(results, format=format)
print(output)
finally:
searcher.close()
if __name__ == '__main__':
main()