#!/usr/bin/env python3
"""
Search Interface for Claude Code Insights

Provides unified search across conversations using semantic (RAG) and keyword search.
Supports filtering by dates, files, and output formatting.
"""
import sqlite3
import json
import sys
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime

import click

try:
    from rag_indexer import RAGIndexer
except ImportError:
    print("Error: Cannot import rag_indexer. Ensure it's in the same directory.")
    sys.exit(1)
class ConversationSearch:
    """Unified search interface for conversations"""

    def __init__(self, db_path: Path, embeddings_dir: Path, verbose: bool = False):
        self.db_path = db_path
        self.embeddings_dir = embeddings_dir
        self.verbose = verbose
        # Initialize RAG indexer for semantic search
        self.indexer = RAGIndexer(db_path, embeddings_dir, verbose=verbose)
        # Separate SQLite connection for metadata queries
        self.conn = sqlite3.connect(str(db_path))
        self.conn.row_factory = sqlite3.Row

    def _log(self, message: str):
        """Log if verbose mode is enabled"""
        if self.verbose:
            print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")

    def _get_conversation_details(self, conversation_id: str) -> Optional[Dict[str, Any]]:
        """Get full conversation details from SQLite"""
        cursor = self.conn.execute("""
            SELECT * FROM conversations WHERE id = ?
        """, (conversation_id,))
        row = cursor.fetchone()
        if not row:
            return None
        # List-valued columns (files_*, tools_used, topics) are stored as JSON
        # text in SQLite and decoded into Python lists here.
        return {
            'id': row['id'],
            'timestamp': row['timestamp'],
            'message_count': row['message_count'],
            'user_messages': row['user_messages'],
            'assistant_messages': row['assistant_messages'],
            'files_read': json.loads(row['files_read']) if row['files_read'] else [],
            'files_written': json.loads(row['files_written']) if row['files_written'] else [],
            'files_edited': json.loads(row['files_edited']) if row['files_edited'] else [],
            'tools_used': json.loads(row['tools_used']) if row['tools_used'] else [],
            'topics': json.loads(row['topics']) if row['topics'] else [],
            'first_user_message': row['first_user_message'],
            'last_assistant_message': row['last_assistant_message']
        }
    def semantic_search(
        self,
        query: str,
        limit: int = 10,
        date_from: Optional[str] = None,
        date_to: Optional[str] = None,
        file_pattern: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """Perform RAG-based semantic search"""
        self._log(f"Semantic search: '{query}'")
        # TODO: Add ChromaDB filters for dates/files when supported
        results = self.indexer.search(query, n_results=limit * 2)  # Get extra for filtering
        # Enrich with full conversation details
        enriched_results = []
        for result in results:
            details = self._get_conversation_details(result['id'])
            if details:
                # Apply post-search filters
                if date_from and details['timestamp'] < date_from:
                    continue
                if date_to and details['timestamp'] > date_to:
                    continue
                if file_pattern:
                    all_files = details['files_read'] + details['files_written'] + details['files_edited']
                    if not any(file_pattern in f for f in all_files):
                        continue
                enriched_results.append({
                    **result,
                    **details
                })
                if len(enriched_results) >= limit:
                    break
        return enriched_results
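
    # Illustrative sketch (comment-only, not executed): a semantic query
    # restricted to one month and to conversations touching a path substring.
    # Assumes the database and embeddings were already built by
    # conversation-processor.py; the query string and filter values below are
    # hypothetical.
    #
    #   hits = searcher.semantic_search(
    #       "flaky test failures",
    #       limit=5,
    #       date_from="2025-10-01",
    #       date_to="2025-10-31",
    #       file_pattern="src/components",
    #   )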
    def keyword_search(
        self,
        query: str,
        limit: int = 10,
        date_from: Optional[str] = None,
        date_to: Optional[str] = None,
        file_pattern: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """Perform SQL-based keyword search"""
        self._log(f"Keyword search: '{query}'")
        # Build SQL query
        conditions = [
            "(first_user_message LIKE ? OR last_assistant_message LIKE ? OR topics LIKE ?)"
        ]
        params = [f"%{query}%", f"%{query}%", f"%{query}%"]
        if date_from:
            conditions.append("timestamp >= ?")
            params.append(date_from)
        if date_to:
            conditions.append("timestamp <= ?")
            params.append(date_to)
        if file_pattern:
            conditions.append(
                "(files_read LIKE ? OR files_written LIKE ? OR files_edited LIKE ?)"
            )
            params.extend([f"%{file_pattern}%"] * 3)
        where_clause = " AND ".join(conditions)
        cursor = self.conn.execute(f"""
            SELECT * FROM conversations
            WHERE {where_clause}
            ORDER BY timestamp DESC
            LIMIT ?
        """, params + [limit])
        results = []
        for row in cursor.fetchall():
            results.append({
                'id': row['id'],
                'timestamp': row['timestamp'],
                'message_count': row['message_count'],
                'user_messages': row['user_messages'],
                'assistant_messages': row['assistant_messages'],
                'files_read': json.loads(row['files_read']) if row['files_read'] else [],
                'files_written': json.loads(row['files_written']) if row['files_written'] else [],
                'files_edited': json.loads(row['files_edited']) if row['files_edited'] else [],
                'tools_used': json.loads(row['tools_used']) if row['tools_used'] else [],
                'topics': json.loads(row['topics']) if row['topics'] else [],
                'first_user_message': row['first_user_message'],
                'last_assistant_message': row['last_assistant_message']
            })
        return results
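
    # For reference, a call like
    #   keyword_search("deploy", date_from="2025-10-01", file_pattern="ci/")
    # assembles roughly the following statement (all values are bound as
    # parameters, never interpolated into the SQL):
    #
    #   SELECT * FROM conversations
    #   WHERE (first_user_message LIKE ? OR last_assistant_message LIKE ? OR topics LIKE ?)
    #     AND timestamp >= ?
    #     AND (files_read LIKE ? OR files_written LIKE ? OR files_edited LIKE ?)
    #   ORDER BY timestamp DESC
    #   LIMIT ?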
    def search_by_file(self, file_pattern: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Find all conversations that touched specific files"""
        self._log(f"File search: '{file_pattern}'")
        cursor = self.conn.execute("""
            SELECT DISTINCT c.*
            FROM conversations c
            JOIN file_interactions fi ON c.id = fi.conversation_id
            WHERE fi.file_path LIKE ?
            ORDER BY c.timestamp DESC
            LIMIT ?
        """, (f"%{file_pattern}%", limit))
        results = []
        for row in cursor.fetchall():
            results.append({
                'id': row['id'],
                'timestamp': row['timestamp'],
                'message_count': row['message_count'],
                'files_read': json.loads(row['files_read']) if row['files_read'] else [],
                'files_written': json.loads(row['files_written']) if row['files_written'] else [],
                'files_edited': json.loads(row['files_edited']) if row['files_edited'] else [],
                'tools_used': json.loads(row['tools_used']) if row['tools_used'] else [],
                'topics': json.loads(row['topics']) if row['topics'] else [],
                'first_user_message': row['first_user_message']
            })
        return results
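
    # Sketch (hypothetical pattern value): recent conversations that touched
    # any path containing a substring, served from the file_interactions join
    # table populated by conversation-processor.py.
    #
    #   recent = searcher.search_by_file("src/components", limit=5)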
    def search_by_tool(self, tool_name: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Find conversations using specific tools"""
        self._log(f"Tool search: '{tool_name}'")
        cursor = self.conn.execute("""
            SELECT DISTINCT c.*
            FROM conversations c
            JOIN tool_usage tu ON c.id = tu.conversation_id
            WHERE tu.tool_name LIKE ?
            ORDER BY c.timestamp DESC
            LIMIT ?
        """, (f"%{tool_name}%", limit))
        results = []
        for row in cursor.fetchall():
            results.append({
                'id': row['id'],
                'timestamp': row['timestamp'],
                'message_count': row['message_count'],
                'tools_used': json.loads(row['tools_used']) if row['tools_used'] else [],
                'topics': json.loads(row['topics']) if row['topics'] else [],
                'first_user_message': row['first_user_message']
            })
        return results
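
    # Sketch: conversations in which a given tool was invoked, served from the
    # tool_usage join table. "Write" mirrors the CLI example further below.
    #
    #   writes = searcher.search_by_tool("Write", limit=10)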
    def format_results(self, results: List[Dict[str, Any]], format: str = 'text') -> str:
        """Format search results"""
        if format == 'json':
            return json.dumps(results, indent=2)
        elif format == 'markdown':
            output = [f"# Search Results ({len(results)} found)\n"]
            for i, result in enumerate(results, 1):
                timestamp = datetime.fromisoformat(result['timestamp']).strftime('%b %d, %Y %H:%M')
                similarity = f"[Similarity: {result['similarity']:.3f}] " if 'similarity' in result else ""
                output.append(f"## {i}. {similarity}{result['id']}")
                output.append(f"**Date:** {timestamp}")
                output.append(f"**Messages:** {result.get('message_count', 'N/A')}")
                if result.get('topics'):
                    output.append(f"**Topics:** {', '.join(result['topics'])}")
                all_files = (result.get('files_read', []) +
                             result.get('files_written', []) +
                             result.get('files_edited', []))
                if all_files:
                    output.append(f"**Files:** {', '.join(all_files[:5])}")
                    if len(all_files) > 5:
                        output.append(f" _(and {len(all_files) - 5} more)_")
                if result.get('tools_used'):
                    output.append(f"**Tools:** {', '.join(result['tools_used'][:5])}")
                if result.get('first_user_message'):
                    msg = result['first_user_message'][:200]
                    output.append(f"\n**Snippet:** {msg}...")
                output.append("")
            return "\n".join(output)
        else:  # text format
            output = [f"\nFound {len(results)} conversations:\n"]
            for i, result in enumerate(results, 1):
                timestamp = datetime.fromisoformat(result['timestamp']).strftime('%b %d, %Y %H:%M')
                similarity = f"[Similarity: {result['similarity']:.3f}] " if 'similarity' in result else ""
                output.append(f"{i}. {similarity}{result['id']}")
                output.append(f" Date: {timestamp}")
                output.append(f" Messages: {result.get('message_count', 'N/A')}")
                if result.get('topics'):
                    output.append(f" Topics: {', '.join(result['topics'][:3])}")
                all_files = (result.get('files_read', []) +
                             result.get('files_written', []) +
                             result.get('files_edited', []))
                if all_files:
                    output.append(f" Files: {', '.join(all_files[:3])}")
                if result.get('first_user_message'):
                    msg = result['first_user_message'][:150].replace('\n', ' ')
                    output.append(f" Preview: {msg}...")
                output.append("")
            return "\n".join(output)
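
    # Shape of the default text output for one semantic hit (all field values
    # below are hypothetical):
    #
    #   1. [Similarity: 0.812] abc123
    #    Date: Oct 14, 2025 09:30
    #    Messages: 42
    #    Topics: auth, testing, refactor
    #    Files: src/auth.py, tests/test_auth.py
    #    Preview: Can you help me debug the login flow...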
    def close(self):
        """Close connections"""
        self.indexer.close()
        if self.conn:
            self.conn.close()

@click.command()
@click.argument('query', required=False)
@click.option('--db-path', type=click.Path(), default='.claude/skills/cc-insights/.processed/conversations.db',
              help='SQLite database path')
@click.option('--embeddings-dir', type=click.Path(), default='.claude/skills/cc-insights/.processed/embeddings',
              help='ChromaDB embeddings directory')
@click.option('--semantic/--keyword', default=True, help='Use semantic (RAG) or keyword search')
@click.option('--file', type=str, help='Filter by file pattern')
@click.option('--tool', type=str, help='Search by tool name')
@click.option('--date-from', type=str, help='Start date (ISO format)')
@click.option('--date-to', type=str, help='End date (ISO format)')
@click.option('--limit', default=10, help='Maximum results')
@click.option('--format', type=click.Choice(['text', 'json', 'markdown']), default='text', help='Output format')
@click.option('--verbose', is_flag=True, help='Show detailed logs')
def main(query: Optional[str], db_path: str, embeddings_dir: str, semantic: bool, file: Optional[str],
         tool: Optional[str], date_from: Optional[str], date_to: Optional[str], limit: int, format: str, verbose: bool):
    """Search Claude Code conversations

    Examples:

        # Semantic search
        python search-conversations.py "authentication bugs"

        # Keyword search
        python search-conversations.py "React optimization" --keyword

        # Filter by file
        python search-conversations.py "testing" --file "src/components"

        # Search by tool
        python search-conversations.py --tool "Write"

        # Date range
        python search-conversations.py "refactoring" --date-from 2025-10-01

        # JSON output
        python search-conversations.py "deployment" --format json
    """
    db_path = Path(db_path)
    embeddings_dir = Path(embeddings_dir)
    if not db_path.exists():
        print(f"Error: Database not found at {db_path}")
        print("Run conversation-processor.py first")
        sys.exit(1)
    searcher = ConversationSearch(db_path, embeddings_dir, verbose=verbose)
    try:
        results = []
        if tool:
            # Search by tool
            results = searcher.search_by_tool(tool, limit=limit)
        elif query:
            # Text search; checked before --file so that a query combined with
            # --file runs a filtered text search (see Examples above) instead
            # of silently ignoring the query
            if semantic:
                results = searcher.semantic_search(
                    query,
                    limit=limit,
                    date_from=date_from,
                    date_to=date_to,
                    file_pattern=file
                )
            else:
                results = searcher.keyword_search(
                    query,
                    limit=limit,
                    date_from=date_from,
                    date_to=date_to,
                    file_pattern=file
                )
        elif file:
            # Search by file alone
            results = searcher.search_by_file(file, limit=limit)
        else:
            print("Error: Provide a query, --file, or --tool option")
            sys.exit(1)
        # Format and output
        output = searcher.format_results(results, format=format)
        print(output)
    finally:
        searcher.close()

if __name__ == '__main__':
    main()
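
# Programmatic usage sketch (comment-only; the paths are the CLI defaults
# above and must point at artifacts produced by conversation-processor.py):
#
#   from pathlib import Path
#   searcher = ConversationSearch(
#       Path(".claude/skills/cc-insights/.processed/conversations.db"),
#       Path(".claude/skills/cc-insights/.processed/embeddings"),
#   )
#   try:
#       hits = searcher.semantic_search("authentication bugs", limit=5)
#       print(searcher.format_results(hits, format="markdown"))
#   finally:
#       searcher.close()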