#!/usr/bin/env python3
"""
RAG Indexer for Claude Code Insights

Builds vector embeddings for semantic search using sentence-transformers
and ChromaDB. Supports incremental indexing and efficient similarity search.
"""

import sqlite3
import json
import sys
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
import click

try:
    from sentence_transformers import SentenceTransformer
    import chromadb
    from chromadb.config import Settings
except ImportError as e:
    print("Error: Required packages not installed. Run: pip install sentence-transformers chromadb")
    print(f"Missing: {e}")
    sys.exit(1)


class RAGIndexer:
    """Builds and manages vector embeddings for conversations"""

    def __init__(self, db_path: Path, embeddings_dir: Path, model_name: str = "all-MiniLM-L6-v2", verbose: bool = False):
        self.db_path = db_path
        self.embeddings_dir = embeddings_dir
        self.model_name = model_name
        self.verbose = verbose

        # Initialize sentence transformer model
        self._log("Loading embedding model...")
        self.model = SentenceTransformer(model_name)
        self._log(f"✓ Loaded {model_name}")

        # Initialize ChromaDB
        self.embeddings_dir.mkdir(parents=True, exist_ok=True)
        self.chroma_client = chromadb.PersistentClient(
            path=str(self.embeddings_dir),
            settings=Settings(anonymized_telemetry=False)
        )

        # Get or create collection
        self.collection = self.chroma_client.get_or_create_collection(
            name="conversations",
            metadata={"hnsw:space": "cosine"}  # Use cosine similarity
        )

        # Connect to SQLite
        self.conn = sqlite3.connect(str(self.db_path))
        self.conn.row_factory = sqlite3.Row

    def _log(self, message: str):
        """Log if verbose mode is enabled"""
        if self.verbose:
            print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")

    def _get_indexed_conversation_ids(self) -> set:
        """Get set of conversation IDs already indexed"""
        try:
            # include=[] asks ChromaDB for IDs only, skipping documents and metadata
            results = self.collection.get(include=[])
            return set(results['ids'])
        except Exception:
            return set()

    def _fetch_conversations_to_index(self, rebuild: bool = False) -> List[Dict[str, Any]]:
        """Fetch conversations that need indexing"""
        base_query = """
            SELECT id, first_user_message, last_assistant_message, topics,
                   files_read, files_written, files_edited, timestamp
            FROM conversations
        """

        # Rebuild re-indexes everything; incremental skips already-indexed IDs
        indexed_ids = set() if rebuild else self._get_indexed_conversation_ids()

        if not indexed_ids:
            cursor = self.conn.execute(base_query + "ORDER BY timestamp DESC")
        else:
            # Note: a very large indexed set may exceed SQLite's bound-parameter limit
            placeholders = ','.join('?' * len(indexed_ids))
            cursor = self.conn.execute(
                base_query + f"WHERE id NOT IN ({placeholders}) ORDER BY timestamp DESC",
                tuple(indexed_ids)
            )

        conversations = []
        for row in cursor.fetchall():
            conversations.append({
                'id': row['id'],
                'first_user_message': row['first_user_message'] or "",
                'last_assistant_message': row['last_assistant_message'] or "",
                'topics': json.loads(row['topics']) if row['topics'] else [],
                'files_read': json.loads(row['files_read']) if row['files_read'] else [],
                'files_written': json.loads(row['files_written']) if row['files_written'] else [],
                'files_edited': json.loads(row['files_edited']) if row['files_edited'] else [],
                'timestamp': row['timestamp']
            })

        return conversations

    def _create_document_text(self, conversation: Dict[str, Any]) -> str:
        """Create text document for embedding"""
        # Combine relevant fields into searchable text
        parts = []

        if conversation['first_user_message']:
            parts.append(f"User: {conversation['first_user_message']}")

        if conversation['last_assistant_message']:
            parts.append(f"Assistant: {conversation['last_assistant_message']}")

        if conversation['topics']:
            parts.append(f"Topics: {', '.join(conversation['topics'])}")

        all_files = conversation['files_read'] + conversation['files_written'] + conversation['files_edited']
        if all_files:
            parts.append(f"Files: {', '.join(all_files)}")

        return "\n\n".join(parts)

    def _create_metadata(self, conversation: Dict[str, Any]) -> Dict[str, Any]:
        """Create metadata for ChromaDB"""
        # ChromaDB metadata values must be primitives, so list fields are stored as JSON strings
        return {
            'timestamp': conversation['timestamp'],
            'topics': json.dumps(conversation['topics']),
            'files_read': json.dumps(conversation['files_read']),
            'files_written': json.dumps(conversation['files_written']),
            'files_edited': json.dumps(conversation['files_edited']),
        }

    def index_conversations(self, rebuild: bool = False, batch_size: int = 32) -> int:
        """Index conversations for semantic search"""
        if rebuild:
            self._log("Rebuilding entire index...")
            # Clear existing collection
            self.chroma_client.delete_collection("conversations")
            self.collection = self.chroma_client.create_collection(
                name="conversations",
                metadata={"hnsw:space": "cosine"}
            )
        else:
            self._log("Incremental indexing...")

        # Fetch conversations to index
        conversations = self._fetch_conversations_to_index(rebuild)

        if not conversations:
            self._log("No conversations to index")
            return 0

        self._log(f"Indexing {len(conversations)} conversations...")

        # Process in batches
        indexed_count = 0
        for i in range(0, len(conversations), batch_size):
            batch = conversations[i:i + batch_size]

            # Prepare batch data
            ids = []
            documents = []
            metadatas = []

            for conv in batch:
                ids.append(conv['id'])
                documents.append(self._create_document_text(conv))
                metadatas.append(self._create_metadata(conv))

            # Generate embeddings
            embeddings = self.model.encode(documents, show_progress_bar=self.verbose)

            # Add to ChromaDB
            self.collection.add(
                ids=ids,
                documents=documents,
                embeddings=embeddings.tolist(),
                metadatas=metadatas
            )

            indexed_count += len(batch)
            self._log(f"Indexed {indexed_count}/{len(conversations)} conversations")

        self._log(f"✓ Indexing complete: {indexed_count} conversations")
        return indexed_count

    def search(self, query: str, n_results: int = 10, filters: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
        """Search conversations by semantic similarity"""
        # Generate query embedding
        query_embedding = self.model.encode([query])[0]

        # Search in ChromaDB
        results = self.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=n_results,
            where=filters if filters else None
        )

        # Format results
        formatted_results = []
        for i in range(len(results['ids'][0])):
            formatted_results.append({
                'id': results['ids'][0][i],
                'distance': results['distances'][0][i],
                'similarity': 1 - results['distances'][0][i],  # Convert cosine distance to similarity
                'document': results['documents'][0][i],
                'metadata': results['metadatas'][0][i] if results['metadatas'] else {}
            })

        return formatted_results

    def get_stats(self) -> Dict[str, Any]:
        """Get indexing statistics"""
        try:
            count = self.collection.count()
            return {
                'total_indexed': count,
                'model': self.model_name,
                'collection_name': self.collection.name,
                'embedding_dimension': self.model.get_sentence_embedding_dimension()
            }
        except Exception as e:
            return {
                'error': str(e)
            }

    def close(self):
        """Close connections"""
        if self.conn:
            self.conn.close()
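

# Programmatic usage (a minimal sketch; the paths shown are the CLI defaults below):
#   indexer = RAGIndexer(
#       Path('.claude/skills/cc-insights/.processed/conversations.db'),
#       Path('.claude/skills/cc-insights/.processed/embeddings'),
#   )
#   indexer.index_conversations()
#   hits = indexer.search("chromadb indexing", n_results=5)
#   indexer.close()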


@click.command()
@click.option('--db-path', type=click.Path(), default='.claude/skills/cc-insights/.processed/conversations.db',
              help='SQLite database path')
@click.option('--embeddings-dir', type=click.Path(), default='.claude/skills/cc-insights/.processed/embeddings',
              help='ChromaDB embeddings directory')
@click.option('--model', default='all-MiniLM-L6-v2', help='Sentence transformer model name')
@click.option('--rebuild', is_flag=True, help='Rebuild entire index (delete and recreate)')
@click.option('--batch-size', default=32, help='Batch size for embedding generation')
@click.option('--verbose', is_flag=True, help='Show detailed logs')
@click.option('--stats', is_flag=True, help='Show statistics after indexing')
@click.option('--test-search', type=str, help='Test search with query')
def main(db_path: str, embeddings_dir: str, model: str, rebuild: bool, batch_size: int, verbose: bool, stats: bool, test_search: Optional[str]):
    """Build vector embeddings for semantic search"""
    db_path = Path(db_path)
    embeddings_dir = Path(embeddings_dir)

    if not db_path.exists():
        print(f"Error: Database not found at {db_path}")
        print("Run conversation-processor.py first to process conversations")
        sys.exit(1)

    indexer = RAGIndexer(db_path, embeddings_dir, model, verbose=verbose)

    try:
        # Index conversations
        count = indexer.index_conversations(rebuild=rebuild, batch_size=batch_size)

        print(f"\n✓ Indexed {count} conversations")

        if stats:
            print("\n=== Indexing Statistics ===")
            stats_data = indexer.get_stats()
            for key, value in stats_data.items():
                print(f"{key}: {value}")

        if test_search:
            print(f"\n=== Test Search: '{test_search}' ===")
            results = indexer.search(test_search, n_results=5)

            if not results:
                print("No results found")
            else:
                for i, result in enumerate(results, 1):
                    print(f"\n{i}. [Similarity: {result['similarity']:.3f}] {result['id']}")
                    print(f"   {result['document'][:200]}...")

    finally:
        indexer.close()


if __name__ == '__main__':
    main()