Initial commit

This commit is contained in:
Zhongwei Li
2025-11-29 18:16:51 +08:00
commit 4e8a12140c
88 changed files with 17078 additions and 0 deletions

View File

@@ -0,0 +1,298 @@
#!/usr/bin/env python3
"""
RAG Indexer for Claude Code Insights
Builds vector embeddings for semantic search using sentence-transformers
and ChromaDB. Supports incremental indexing and efficient similarity search.
"""
import sqlite3
import json
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
import click
try:
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
except ImportError as e:
print(f"Error: Required packages not installed. Run: pip install sentence-transformers chromadb")
print(f"Missing: {e}")
exit(1)
class RAGIndexer:
    """Builds and manages vector embeddings for conversations.

    Conversation rows are read from a SQLite database, turned into a text
    document, embedded with a sentence-transformers model and stored in a
    persistent ChromaDB collection named ``conversations`` (cosine space).

    The instance owns an open SQLite connection. Call :meth:`close` when
    done, or use the instance as a context manager.
    """

    _COLLECTION_NAME = "conversations"
    _COLLECTION_METADATA = {"hnsw:space": "cosine"}  # Use cosine similarity

    # Single source of truth for the columns every fetch path needs.
    _SELECT_CONVERSATIONS_SQL = """
        SELECT id, first_user_message, last_assistant_message, topics,
               files_read, files_written, files_edited, timestamp
        FROM conversations
        ORDER BY timestamp DESC
    """

    def __init__(self, db_path: Path, embeddings_dir: Path, model_name: str = "all-MiniLM-L6-v2", verbose: bool = False):
        """Load the embedding model, open ChromaDB and connect to SQLite.

        Args:
            db_path: Path of the SQLite database holding a ``conversations`` table.
            embeddings_dir: Directory for ChromaDB's persistent storage
                (created if missing).
            model_name: Name passed to ``SentenceTransformer``.
            verbose: When true, progress messages are printed.
        """
        self.db_path = db_path
        self.embeddings_dir = embeddings_dir
        self.model_name = model_name
        self.verbose = verbose
        # Defined up front so close() is safe even if a later step raises.
        self.conn = None

        # Initialize sentence transformer model
        self._log("Loading embedding model...")
        self.model = SentenceTransformer(model_name)
        self._log(f"✓ Loaded {model_name}")

        # Initialize ChromaDB
        self.embeddings_dir.mkdir(parents=True, exist_ok=True)
        self.chroma_client = chromadb.PersistentClient(
            path=str(self.embeddings_dir),
            settings=Settings(anonymized_telemetry=False)
        )

        # Get or create collection
        self.collection = self.chroma_client.get_or_create_collection(
            name=self._COLLECTION_NAME,
            metadata=dict(self._COLLECTION_METADATA)
        )

        # Connect to SQLite
        self.conn = sqlite3.connect(str(self.db_path))
        self.conn.row_factory = sqlite3.Row

    def __enter__(self):
        """Return the indexer itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the SQLite connection when leaving a ``with`` block."""
        self.close()

    def _log(self, message: str):
        """Print a timestamped message if verbose mode is enabled."""
        if self.verbose:
            print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")

    def _get_indexed_conversation_ids(self) -> set:
        """Return the set of ids currently stored in the collection.

        Best effort: if the collection cannot be read, the failure is logged
        and an empty set is returned, so the caller treats every conversation
        as not yet indexed.
        """
        try:
            results = self.collection.get(include=[])
            return set(results['ids'])
        except Exception as exc:
            self._log(f"Could not read indexed ids ({exc}); assuming none are indexed")
            return set()

    @staticmethod
    def _row_to_conversation(row) -> Dict[str, Any]:
        """Convert a ``conversations`` row into a plain dict.

        NULL/empty text columns become ``""`` and NULL/empty JSON columns
        become ``[]``. Non-empty JSON columns are decoded with ``json.loads``.
        """
        def decode(raw):
            return json.loads(raw) if raw else []

        return {
            'id': row['id'],
            'first_user_message': row['first_user_message'] or "",
            'last_assistant_message': row['last_assistant_message'] or "",
            'topics': decode(row['topics']),
            'files_read': decode(row['files_read']),
            'files_written': decode(row['files_written']),
            'files_edited': decode(row['files_edited']),
            'timestamp': row['timestamp']
        }

    def _fetch_conversations_to_index(self, rebuild: bool = False) -> List[Dict[str, Any]]:
        """Fetch conversations that need indexing, newest first.

        Args:
            rebuild: When true, every conversation is returned. Otherwise only
                conversations whose id is not already in the collection.

        Returns:
            A list of conversation dicts (see ``_row_to_conversation``).
        """
        indexed_ids = set() if rebuild else self._get_indexed_conversation_ids()

        # Filter in Python instead of binding every indexed id into a
        # "NOT IN (?, ?, ...)" clause: SQLite caps the number of bound
        # parameters, so that query fails once the index grows large enough.
        # Rows are streamed from the cursor and skipped before any JSON
        # decoding, so only pending conversations are materialized.
        cursor = self.conn.execute(self._SELECT_CONVERSATIONS_SQL)
        return [
            self._row_to_conversation(row)
            for row in cursor
            if row['id'] not in indexed_ids
        ]

    def _create_document_text(self, conversation: Dict[str, Any]) -> str:
        """Build the text that is embedded for a conversation.

        Non-empty sections (user message, assistant message, topics, files)
        are joined with blank lines; the result is ``""`` when all are empty.
        """
        sections = []
        if conversation['first_user_message']:
            sections.append(f"User: {conversation['first_user_message']}")
        if conversation['last_assistant_message']:
            sections.append(f"Assistant: {conversation['last_assistant_message']}")
        if conversation['topics']:
            sections.append(f"Topics: {', '.join(conversation['topics'])}")

        all_files = conversation['files_read'] + conversation['files_written'] + conversation['files_edited']
        if all_files:
            sections.append(f"Files: {', '.join(all_files)}")

        return "\n\n".join(sections)

    def _create_metadata(self, conversation: Dict[str, Any]) -> Dict[str, Any]:
        """Build the ChromaDB metadata dict for a conversation.

        List fields are stored as JSON strings. ``timestamp`` is included
        only when it is not ``None``.
        """
        metadata = {
            'topics': json.dumps(conversation['topics']),
            'files_read': json.dumps(conversation['files_read']),
            'files_written': json.dumps(conversation['files_written']),
            'files_edited': json.dumps(conversation['files_edited']),
        }
        # A NULL timestamp would reach ChromaDB as None, which some ChromaDB
        # versions reject as a metadata value; leave the key out instead.
        if conversation['timestamp'] is not None:
            metadata['timestamp'] = conversation['timestamp']
        return metadata

    def index_conversations(self, rebuild: bool = False, batch_size: int = 32) -> int:
        """Index conversations for semantic search.

        Args:
            rebuild: When true, the collection is dropped and recreated and
                every conversation is indexed again.
            batch_size: Number of conversations embedded and stored per batch.

        Returns:
            The number of conversations added to the collection.

        Raises:
            ValueError: If ``batch_size`` is less than 1.
        """
        # Validate before touching the collection: a zero step would make
        # range() raise only after a rebuild had already dropped the index,
        # and a negative step would silently index nothing.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        self._log("Rebuilding entire index..." if rebuild else "Incremental indexing...")

        # Read from SQLite first so that a failing read cannot leave a
        # rebuild with the old index already deleted.
        conversations = self._fetch_conversations_to_index(rebuild)

        if rebuild:
            # Clear existing collection
            self.chroma_client.delete_collection(self._COLLECTION_NAME)
            self.collection = self.chroma_client.create_collection(
                name=self._COLLECTION_NAME,
                metadata=dict(self._COLLECTION_METADATA)
            )

        if not conversations:
            self._log("No conversations to index")
            return 0

        total = len(conversations)
        self._log(f"Indexing {total} conversations...")

        # Process in batches
        indexed_count = 0
        for start in range(0, total, batch_size):
            batch = conversations[start:start + batch_size]
            documents = [self._create_document_text(conv) for conv in batch]

            # Generate embeddings
            embeddings = self.model.encode(documents, show_progress_bar=self.verbose)

            # Add to ChromaDB
            self.collection.add(
                ids=[conv['id'] for conv in batch],
                documents=documents,
                embeddings=embeddings.tolist(),
                metadatas=[self._create_metadata(conv) for conv in batch]
            )
            indexed_count += len(batch)
            self._log(f"Indexed {indexed_count}/{total} conversations")

        self._log(f"✓ Indexing complete: {indexed_count} conversations")
        return indexed_count

    def search(self, query: str, n_results: int = 10, filters: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
        """Search conversations by semantic similarity.

        Args:
            query: Free-text query to embed.
            n_results: Maximum number of results to return.
            filters: Optional ChromaDB ``where`` filter on metadata.

        Returns:
            A list of dicts with ``id``, ``distance``, ``similarity``
            (``1 - distance``), ``document`` and ``metadata``. Empty when the
            collection holds no documents.
        """
        # Nothing to search: skip the embedding work and the query entirely.
        available = self.collection.count()
        if available == 0:
            return []

        # Generate query embedding
        query_embedding = self.model.encode([query])[0]

        # Search in ChromaDB, never asking for more hits than are stored.
        results = self.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=min(n_results, available),
            where=filters if filters else None
        )

        # Results are nested per query; only one query was sent.
        ids = results['ids'][0]
        distances = results['distances'][0]
        documents = results['documents'][0]
        metadatas = results['metadatas'][0] if results['metadatas'] else [{} for _ in ids]

        return [
            {
                'id': conv_id,
                'distance': distance,
                'similarity': 1 - distance,  # Convert distance to similarity
                'document': document,
                'metadata': metadata
            }
            for conv_id, distance, document, metadata in zip(ids, distances, documents, metadatas)
        ]

    def get_stats(self) -> Dict[str, Any]:
        """Return indexing statistics, or ``{'error': ...}`` if they cannot be read."""
        try:
            count = self.collection.count()
            return {
                'total_indexed': count,
                'model': self.model_name,
                'collection_name': self.collection.name,
                'embedding_dimension': self.model.get_sentence_embedding_dimension()
            }
        except Exception as e:
            return {
                'error': str(e)
            }

    def close(self):
        """Close the SQLite connection. Safe to call more than once."""
        conn = getattr(self, 'conn', None)
        if conn is not None:
            conn.close()
            self.conn = None
@click.command()
@click.option('--db-path', type=click.Path(), default='.claude/skills/cc-insights/.processed/conversations.db',
              help='SQLite database path')
@click.option('--embeddings-dir', type=click.Path(), default='.claude/skills/cc-insights/.processed/embeddings',
              help='ChromaDB embeddings directory')
@click.option('--model', default='all-MiniLM-L6-v2', help='Sentence transformer model name')
@click.option('--rebuild', is_flag=True, help='Rebuild entire index (delete and recreate)')
# IntRange rejects zero/negative sizes at the command line instead of
# letting them reach the batching loop.
@click.option('--batch-size', type=click.IntRange(min=1), default=32, help='Batch size for embedding generation')
@click.option('--verbose', is_flag=True, help='Show detailed logs')
@click.option('--stats', is_flag=True, help='Show statistics after indexing')
@click.option('--test-search', type=str, help='Test search with query')
def main(db_path: str, embeddings_dir: str, model: str, rebuild: bool, batch_size: int, verbose: bool, stats: bool, test_search: Optional[str]):
    """Build vector embeddings for semantic search"""
    # NOTE: the docstring above is the command's --help text; keep it short.
    database = Path(db_path)
    embeddings = Path(embeddings_dir)

    if not database.exists():
        # Diagnostics go to stderr so stdout stays clean for piping, and
        # SystemExit is raised directly rather than relying on the
        # site-provided exit() helper.
        click.echo(f"Error: Database not found at {database}", err=True)
        click.echo("Run conversation-processor.py first to process conversations", err=True)
        raise SystemExit(1)

    indexer = RAGIndexer(database, embeddings, model, verbose=verbose)
    try:
        # Index conversations
        count = indexer.index_conversations(rebuild=rebuild, batch_size=batch_size)
        print(f"\n✓ Indexed {count} conversations")

        if stats:
            print("\n=== Indexing Statistics ===")
            for key, value in indexer.get_stats().items():
                print(f"{key}: {value}")

        if test_search:
            print(f"\n=== Test Search: '{test_search}' ===")
            results = indexer.search(test_search, n_results=5)
            if not results:
                print("No results found")
            for rank, result in enumerate(results, 1):
                print(f"\n{rank}. [Similarity: {result['similarity']:.3f}] {result['id']}")
                print(f" {result['document'][:200]}...")
    finally:
        # Always release the SQLite connection, even if indexing fails.
        indexer.close()


if __name__ == '__main__':
    main()