Initial commit

2025-11-29 18:16:51 +08:00
commit 4e8a12140c
88 changed files with 17078 additions and 0 deletions
--- a/skills/cc-insights/scripts/rag_indexer.py
+++ b/skills/cc-insights/scripts/rag_indexer.py
@@ -0,0 +1,298 @@
+#!/usr/bin/env python3
+"""
+RAG Indexer for Claude Code Insights
+
+Builds vector embeddings for semantic search using sentence-transformers
+and ChromaDB. Supports incremental indexing and efficient similarity search.
+"""
+
+import sqlite3
+import json
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+import click
+
+try:
+    from sentence_transformers import SentenceTransformer
+    import chromadb
+    from chromadb.config import Settings
+except ImportError as e:
+    print(f"Error: Required packages not installed. Run: pip install sentence-transformers chromadb")
+    print(f"Missing: {e}")
+    exit(1)
+
+
+class RAGIndexer:
+    """Builds and manages vector embeddings for conversations"""
+
+    def __init__(self, db_path: Path, embeddings_dir: Path, model_name: str = "all-MiniLM-L6-v2", verbose: bool = False):
+        self.db_path = db_path
+        self.embeddings_dir = embeddings_dir
+        self.model_name = model_name
+        self.verbose = verbose
+
+        # Initialize sentence transformer model
+        self._log("Loading embedding model...")
+        self.model = SentenceTransformer(model_name)
+        self._log(f"✓ Loaded {model_name}")
+
+        # Initialize ChromaDB
+        self.embeddings_dir.mkdir(parents=True, exist_ok=True)
+        self.chroma_client = chromadb.PersistentClient(
+            path=str(self.embeddings_dir),
+            settings=Settings(anonymized_telemetry=False)
+        )
+
+        # Get or create collection
+        self.collection = self.chroma_client.get_or_create_collection(
+            name="conversations",
+            metadata={"hnsw:space": "cosine"}  # Use cosine similarity
+        )
+
+        # Connect to SQLite
+        self.conn = sqlite3.connect(str(self.db_path))
+        self.conn.row_factory = sqlite3.Row
+
+    def _log(self, message: str):
+        """Log if verbose mode is enabled"""
+        if self.verbose:
+            print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")
+
+    def _get_indexed_conversation_ids(self) -> set:
+        """Get set of conversation IDs already indexed"""
+        try:
+            results = self.collection.get(include=[])
+            return set(results['ids'])
+        except Exception:
+            return set()
+
+    def _fetch_conversations_to_index(self, rebuild: bool = False) -> List[Dict[str, Any]]:
+        """Fetch conversations that need indexing"""
+        if rebuild:
+            # Rebuild: get all conversations
+            cursor = self.conn.execute("""
+                SELECT id, first_user_message, last_assistant_message, topics,
+                       files_read, files_written, files_edited, timestamp
+                FROM conversations
+                ORDER BY timestamp DESC
+            """)
+        else:
+            # Incremental: only get conversations not yet indexed
+            indexed_ids = self._get_indexed_conversation_ids()
+            if not indexed_ids:
+                # Nothing indexed yet, get all
+                cursor = self.conn.execute("""
+                    SELECT id, first_user_message, last_assistant_message, topics,
+                           files_read, files_written, files_edited, timestamp
+                    FROM conversations
+                    ORDER BY timestamp DESC
+                """)
+            else:
+                # Get conversations not in indexed set
+                placeholders = ','.join('?' * len(indexed_ids))
+                cursor = self.conn.execute(f"""
+                    SELECT id, first_user_message, last_assistant_message, topics,
+                           files_read, files_written, files_edited, timestamp
+                    FROM conversations
+                    WHERE id NOT IN ({placeholders})
+                    ORDER BY timestamp DESC
+                """, tuple(indexed_ids))
+
+        conversations = []
+        for row in cursor.fetchall():
+            conversations.append({
+                'id': row['id'],
+                'first_user_message': row['first_user_message'] or "",
+                'last_assistant_message': row['last_assistant_message'] or "",
+                'topics': json.loads(row['topics']) if row['topics'] else [],
+                'files_read': json.loads(row['files_read']) if row['files_read'] else [],
+                'files_written': json.loads(row['files_written']) if row['files_written'] else [],
+                'files_edited': json.loads(row['files_edited']) if row['files_edited'] else [],
+                'timestamp': row['timestamp']
+            })
+
+        return conversations
+
+    def _create_document_text(self, conversation: Dict[str, Any]) -> str:
+        """Create text document for embedding"""
+        # Combine relevant fields into searchable text
+        parts = []
+
+        if conversation['first_user_message']:
+            parts.append(f"User: {conversation['first_user_message']}")
+
+        if conversation['last_assistant_message']:
+            parts.append(f"Assistant: {conversation['last_assistant_message']}")
+
+        if conversation['topics']:
+            parts.append(f"Topics: {', '.join(conversation['topics'])}")
+
+        all_files = conversation['files_read'] + conversation['files_written'] + conversation['files_edited']
+        if all_files:
+            parts.append(f"Files: {', '.join(all_files)}")
+
+        return "\n\n".join(parts)
+
+    def _create_metadata(self, conversation: Dict[str, Any]) -> Dict[str, Any]:
+        """Create metadata for ChromaDB"""
+        return {
+            'timestamp': conversation['timestamp'],
+            'topics': json.dumps(conversation['topics']),
+            'files_read': json.dumps(conversation['files_read']),
+            'files_written': json.dumps(conversation['files_written']),
+            'files_edited': json.dumps(conversation['files_edited']),
+        }
+
+    def index_conversations(self, rebuild: bool = False, batch_size: int = 32) -> int:
+        """Index conversations for semantic search"""
+        if rebuild:
+            self._log("Rebuilding entire index...")
+            # Clear existing collection
+            self.chroma_client.delete_collection("conversations")
+            self.collection = self.chroma_client.create_collection(
+                name="conversations",
+                metadata={"hnsw:space": "cosine"}
+            )
+        else:
+            self._log("Incremental indexing...")
+
+        # Fetch conversations to index
+        conversations = self._fetch_conversations_to_index(rebuild)
+
+        if not conversations:
+            self._log("No conversations to index")
+            return 0
+
+        self._log(f"Indexing {len(conversations)} conversations...")
+
+        # Process in batches
+        indexed_count = 0
+        for i in range(0, len(conversations), batch_size):
+            batch = conversations[i:i + batch_size]
+
+            # Prepare batch data
+            ids = []
+            documents = []
+            metadatas = []
+
+            for conv in batch:
+                ids.append(conv['id'])
+                documents.append(self._create_document_text(conv))
+                metadatas.append(self._create_metadata(conv))
+
+            # Generate embeddings
+            embeddings = self.model.encode(documents, show_progress_bar=self.verbose)
+
+            # Add to ChromaDB
+            self.collection.add(
+                ids=ids,
+                documents=documents,
+                embeddings=embeddings.tolist(),
+                metadatas=metadatas
+            )
+
+            indexed_count += len(batch)
+            self._log(f"Indexed {indexed_count}/{len(conversations)} conversations")
+
+        self._log(f"✓ Indexing complete: {indexed_count} conversations")
+        return indexed_count
+
+    def search(self, query: str, n_results: int = 10, filters: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
+        """Search conversations by semantic similarity"""
+        # Generate query embedding
+        query_embedding = self.model.encode([query])[0]
+
+        # Search in ChromaDB
+        results = self.collection.query(
+            query_embeddings=[query_embedding.tolist()],
+            n_results=n_results,
+            where=filters if filters else None
+        )
+
+        # Format results
+        formatted_results = []
+        for i in range(len(results['ids'][0])):
+            formatted_results.append({
+                'id': results['ids'][0][i],
+                'distance': results['distances'][0][i],
+                'similarity': 1 - results['distances'][0][i],  # Convert distance to similarity
+                'document': results['documents'][0][i],
+                'metadata': results['metadatas'][0][i] if results['metadatas'] else {}
+            })
+
+        return formatted_results
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get indexing statistics"""
+        try:
+            count = self.collection.count()
+            return {
+                'total_indexed': count,
+                'model': self.model_name,
+                'collection_name': self.collection.name,
+                'embedding_dimension': self.model.get_sentence_embedding_dimension()
+            }
+        except Exception as e:
+            return {
+                'error': str(e)
+            }
+
+    def close(self):
+        """Close connections"""
+        if self.conn:
+            self.conn.close()
+
+
+@click.command()
+@click.option('--db-path', type=click.Path(), default='.claude/skills/cc-insights/.processed/conversations.db',
+              help='SQLite database path')
+@click.option('--embeddings-dir', type=click.Path(), default='.claude/skills/cc-insights/.processed/embeddings',
+              help='ChromaDB embeddings directory')
+@click.option('--model', default='all-MiniLM-L6-v2', help='Sentence transformer model name')
+@click.option('--rebuild', is_flag=True, help='Rebuild entire index (delete and recreate)')
+@click.option('--batch-size', default=32, help='Batch size for embedding generation')
+@click.option('--verbose', is_flag=True, help='Show detailed logs')
+@click.option('--stats', is_flag=True, help='Show statistics after indexing')
+@click.option('--test-search', type=str, help='Test search with query')
+def main(db_path: str, embeddings_dir: str, model: str, rebuild: bool, batch_size: int, verbose: bool, stats: bool, test_search: Optional[str]):
+    """Build vector embeddings for semantic search"""
+    db_path = Path(db_path)
+    embeddings_dir = Path(embeddings_dir)
+
+    if not db_path.exists():
+        print(f"Error: Database not found at {db_path}")
+        print("Run conversation-processor.py first to process conversations")
+        exit(1)
+
+    indexer = RAGIndexer(db_path, embeddings_dir, model, verbose=verbose)
+
+    try:
+        # Index conversations
+        count = indexer.index_conversations(rebuild=rebuild, batch_size=batch_size)
+
+        print(f"\n✓ Indexed {count} conversations")
+
+        if stats:
+            print("\n=== Indexing Statistics ===")
+            stats_data = indexer.get_stats()
+            for key, value in stats_data.items():
+                print(f"{key}: {value}")
+
+        if test_search:
+            print(f"\n=== Test Search: '{test_search}' ===")
+            results = indexer.search(test_search, n_results=5)
+
+            if not results:
+                print("No results found")
+            else:
+                for i, result in enumerate(results, 1):
+                    print(f"\n{i}. [Similarity: {result['similarity']:.3f}] {result['id']}")
+                    print(f"   {result['document'][:200]}...")
+
+    finally:
+        indexer.close()
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == '__main__':
+    main()