Initial commit

skills/cc-insights/scripts/search-conversations.py · 384 additions · executable file

@@ -0,0 +1,384 @@
#!/usr/bin/env python3
"""
Search Interface for Claude Code Insights

Provides unified search across conversations using semantic (RAG) and keyword search.
Supports filtering by dates, files, and output formatting.
"""

import sqlite3
import json
import sys
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
import click

try:
    from rag_indexer import RAGIndexer
except ImportError:
    print("Error: Cannot import rag_indexer. Ensure it's in the same directory.")
    sys.exit(1)


class ConversationSearch:
    """Unified search interface for conversations"""

    def __init__(self, db_path: Path, embeddings_dir: Path, verbose: bool = False):
        self.db_path = db_path
        self.embeddings_dir = embeddings_dir
        self.verbose = verbose

        # Initialize RAG indexer for semantic search
        self.indexer = RAGIndexer(db_path, embeddings_dir, verbose=verbose)

        # Separate SQLite connection for metadata queries
        self.conn = sqlite3.connect(str(db_path))
        self.conn.row_factory = sqlite3.Row

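    # Note: RAGIndexer is assumed to manage its own ChromaDB client (and
    # possibly its own SQLite handle), which is why close() further down
    # tears down both the indexer and this metadata connection.
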
    def _log(self, message: str):
        """Log if verbose mode is enabled"""
        if self.verbose:
            print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")

    def _get_conversation_details(self, conversation_id: str) -> Optional[Dict[str, Any]]:
        """Get full conversation details from SQLite"""
        cursor = self.conn.execute("""
            SELECT * FROM conversations WHERE id = ?
        """, (conversation_id,))

        row = cursor.fetchone()
        if not row:
            return None

        return {
            'id': row['id'],
            'timestamp': row['timestamp'],
            'message_count': row['message_count'],
            'user_messages': row['user_messages'],
            'assistant_messages': row['assistant_messages'],
            'files_read': json.loads(row['files_read']) if row['files_read'] else [],
            'files_written': json.loads(row['files_written']) if row['files_written'] else [],
            'files_edited': json.loads(row['files_edited']) if row['files_edited'] else [],
            'tools_used': json.loads(row['tools_used']) if row['tools_used'] else [],
            'topics': json.loads(row['topics']) if row['topics'] else [],
            'first_user_message': row['first_user_message'],
            'last_assistant_message': row['last_assistant_message']
        }

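    # Assumed shape of the `conversations` table (created elsewhere,
    # presumably by conversation-processor.py). This sketch is inferred from
    # the columns read above, not taken from authoritative DDL:
    #
    #   CREATE TABLE conversations (
    #       id TEXT PRIMARY KEY,
    #       timestamp TEXT,               -- ISO 8601 string
    #       message_count INTEGER,
    #       user_messages INTEGER,
    #       assistant_messages INTEGER,
    #       files_read TEXT,              -- JSON-encoded list
    #       files_written TEXT,           -- JSON-encoded list
    #       files_edited TEXT,            -- JSON-encoded list
    #       tools_used TEXT,              -- JSON-encoded list
    #       topics TEXT,                  -- JSON-encoded list
    #       first_user_message TEXT,
    #       last_assistant_message TEXT
    #   );
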
    def semantic_search(
        self,
        query: str,
        limit: int = 10,
        date_from: Optional[str] = None,
        date_to: Optional[str] = None,
        file_pattern: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """Perform RAG-based semantic search"""
        self._log(f"Semantic search: '{query}'")

        # TODO: Add ChromaDB filters for dates/files when supported
        results = self.indexer.search(query, n_results=limit * 2)  # Get extra for filtering

        # Enrich with full conversation details
        enriched_results = []
        for result in results:
            details = self._get_conversation_details(result['id'])
            if details:
                # Apply post-search filters
                if date_from and details['timestamp'] < date_from:
                    continue
                if date_to and details['timestamp'] > date_to:
                    continue
                if file_pattern:
                    all_files = details['files_read'] + details['files_written'] + details['files_edited']
                    if not any(file_pattern in f for f in all_files):
                        continue

                enriched_results.append({
                    **result,
                    **details
                })

            if len(enriched_results) >= limit:
                break

        return enriched_results

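    # The raw hits returned by RAGIndexer.search() are assumed to be dicts
    # carrying at least an 'id' key (used for the lookup above) and, when
    # available, a 'similarity' score that format_results() displays.
    # Spreading **details after **result lets the SQLite metadata win on any
    # key collision.
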
    def keyword_search(
        self,
        query: str,
        limit: int = 10,
        date_from: Optional[str] = None,
        date_to: Optional[str] = None,
        file_pattern: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """Perform SQL-based keyword search"""
        self._log(f"Keyword search: '{query}'")

        # Build SQL query
        conditions = [
            "(first_user_message LIKE ? OR last_assistant_message LIKE ? OR topics LIKE ?)"
        ]
        params = [f"%{query}%", f"%{query}%", f"%{query}%"]

        if date_from:
            conditions.append("timestamp >= ?")
            params.append(date_from)

        if date_to:
            conditions.append("timestamp <= ?")
            params.append(date_to)

        if file_pattern:
            conditions.append(
                "(files_read LIKE ? OR files_written LIKE ? OR files_edited LIKE ?)"
            )
            params.extend([f"%{file_pattern}%"] * 3)

        where_clause = " AND ".join(conditions)

        cursor = self.conn.execute(f"""
            SELECT * FROM conversations
            WHERE {where_clause}
            ORDER BY timestamp DESC
            LIMIT ?
        """, params + [limit])

        results = []
        for row in cursor.fetchall():
            results.append({
                'id': row['id'],
                'timestamp': row['timestamp'],
                'message_count': row['message_count'],
                'user_messages': row['user_messages'],
                'assistant_messages': row['assistant_messages'],
                'files_read': json.loads(row['files_read']) if row['files_read'] else [],
                'files_written': json.loads(row['files_written']) if row['files_written'] else [],
                'files_edited': json.loads(row['files_edited']) if row['files_edited'] else [],
                'tools_used': json.loads(row['tools_used']) if row['tools_used'] else [],
                'topics': json.loads(row['topics']) if row['topics'] else [],
                'first_user_message': row['first_user_message'],
                'last_assistant_message': row['last_assistant_message']
            })

        return results

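    # For example, keyword_search("auth", date_from="2025-10-01") should
    # produce roughly:
    #
    #   SELECT * FROM conversations
    #   WHERE (first_user_message LIKE ? OR last_assistant_message LIKE ? OR topics LIKE ?)
    #     AND timestamp >= ?
    #   ORDER BY timestamp DESC
    #   LIMIT ?
    #
    # with params ['%auth%', '%auth%', '%auth%', '2025-10-01', 10]. Only the
    # column conditions are interpolated into the f-string; every
    # user-supplied value stays bound as a parameter, avoiding SQL injection.
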
    def search_by_file(self, file_pattern: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Find all conversations that touched specific files"""
        self._log(f"File search: '{file_pattern}'")

        cursor = self.conn.execute("""
            SELECT DISTINCT c.*
            FROM conversations c
            JOIN file_interactions fi ON c.id = fi.conversation_id
            WHERE fi.file_path LIKE ?
            ORDER BY c.timestamp DESC
            LIMIT ?
        """, (f"%{file_pattern}%", limit))

        results = []
        for row in cursor.fetchall():
            results.append({
                'id': row['id'],
                'timestamp': row['timestamp'],
                'message_count': row['message_count'],
                'files_read': json.loads(row['files_read']) if row['files_read'] else [],
                'files_written': json.loads(row['files_written']) if row['files_written'] else [],
                'files_edited': json.loads(row['files_edited']) if row['files_edited'] else [],
                'tools_used': json.loads(row['tools_used']) if row['tools_used'] else [],
                'topics': json.loads(row['topics']) if row['topics'] else [],
                'first_user_message': row['first_user_message']
            })

        return results

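    # The join above assumes a `file_interactions` table maintained by the
    # processor, with at least (conversation_id, file_path) columns; an
    # analogous `tool_usage` table with (conversation_id, tool_name) backs
    # search_by_tool() below.
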
    def search_by_tool(self, tool_name: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Find conversations using specific tools"""
        self._log(f"Tool search: '{tool_name}'")

        cursor = self.conn.execute("""
            SELECT DISTINCT c.*
            FROM conversations c
            JOIN tool_usage tu ON c.id = tu.conversation_id
            WHERE tu.tool_name LIKE ?
            ORDER BY c.timestamp DESC
            LIMIT ?
        """, (f"%{tool_name}%", limit))

        results = []
        for row in cursor.fetchall():
            results.append({
                'id': row['id'],
                'timestamp': row['timestamp'],
                'message_count': row['message_count'],
                'tools_used': json.loads(row['tools_used']) if row['tools_used'] else [],
                'topics': json.loads(row['topics']) if row['topics'] else [],
                'first_user_message': row['first_user_message']
            })

        return results

    def format_results(self, results: List[Dict[str, Any]], format: str = 'text') -> str:
        """Format search results"""
        if format == 'json':
            return json.dumps(results, indent=2)

        elif format == 'markdown':
            output = [f"# Search Results ({len(results)} found)\n"]

            for i, result in enumerate(results, 1):
                timestamp = datetime.fromisoformat(result['timestamp']).strftime('%b %d, %Y %H:%M')
                similarity = f"[Similarity: {result['similarity']:.3f}] " if 'similarity' in result else ""

                output.append(f"## {i}. {similarity}{result['id']}")
                output.append(f"**Date:** {timestamp}")
                output.append(f"**Messages:** {result.get('message_count', 'N/A')}")

                if result.get('topics'):
                    output.append(f"**Topics:** {', '.join(result['topics'])}")

                all_files = (result.get('files_read', []) +
                             result.get('files_written', []) +
                             result.get('files_edited', []))
                if all_files:
                    output.append(f"**Files:** {', '.join(all_files[:5])}")
                    if len(all_files) > 5:
                        output.append(f"  _(and {len(all_files) - 5} more)_")

                if result.get('tools_used'):
                    output.append(f"**Tools:** {', '.join(result['tools_used'][:5])}")

                if result.get('first_user_message'):
                    msg = result['first_user_message'][:200]
                    output.append(f"\n**Snippet:** {msg}...")

                output.append("")

            return "\n".join(output)

        else:  # text format
            output = [f"\nFound {len(results)} conversations:\n"]

            for i, result in enumerate(results, 1):
                timestamp = datetime.fromisoformat(result['timestamp']).strftime('%b %d, %Y %H:%M')
                similarity = f"[Similarity: {result['similarity']:.3f}] " if 'similarity' in result else ""

                output.append(f"{i}. {similarity}{result['id']}")
                output.append(f"   Date: {timestamp}")
                output.append(f"   Messages: {result.get('message_count', 'N/A')}")

                if result.get('topics'):
                    output.append(f"   Topics: {', '.join(result['topics'][:3])}")

                all_files = (result.get('files_read', []) +
                             result.get('files_written', []) +
                             result.get('files_edited', []))
                if all_files:
                    output.append(f"   Files: {', '.join(all_files[:3])}")

                if result.get('first_user_message'):
                    msg = result['first_user_message'][:150].replace('\n', ' ')
                    output.append(f"   Preview: {msg}...")

                output.append("")

            return "\n".join(output)

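    # Illustrative text-format entry (values are made up; the similarity
    # prefix appears only on semantic-search hits):
    #
    #   1. [Similarity: 0.812] abc123-def456
    #      Date: Oct 05, 2025 14:30
    #      Messages: 42
    #      Topics: auth, testing, refactoring
    #      Files: src/auth.py, tests/test_auth.py
    #      Preview: how do I fix the login redirect loop...
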
    def close(self):
        """Close connections"""
        self.indexer.close()
        if self.conn:
            self.conn.close()


@click.command()
@click.argument('query', required=False)
@click.option('--db-path', type=click.Path(), default='.claude/skills/cc-insights/.processed/conversations.db',
              help='SQLite database path')
@click.option('--embeddings-dir', type=click.Path(), default='.claude/skills/cc-insights/.processed/embeddings',
              help='ChromaDB embeddings directory')
@click.option('--semantic/--keyword', default=True, help='Use semantic (RAG) or keyword search')
@click.option('--file', type=str, help='Filter by file pattern')
@click.option('--tool', type=str, help='Search by tool name')
@click.option('--date-from', type=str, help='Start date (ISO format)')
@click.option('--date-to', type=str, help='End date (ISO format)')
@click.option('--limit', default=10, help='Maximum results')
@click.option('--format', type=click.Choice(['text', 'json', 'markdown']), default='text', help='Output format')
@click.option('--verbose', is_flag=True, help='Show detailed logs')
def main(query: Optional[str], db_path: str, embeddings_dir: str, semantic: bool, file: Optional[str],
         tool: Optional[str], date_from: Optional[str], date_to: Optional[str], limit: int, format: str, verbose: bool):
"""Search Claude Code conversations
|
||||
|
||||
Examples:
|
||||
|
||||
# Semantic search
|
||||
python search-conversations.py "authentication bugs"
|
||||
|
||||
# Keyword search
|
||||
python search-conversations.py "React optimization" --keyword
|
||||
|
||||
# Filter by file
|
||||
python search-conversations.py "testing" --file "src/components"
|
||||
|
||||
# Search by tool
|
||||
python search-conversations.py --tool "Write"
|
||||
|
||||
# Date range
|
||||
python search-conversations.py "refactoring" --date-from 2025-10-01
|
||||
|
||||
# JSON output
|
||||
python search-conversations.py "deployment" --format json
|
||||
"""
|
||||
    db_path = Path(db_path)
    embeddings_dir = Path(embeddings_dir)

    if not db_path.exists():
        print(f"Error: Database not found at {db_path}")
        print("Run conversation-processor.py first")
        sys.exit(1)

    searcher = ConversationSearch(db_path, embeddings_dir, verbose=verbose)

    try:
        results = []

        if tool:
            # Search by tool
            results = searcher.search_by_tool(tool, limit=limit)

        elif query:
            # Text search. Checked before --file so that a query combined
            # with --file filters search results by file (as documented
            # above); otherwise file_pattern could never be non-None here.
            if semantic:
                results = searcher.semantic_search(
                    query,
                    limit=limit,
                    date_from=date_from,
                    date_to=date_to,
                    file_pattern=file
                )
            else:
                results = searcher.keyword_search(
                    query,
                    limit=limit,
                    date_from=date_from,
                    date_to=date_to,
                    file_pattern=file
                )

        elif file:
            # File-only search
            results = searcher.search_by_file(file, limit=limit)

        else:
            print("Error: Provide a query, --file, or --tool option")
            sys.exit(1)

        # Format and output
        output = searcher.format_results(results, format=format)
        print(output)

    finally:
        searcher.close()


if __name__ == '__main__':
    main()