#!/usr/bin/env python3
"""
Search Interface for Claude Code Insights

Provides unified search across conversations using semantic (RAG) and keyword search.
Supports filtering by dates, files, and output formatting.
"""
import sqlite3
import json
import sys
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime

import click

try:
    from rag_indexer import RAGIndexer
except ImportError:
    print("Error: Cannot import rag_indexer. Ensure it's in the same directory.")
    sys.exit(1)
class ConversationSearch:
    """Unified search interface for conversations"""

    def __init__(self, db_path: Path, embeddings_dir: Path, verbose: bool = False):
        self.db_path = db_path
        self.embeddings_dir = embeddings_dir
        self.verbose = verbose
        # Initialize RAG indexer for semantic search
        self.indexer = RAGIndexer(db_path, embeddings_dir, verbose=verbose)
        # Separate SQLite connection for metadata queries
        self.conn = sqlite3.connect(str(db_path))
        self.conn.row_factory = sqlite3.Row

    def _log(self, message: str):
        """Log if verbose mode is enabled"""
        if self.verbose:
            print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")

    def _get_conversation_details(self, conversation_id: str) -> Optional[Dict[str, Any]]:
        """Get full conversation details from SQLite"""
        cursor = self.conn.execute("""
            SELECT * FROM conversations WHERE id = ?
        """, (conversation_id,))
        row = cursor.fetchone()
        if not row:
            return None
        # List-valued columns (files_*, tools_used, topics) are stored as JSON
        # text in SQLite and decoded into Python lists here.
        return {
            'id': row['id'],
            'timestamp': row['timestamp'],
            'message_count': row['message_count'],
            'user_messages': row['user_messages'],
            'assistant_messages': row['assistant_messages'],
            'files_read': json.loads(row['files_read']) if row['files_read'] else [],
            'files_written': json.loads(row['files_written']) if row['files_written'] else [],
            'files_edited': json.loads(row['files_edited']) if row['files_edited'] else [],
            'tools_used': json.loads(row['tools_used']) if row['tools_used'] else [],
            'topics': json.loads(row['topics']) if row['topics'] else [],
            'first_user_message': row['first_user_message'],
            'last_assistant_message': row['last_assistant_message']
        }
    def semantic_search(
        self,
        query: str,
        limit: int = 10,
        date_from: Optional[str] = None,
        date_to: Optional[str] = None,
        file_pattern: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """Perform RAG-based semantic search"""
        self._log(f"Semantic search: '{query}'")
        # TODO: Add ChromaDB filters for dates/files when supported
        results = self.indexer.search(query, n_results=limit * 2)  # Get extra for filtering
        # Enrich with full conversation details
        enriched_results = []
        for result in results:
            details = self._get_conversation_details(result['id'])
            if details:
                # Apply post-search filters
                if date_from and details['timestamp'] < date_from:
                    continue
                if date_to and details['timestamp'] > date_to:
                    continue
                if file_pattern:
                    all_files = details['files_read'] + details['files_written'] + details['files_edited']
                    if not any(file_pattern in f for f in all_files):
                        continue
                enriched_results.append({
                    **result,
                    **details
                })
                if len(enriched_results) >= limit:
                    break
        return enriched_results
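
    # Illustrative sketch (comment-only, not executed): a semantic query
    # restricted to one month and to conversations touching a path substring.
    # Assumes the database and embeddings were already built by
    # conversation-processor.py; the query string and filter values below are
    # hypothetical.
    #
    #   hits = searcher.semantic_search(
    #       "flaky test failures",
    #       limit=5,
    #       date_from="2025-10-01",
    #       date_to="2025-10-31",
    #       file_pattern="src/components",
    #   )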
    def keyword_search(
        self,
        query: str,
        limit: int = 10,
        date_from: Optional[str] = None,
        date_to: Optional[str] = None,
        file_pattern: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """Perform SQL-based keyword search"""
        self._log(f"Keyword search: '{query}'")
        # Build SQL query
        conditions = [
            "(first_user_message LIKE ? OR last_assistant_message LIKE ? OR topics LIKE ?)"
        ]
        params = [f"%{query}%", f"%{query}%", f"%{query}%"]
        if date_from:
            conditions.append("timestamp >= ?")
            params.append(date_from)
        if date_to:
            conditions.append("timestamp <= ?")
            params.append(date_to)
        if file_pattern:
            conditions.append(
                "(files_read LIKE ? OR files_written LIKE ? OR files_edited LIKE ?)"
            )
            params.extend([f"%{file_pattern}%"] * 3)
        where_clause = " AND ".join(conditions)
        cursor = self.conn.execute(f"""
            SELECT * FROM conversations
            WHERE {where_clause}
            ORDER BY timestamp DESC
            LIMIT ?
        """, params + [limit])
        results = []
        for row in cursor.fetchall():
            results.append({
                'id': row['id'],
                'timestamp': row['timestamp'],
                'message_count': row['message_count'],
                'user_messages': row['user_messages'],
                'assistant_messages': row['assistant_messages'],
                'files_read': json.loads(row['files_read']) if row['files_read'] else [],
                'files_written': json.loads(row['files_written']) if row['files_written'] else [],
                'files_edited': json.loads(row['files_edited']) if row['files_edited'] else [],
                'tools_used': json.loads(row['tools_used']) if row['tools_used'] else [],
                'topics': json.loads(row['topics']) if row['topics'] else [],
                'first_user_message': row['first_user_message'],
                'last_assistant_message': row['last_assistant_message']
            })
        return results
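
    # For reference, a call like
    #   keyword_search("deploy", date_from="2025-10-01", file_pattern="ci/")
    # assembles roughly the following statement (all values are bound as
    # parameters, never interpolated into the SQL):
    #
    #   SELECT * FROM conversations
    #   WHERE (first_user_message LIKE ? OR last_assistant_message LIKE ? OR topics LIKE ?)
    #     AND timestamp >= ?
    #     AND (files_read LIKE ? OR files_written LIKE ? OR files_edited LIKE ?)
    #   ORDER BY timestamp DESC
    #   LIMIT ?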
    def search_by_file(self, file_pattern: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Find all conversations that touched specific files"""
        self._log(f"File search: '{file_pattern}'")
        cursor = self.conn.execute("""
            SELECT DISTINCT c.*
            FROM conversations c
            JOIN file_interactions fi ON c.id = fi.conversation_id
            WHERE fi.file_path LIKE ?
            ORDER BY c.timestamp DESC
            LIMIT ?
        """, (f"%{file_pattern}%", limit))
        results = []
        for row in cursor.fetchall():
            results.append({
                'id': row['id'],
                'timestamp': row['timestamp'],
                'message_count': row['message_count'],
                'files_read': json.loads(row['files_read']) if row['files_read'] else [],
                'files_written': json.loads(row['files_written']) if row['files_written'] else [],
                'files_edited': json.loads(row['files_edited']) if row['files_edited'] else [],
                'tools_used': json.loads(row['tools_used']) if row['tools_used'] else [],
                'topics': json.loads(row['topics']) if row['topics'] else [],
                'first_user_message': row['first_user_message']
            })
        return results
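
    # Sketch (hypothetical pattern value): recent conversations that touched
    # any path containing a substring, served from the file_interactions join
    # table populated by conversation-processor.py.
    #
    #   recent = searcher.search_by_file("src/components", limit=5)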
    def search_by_tool(self, tool_name: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Find conversations using specific tools"""
        self._log(f"Tool search: '{tool_name}'")
        cursor = self.conn.execute("""
            SELECT DISTINCT c.*
            FROM conversations c
            JOIN tool_usage tu ON c.id = tu.conversation_id
            WHERE tu.tool_name LIKE ?
            ORDER BY c.timestamp DESC
            LIMIT ?
        """, (f"%{tool_name}%", limit))
        results = []
        for row in cursor.fetchall():
            results.append({
                'id': row['id'],
                'timestamp': row['timestamp'],
                'message_count': row['message_count'],
                'tools_used': json.loads(row['tools_used']) if row['tools_used'] else [],
                'topics': json.loads(row['topics']) if row['topics'] else [],
                'first_user_message': row['first_user_message']
            })
        return results
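
    # Sketch: conversations in which a given tool was invoked, served from the
    # tool_usage join table. "Write" mirrors the CLI example further below.
    #
    #   writes = searcher.search_by_tool("Write", limit=10)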
    def format_results(self, results: List[Dict[str, Any]], format: str = 'text') -> str:
        """Format search results"""
        if format == 'json':
            return json.dumps(results, indent=2)
        elif format == 'markdown':
            output = [f"# Search Results ({len(results)} found)\n"]
            for i, result in enumerate(results, 1):
                timestamp = datetime.fromisoformat(result['timestamp']).strftime('%b %d, %Y %H:%M')
                similarity = f"[Similarity: {result['similarity']:.3f}] " if 'similarity' in result else ""
                output.append(f"## {i}. {similarity}{result['id']}")
                output.append(f"**Date:** {timestamp}")
                output.append(f"**Messages:** {result.get('message_count', 'N/A')}")
                if result.get('topics'):
                    output.append(f"**Topics:** {', '.join(result['topics'])}")
                all_files = (result.get('files_read', []) +
                             result.get('files_written', []) +
                             result.get('files_edited', []))
                if all_files:
                    output.append(f"**Files:** {', '.join(all_files[:5])}")
                    if len(all_files) > 5:
                        output.append(f" _(and {len(all_files) - 5} more)_")
                if result.get('tools_used'):
                    output.append(f"**Tools:** {', '.join(result['tools_used'][:5])}")
                if result.get('first_user_message'):
                    msg = result['first_user_message'][:200]
                    output.append(f"\n**Snippet:** {msg}...")
                output.append("")
            return "\n".join(output)
        else:  # text format
            output = [f"\nFound {len(results)} conversations:\n"]
            for i, result in enumerate(results, 1):
                timestamp = datetime.fromisoformat(result['timestamp']).strftime('%b %d, %Y %H:%M')
                similarity = f"[Similarity: {result['similarity']:.3f}] " if 'similarity' in result else ""
                output.append(f"{i}. {similarity}{result['id']}")
                output.append(f" Date: {timestamp}")
                output.append(f" Messages: {result.get('message_count', 'N/A')}")
                if result.get('topics'):
                    output.append(f" Topics: {', '.join(result['topics'][:3])}")
                all_files = (result.get('files_read', []) +
                             result.get('files_written', []) +
                             result.get('files_edited', []))
                if all_files:
                    output.append(f" Files: {', '.join(all_files[:3])}")
                if result.get('first_user_message'):
                    msg = result['first_user_message'][:150].replace('\n', ' ')
                    output.append(f" Preview: {msg}...")
                output.append("")
            return "\n".join(output)
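
    # Shape of the default text output for one semantic hit (all field values
    # below are hypothetical):
    #
    #   1. [Similarity: 0.812] abc123
    #    Date: Oct 14, 2025 09:30
    #    Messages: 42
    #    Topics: auth, testing, refactor
    #    Files: src/auth.py, tests/test_auth.py
    #    Preview: Can you help me debug the login flow...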
    def close(self):
        """Close connections"""
        self.indexer.close()
        if self.conn:
            self.conn.close()

@click.command()
@click.argument('query', required=False)
@click.option('--db-path', type=click.Path(), default='.claude/skills/cc-insights/.processed/conversations.db',
              help='SQLite database path')
@click.option('--embeddings-dir', type=click.Path(), default='.claude/skills/cc-insights/.processed/embeddings',
              help='ChromaDB embeddings directory')
@click.option('--semantic/--keyword', default=True, help='Use semantic (RAG) or keyword search')
@click.option('--file', type=str, help='Filter by file pattern')
@click.option('--tool', type=str, help='Search by tool name')
@click.option('--date-from', type=str, help='Start date (ISO format)')
@click.option('--date-to', type=str, help='End date (ISO format)')
@click.option('--limit', default=10, help='Maximum results')
@click.option('--format', type=click.Choice(['text', 'json', 'markdown']), default='text', help='Output format')
@click.option('--verbose', is_flag=True, help='Show detailed logs')
def main(query: Optional[str], db_path: str, embeddings_dir: str, semantic: bool, file: Optional[str],
         tool: Optional[str], date_from: Optional[str], date_to: Optional[str], limit: int, format: str, verbose: bool):
    """Search Claude Code conversations

    Examples:

        # Semantic search
        python search-conversations.py "authentication bugs"

        # Keyword search
        python search-conversations.py "React optimization" --keyword

        # Filter by file
        python search-conversations.py "testing" --file "src/components"

        # Search by tool
        python search-conversations.py --tool "Write"

        # Date range
        python search-conversations.py "refactoring" --date-from 2025-10-01

        # JSON output
        python search-conversations.py "deployment" --format json
    """
    db_path = Path(db_path)
    embeddings_dir = Path(embeddings_dir)
    if not db_path.exists():
        print(f"Error: Database not found at {db_path}")
        print("Run conversation-processor.py first")
        sys.exit(1)
    searcher = ConversationSearch(db_path, embeddings_dir, verbose=verbose)
    try:
        results = []
        if tool:
            # Search by tool
            results = searcher.search_by_tool(tool, limit=limit)
        elif query:
            # Text search; checked before --file so that a query combined with
            # --file runs a filtered text search (see Examples above) instead
            # of silently ignoring the query
            if semantic:
                results = searcher.semantic_search(
                    query,
                    limit=limit,
                    date_from=date_from,
                    date_to=date_to,
                    file_pattern=file
                )
            else:
                results = searcher.keyword_search(
                    query,
                    limit=limit,
                    date_from=date_from,
                    date_to=date_to,
                    file_pattern=file
                )
        elif file:
            # Search by file alone
            results = searcher.search_by_file(file, limit=limit)
        else:
            print("Error: Provide a query, --file, or --tool option")
            sys.exit(1)
        # Format and output
        output = searcher.format_results(results, format=format)
        print(output)
    finally:
        searcher.close()

if __name__ == '__main__':
    main()
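
# Programmatic usage sketch (comment-only; the paths are the CLI defaults
# above and must point at artifacts produced by conversation-processor.py):
#
#   from pathlib import Path
#   searcher = ConversationSearch(
#       Path(".claude/skills/cc-insights/.processed/conversations.db"),
#       Path(".claude/skills/cc-insights/.processed/embeddings"),
#   )
#   try:
#       hits = searcher.semantic_search("authentication bugs", limit=5)
#       print(searcher.format_results(hits, format="markdown"))
#   finally:
#       searcher.close()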