#!/usr/bin/env python3
"""
Conversation Processor for Claude Code Insights

Parses JSONL conversation files from ~/.claude/projects/, extracts metadata,
and stores it in SQLite for fast querying. Supports incremental processing.
"""
import json
import re
import sqlite3
import hashlib
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any
from dataclasses import dataclass

import click
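
# Input-format note (a sketch, not an authoritative schema): the parsing below
# relies only on the fields this script actually reads: a record "type" of
# "user" or "assistant", and a "message.content" that is either a plain string
# or a list of content blocks ("text" blocks carrying a "text" field,
# "tool_use" blocks carrying a "name" field). A minimal record of that shape:
#
#   {"type": "assistant",
#    "message": {"content": [
#        {"type": "text", "text": "Let me check the config."},
#        {"type": "tool_use", "name": "Read"}
#    ]}}
#
# Real records in ~/.claude/projects/ carry additional fields that are ignored here.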


@dataclass
class ConversationMetadata:
    """Structured conversation metadata"""
    id: str
    project_path: str
    timestamp: datetime
    message_count: int
    user_messages: int
    assistant_messages: int
    files_read: List[str]
    files_written: List[str]
    files_edited: List[str]
    tools_used: List[str]
    topics: List[str]
    first_user_message: str
    last_assistant_message: str
    conversation_hash: str
    file_size_bytes: int
    processed_at: datetime


class ConversationProcessor:
    """Processes Claude Code conversation JSONL files"""

    def __init__(self, db_path: Path, verbose: bool = False):
        self.db_path = db_path
        self.verbose = verbose
        self.conn = None
        self._init_database()

    def _init_database(self):
        """Initialize SQLite database with schema"""
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self.conn = sqlite3.connect(str(self.db_path))
        self.conn.row_factory = sqlite3.Row
        # Create tables
        self.conn.executescript("""
            CREATE TABLE IF NOT EXISTS conversations (
                id TEXT PRIMARY KEY,
                project_path TEXT NOT NULL,
                timestamp TEXT NOT NULL,
                message_count INTEGER NOT NULL,
                user_messages INTEGER NOT NULL,
                assistant_messages INTEGER NOT NULL,
                files_read TEXT,      -- JSON array
                files_written TEXT,   -- JSON array
                files_edited TEXT,    -- JSON array
                tools_used TEXT,      -- JSON array
                topics TEXT,          -- JSON array
                first_user_message TEXT,
                last_assistant_message TEXT,
                conversation_hash TEXT UNIQUE NOT NULL,
                file_size_bytes INTEGER NOT NULL,
                processed_at TEXT NOT NULL
            );

            CREATE INDEX IF NOT EXISTS idx_timestamp ON conversations(timestamp);
            CREATE INDEX IF NOT EXISTS idx_project ON conversations(project_path);
            CREATE INDEX IF NOT EXISTS idx_processed ON conversations(processed_at);

            CREATE TABLE IF NOT EXISTS file_interactions (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                conversation_id TEXT NOT NULL,
                file_path TEXT NOT NULL,
                interaction_type TEXT NOT NULL,  -- read, write, edit
                FOREIGN KEY (conversation_id) REFERENCES conversations(id)
            );

            CREATE INDEX IF NOT EXISTS idx_file_path ON file_interactions(file_path);
            CREATE INDEX IF NOT EXISTS idx_conversation ON file_interactions(conversation_id);

            CREATE TABLE IF NOT EXISTS tool_usage (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                conversation_id TEXT NOT NULL,
                tool_name TEXT NOT NULL,
                usage_count INTEGER NOT NULL,
                FOREIGN KEY (conversation_id) REFERENCES conversations(id)
            );

            CREATE INDEX IF NOT EXISTS idx_tool_name ON tool_usage(tool_name);

            CREATE TABLE IF NOT EXISTS processing_state (
                file_path TEXT PRIMARY KEY,
                last_modified TEXT NOT NULL,
                last_processed TEXT NOT NULL,
                file_hash TEXT NOT NULL
            );
        """)
        self.conn.commit()
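
    # Example (a sketch, not executed by this script): with the schema above,
    # "which conversations touched a given file" is a single join, e.g.:
    #
    #   SELECT c.id, c.timestamp, fi.interaction_type
    #   FROM conversations c
    #   JOIN file_interactions fi ON fi.conversation_id = c.id
    #   WHERE fi.file_path LIKE '%processor%'
    #   ORDER BY c.timestamp DESC;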

    def _log(self, message: str):
        """Log if verbose mode is enabled"""
        if self.verbose:
            print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")

    def _compute_file_hash(self, file_path: Path) -> str:
        """Compute SHA256 hash of file for change detection"""
        sha256 = hashlib.sha256()
        with open(file_path, 'rb') as f:
            for chunk in iter(lambda: f.read(8192), b''):
                sha256.update(chunk)
        return sha256.hexdigest()

    def _needs_processing(self, file_path: Path, reindex: bool = False) -> bool:
        """Check if file needs (re)processing"""
        if reindex:
            return True
        file_hash = self._compute_file_hash(file_path)
        cursor = self.conn.execute(
            "SELECT last_modified, file_hash FROM processing_state WHERE file_path = ?",
            (str(file_path),)
        )
        row = cursor.fetchone()
        if not row:
            return True  # Never processed
        return row['file_hash'] != file_hash  # File changed
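
    # Incremental processing: _needs_processing() pairs with
    # _update_processing_state() below; a file is reprocessed only when its
    # SHA-256 hash differs from the one recorded in processing_state, or when
    # --reindex is passed.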

    def _update_processing_state(self, file_path: Path):
        """Update processing state for file"""
        file_hash = self._compute_file_hash(file_path)
        last_modified = datetime.fromtimestamp(file_path.stat().st_mtime).isoformat()
        self.conn.execute("""
            INSERT OR REPLACE INTO processing_state (file_path, last_modified, last_processed, file_hash)
            VALUES (?, ?, ?, ?)
        """, (str(file_path), last_modified, datetime.now().isoformat(), file_hash))

    def _parse_jsonl_file(self, file_path: Path) -> List[Dict[str, Any]]:
        """Parse a JSONL conversation file into a list of message records"""
        messages = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                try:
                    if line.strip():
                        data = json.loads(line)
                        messages.append(data)
                except json.JSONDecodeError as e:
                    self._log(f"Warning: Failed to parse line {line_num} in {file_path.name}: {e}")
        return messages

    def _extract_tool_uses(self, content: str) -> List[str]:
        """Extract tool names from assistant messages"""
        tools = []
        # Look for tool use patterns in content
        tool_patterns = [
            r'"name":\s*"([A-Z][a-zA-Z]+)"',   # JSON tool calls
            r'<tool>([A-Z][a-zA-Z]+)</tool>',  # XML tool calls
        ]
        for pattern in tool_patterns:
            matches = re.findall(pattern, content)
            tools.extend(matches)
        return list(set(tools))  # Unique tools
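
    # For reference (illustrative examples only): these patterns match snippets
    # such as '"name": "Bash"' in serialized tool-call JSON or
    # '<tool>Read</tool>' in XML-style output; names that do not start with an
    # uppercase letter are ignored by design.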

    def _extract_file_paths(self, content: str) -> Dict[str, List[str]]:
        """Extract file paths and their interaction types from content"""
        files = {
            'read': [],
            'written': [],
            'edited': []
        }
        # Patterns for file operations
        read_patterns = [
            r'Reading\s+(.+\.(?:py|js|ts|tsx|jsx|md|json|yaml|yml))',
            r'Read\s+file:\s*(.+)',
            r'"file_path":\s*"([^"]+)"',  # Tool parameters
        ]
        write_patterns = [
            r'Writing\s+(.+\.(?:py|js|ts|tsx|jsx|md|json|yaml|yml))',
            r'Created\s+file:\s*(.+)',
            r'Write\s+(.+)',
        ]
        edit_patterns = [
            r'Editing\s+(.+\.(?:py|js|ts|tsx|jsx|md|json|yaml|yml))',
            r'Modified\s+file:\s*(.+)',
            r'Edit\s+(.+)',
        ]
        for pattern in read_patterns:
            files['read'].extend(re.findall(pattern, content, re.IGNORECASE))
        for pattern in write_patterns:
            files['written'].extend(re.findall(pattern, content, re.IGNORECASE))
        for pattern in edit_patterns:
            files['edited'].extend(re.findall(pattern, content, re.IGNORECASE))
        # Deduplicate and clean
        for key in files:
            files[key] = list(set(path.strip() for path in files[key]))
        return files

    def _extract_topics(self, messages: List[Dict[str, Any]]) -> List[str]:
        """Extract topic keywords from conversation"""
        # Combine first user message and some assistant responses
        text = ""
        user_count = 0
        for msg in messages:
            msg_type = msg.get('type', '')
            # Handle event-stream format
            if msg_type == 'user':
                message_dict = msg.get('message', {})
                content = message_dict.get('content', '') if isinstance(message_dict, dict) else ''
                # Handle content that's a list (content blocks)
                if isinstance(content, list):
                    message_content = ' '.join(
                        block.get('text', '') if isinstance(block, dict) and block.get('type') == 'text' else ''
                        for block in content
                    )
                else:
                    message_content = content
                if message_content:
                    text += message_content + " "
                    user_count += 1
                    if user_count >= 3:  # Only use first few user messages
                        break
            elif msg_type == 'assistant' and user_count < 3:
                # Also include some assistant responses for context
                message_dict = msg.get('message', {})
                content = message_dict.get('content', '') if isinstance(message_dict, dict) else ''
                # Handle content that's a list (content blocks)
                if isinstance(content, list):
                    message_content = ' '.join(
                        block.get('text', '') if isinstance(block, dict) and block.get('type') == 'text' else ''
                        for block in content
                    )
                else:
                    message_content = content
                if message_content:
                    text += message_content[:200] + " "  # Just a snippet
        # Extract common programming keywords
        keywords = []
        common_topics = [
            'authentication', 'auth', 'login', 'jwt', 'oauth',
            'testing', 'test', 'unit test', 'integration test',
            'bug', 'fix', 'error', 'issue', 'debug',
            'performance', 'optimization', 'optimize', 'slow',
            'refactor', 'refactoring', 'cleanup',
            'feature', 'implement', 'add', 'create',
            'database', 'sql', 'query', 'schema',
            'api', 'endpoint', 'rest', 'graphql',
            'typescript', 'javascript', 'react', 'node',
            'css', 'style', 'styling', 'tailwind',
            'security', 'vulnerability', 'xss', 'csrf',
            'deploy', 'deployment', 'ci/cd', 'docker',
        ]
        text_lower = text.lower()
        for topic in common_topics:
            if topic in text_lower:
                keywords.append(topic)
        return list(set(keywords))[:10]  # Max 10 topics
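
    # For example (illustrative): a conversation opening with "Fix the login
    # bug in the auth API" would pick up keywords such as 'fix', 'login',
    # 'bug', 'auth', and 'api' via the substring match above.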

    def _process_conversation(self, file_path: Path, messages: List[Dict[str, Any]]) -> ConversationMetadata:
        """Extract metadata from parsed conversation"""
        # Generate conversation ID from filename
        conv_id = file_path.stem
        # Count messages by role
        user_messages = 0
        assistant_messages = 0
        first_user_msg = ""
        last_assistant_msg = ""
        all_tools = []
        all_files = {'read': [], 'written': [], 'edited': []}
        for msg in messages:
            msg_type = msg.get('type', '')
            # Handle event-stream format
            if msg_type == 'user':
                user_messages += 1
                message_dict = msg.get('message', {})
                content = message_dict.get('content', '') if isinstance(message_dict, dict) else ''
                # Handle content that's a list (content blocks)
                if isinstance(content, list):
                    message_content = ' '.join(
                        block.get('text', '') if isinstance(block, dict) and block.get('type') == 'text' else ''
                        for block in content
                    )
                else:
                    message_content = content
                if not first_user_msg and message_content:
                    first_user_msg = message_content[:500]  # First 500 chars
            elif msg_type == 'assistant':
                assistant_messages += 1
                message_dict = msg.get('message', {})
                content = message_dict.get('content', '') if isinstance(message_dict, dict) else ''
                # Handle content that's a list (content blocks)
                if isinstance(content, list):
                    message_content = ' '.join(
                        block.get('text', '') if isinstance(block, dict) and block.get('type') == 'text' else ''
                        for block in content
                    )
                    # Also extract tools from content blocks
                    for block in content:
                        if isinstance(block, dict) and block.get('type') == 'tool_use':
                            tool_name = block.get('name', '')
                            if tool_name:
                                all_tools.append(tool_name)
                else:
                    message_content = content
                if message_content:
                    last_assistant_msg = message_content[:500]
                    # Extract tools and files from assistant messages
                    tools = self._extract_tool_uses(message_content)
                    all_tools.extend(tools)
                    files = self._extract_file_paths(message_content)
                    for key in all_files:
                        all_files[key].extend(files[key])
        # Deduplicate
        all_tools = list(set(all_tools))
        for key in all_files:
            all_files[key] = list(set(all_files[key]))
        # Extract topics
        topics = self._extract_topics(messages)
        # File stats
        file_stat = file_path.stat()
        # Compute conversation hash
        conv_hash = self._compute_file_hash(file_path)
        # Extract timestamp (from file mtime)
        try:
            # Try to get timestamp from file modification time
            timestamp = datetime.fromtimestamp(file_stat.st_mtime)
        except Exception:
            timestamp = datetime.now()
        return ConversationMetadata(
            id=conv_id,
            project_path=str(file_path.parent),
            timestamp=timestamp,
            message_count=len(messages),
            user_messages=user_messages,
            assistant_messages=assistant_messages,
            files_read=all_files['read'],
            files_written=all_files['written'],
            files_edited=all_files['edited'],
            tools_used=all_tools,
            topics=topics,
            first_user_message=first_user_msg,
            last_assistant_message=last_assistant_msg,
            conversation_hash=conv_hash,
            file_size_bytes=file_stat.st_size,
            processed_at=datetime.now()
        )

    def _store_conversation(self, metadata: ConversationMetadata):
        """Store conversation metadata in database"""
        # Store main conversation record
        self.conn.execute("""
            INSERT OR REPLACE INTO conversations
            (id, project_path, timestamp, message_count, user_messages, assistant_messages,
             files_read, files_written, files_edited, tools_used, topics,
             first_user_message, last_assistant_message, conversation_hash,
             file_size_bytes, processed_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (
            metadata.id,
            metadata.project_path,
            metadata.timestamp.isoformat(),
            metadata.message_count,
            metadata.user_messages,
            metadata.assistant_messages,
            json.dumps(metadata.files_read),
            json.dumps(metadata.files_written),
            json.dumps(metadata.files_edited),
            json.dumps(metadata.tools_used),
            json.dumps(metadata.topics),
            metadata.first_user_message,
            metadata.last_assistant_message,
            metadata.conversation_hash,
            metadata.file_size_bytes,
            metadata.processed_at.isoformat()
        ))
        # Store file interactions
        self.conn.execute(
            "DELETE FROM file_interactions WHERE conversation_id = ?",
            (metadata.id,)
        )
        for file_path in metadata.files_read:
            self.conn.execute(
                "INSERT INTO file_interactions (conversation_id, file_path, interaction_type) VALUES (?, ?, ?)",
                (metadata.id, file_path, 'read')
            )
        for file_path in metadata.files_written:
            self.conn.execute(
                "INSERT INTO file_interactions (conversation_id, file_path, interaction_type) VALUES (?, ?, ?)",
                (metadata.id, file_path, 'write')
            )
        for file_path in metadata.files_edited:
            self.conn.execute(
                "INSERT INTO file_interactions (conversation_id, file_path, interaction_type) VALUES (?, ?, ?)",
                (metadata.id, file_path, 'edit')
            )
        # Store tool usage
        self.conn.execute(
            "DELETE FROM tool_usage WHERE conversation_id = ?",
            (metadata.id,)
        )
        for tool_name in metadata.tools_used:
            self.conn.execute(
                "INSERT INTO tool_usage (conversation_id, tool_name, usage_count) VALUES (?, ?, ?)",
                (metadata.id, tool_name, 1)
            )
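
    # Note: INSERT OR REPLACE on conversations plus delete-then-insert on the
    # child tables makes storage idempotent; reprocessing a conversation
    # (e.g. with --reindex) overwrites its rows instead of duplicating them.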

    def process_file(self, file_path: Path, reindex: bool = False) -> bool:
        """Process a single conversation file"""
        if not self._needs_processing(file_path, reindex):
            self._log(f"Skipping {file_path.name} (already processed)")
            return False
        self._log(f"Processing {file_path.name}...")
        try:
            # Parse JSONL
            messages = self._parse_jsonl_file(file_path)
            if not messages:
                self._log(f"Warning: No messages found in {file_path.name}")
                return False
            # Extract metadata
            metadata = self._process_conversation(file_path, messages)
            # Store in database
            self._store_conversation(metadata)
            # Update processing state
            self._update_processing_state(file_path)
            self.conn.commit()
            self._log(f"✓ Processed {file_path.name}: {metadata.message_count} messages, "
                      f"{metadata.user_messages} user, {metadata.assistant_messages} assistant")
            return True
        except Exception as e:
            self._log(f"Error processing {file_path.name}: {e}")
            import traceback
            if self.verbose:
                traceback.print_exc()
            return False

    def process_project(self, project_name: str, reindex: bool = False) -> int:
        """Process all conversations for a project"""
        # Find conversation files
        claude_projects = Path.home() / ".claude" / "projects"
        if not claude_projects.exists():
            self._log(f"Error: {claude_projects} does not exist")
            return 0
        # Find project directory (may be encoded)
        project_dirs = list(claude_projects.glob(f"*{project_name}*"))
        if not project_dirs:
            self._log(f"Error: No project directory found matching '{project_name}'")
            return 0
        if len(project_dirs) > 1:
            self._log(f"Warning: Multiple project directories found, using {project_dirs[0].name}")
        project_dir = project_dirs[0]
        self._log(f"Processing conversations from {project_dir}")
        # Find all JSONL files
        jsonl_files = list(project_dir.glob("*.jsonl"))
        if not jsonl_files:
            self._log(f"No conversation files found in {project_dir}")
            return 0
        self._log(f"Found {len(jsonl_files)} conversation files")
        # Process each file
        processed_count = 0
        for jsonl_file in jsonl_files:
            if self.process_file(jsonl_file, reindex):
                processed_count += 1
        self._log(f"\nProcessed {processed_count}/{len(jsonl_files)} conversations")
        return processed_count

    def get_stats(self) -> Dict[str, Any]:
        """Get processing statistics"""
        cursor = self.conn.execute("""
            SELECT
                COUNT(*) as total_conversations,
                SUM(message_count) as total_messages,
                SUM(user_messages) as total_user_messages,
                SUM(assistant_messages) as total_assistant_messages,
                MIN(timestamp) as earliest_conversation,
                MAX(timestamp) as latest_conversation
            FROM conversations
        """)
        row = cursor.fetchone()
        stats = {
            'total_conversations': row['total_conversations'],
            'total_messages': row['total_messages'],
            'total_user_messages': row['total_user_messages'],
            'total_assistant_messages': row['total_assistant_messages'],
            'earliest_conversation': row['earliest_conversation'],
            'latest_conversation': row['latest_conversation']
        }
        # Top files
        cursor = self.conn.execute("""
            SELECT file_path, COUNT(*) as interaction_count
            FROM file_interactions
            GROUP BY file_path
            ORDER BY interaction_count DESC
            LIMIT 10
        """)
        stats['top_files'] = [
            {'file': row['file_path'], 'count': row['interaction_count']}
            for row in cursor.fetchall()
        ]
        # Top tools
        cursor = self.conn.execute("""
            SELECT tool_name, SUM(usage_count) as total_usage
            FROM tool_usage
            GROUP BY tool_name
            ORDER BY total_usage DESC
            LIMIT 10
        """)
        stats['top_tools'] = [
            {'tool': row['tool_name'], 'count': row['total_usage']}
            for row in cursor.fetchall()
        ]
        return stats

    def close(self):
        """Close database connection"""
        if self.conn:
            self.conn.close()


@click.command()
@click.option('--project-name', default='annex', help='Project name to process')
@click.option('--db-path', type=click.Path(), default='.claude/skills/cc-insights/.processed/conversations.db',
              help='SQLite database path')
@click.option('--reindex', is_flag=True, help='Reprocess all conversations (ignore cache)')
@click.option('--verbose', is_flag=True, help='Show detailed processing logs')
@click.option('--stats', is_flag=True, help='Show statistics after processing')
def main(project_name: str, db_path: str, reindex: bool, verbose: bool, stats: bool):
    """Process Claude Code conversations and store metadata"""
    db_path = Path(db_path)
    processor = ConversationProcessor(db_path, verbose=verbose)
    try:
        # Process conversations
        count = processor.process_project(project_name, reindex=reindex)
        print(f"\n✓ Processed {count} conversations")
        if stats:
            print("\n=== Statistics ===")
            stats_data = processor.get_stats()
            print(f"Total conversations: {stats_data['total_conversations']}")
            print(f"Total messages: {stats_data['total_messages']}")
            print(f"User messages: {stats_data['total_user_messages']}")
            print(f"Assistant messages: {stats_data['total_assistant_messages']}")
            print(f"Date range: {stats_data['earliest_conversation']} to {stats_data['latest_conversation']}")
            print("\nTop 10 Files:")
            for item in stats_data['top_files']:
                print(f"  {item['file']}: {item['count']} interactions")
            print("\nTop 10 Tools:")
            for item in stats_data['top_tools']:
                print(f"  {item['tool']}: {item['count']} uses")
    finally:
        processor.close()


if __name__ == '__main__':
    main()
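
# Example invocations (a sketch; adjust --project-name and --db-path to your setup):
#
#   python conversation-processor.py --project-name annex --verbose
#   python conversation-processor.py --project-name annex --reindex --stats
#
# Results are written to the SQLite database given by --db-path
# (.claude/skills/cc-insights/.processed/conversations.db by default).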