Initial commit
skills/cc-insights/scripts/conversation-processor.py (executable file, 634 lines added)
@@ -0,0 +1,634 @@
#!/usr/bin/env python3
"""
Conversation Processor for Claude Code Insights

Parses JSONL conversation files from ~/.claude/projects/, extracts metadata,
and stores it in SQLite for fast querying. Supports incremental processing.
"""

import json
import sqlite3
import hashlib
import re
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any
from dataclasses import dataclass

import click
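
# Example invocations (illustrative; the flags are defined on main() below):
#   ./conversation-processor.py --project-name annex --verbose --stats
#   ./conversation-processor.py --reindex --db-path /tmp/conversations.db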


@dataclass
class ConversationMetadata:
    """Structured conversation metadata"""
    id: str
    project_path: str
    timestamp: datetime
    message_count: int
    user_messages: int
    assistant_messages: int
    files_read: List[str]
    files_written: List[str]
    files_edited: List[str]
    tools_used: List[str]
    topics: List[str]
    first_user_message: str
    last_assistant_message: str
    conversation_hash: str
    file_size_bytes: int
    processed_at: datetime


class ConversationProcessor:
    """Processes Claude Code conversation JSONL files"""

    def __init__(self, db_path: Path, verbose: bool = False):
        self.db_path = db_path
        self.verbose = verbose
        self.conn = None
        self._init_database()

    def _init_database(self):
        """Initialize the SQLite database and schema"""
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self.conn = sqlite3.connect(str(self.db_path))
        self.conn.row_factory = sqlite3.Row

        # Create tables
        self.conn.executescript("""
            CREATE TABLE IF NOT EXISTS conversations (
                id TEXT PRIMARY KEY,
                project_path TEXT NOT NULL,
                timestamp TEXT NOT NULL,
                message_count INTEGER NOT NULL,
                user_messages INTEGER NOT NULL,
                assistant_messages INTEGER NOT NULL,
                files_read TEXT,           -- JSON array
                files_written TEXT,        -- JSON array
                files_edited TEXT,         -- JSON array
                tools_used TEXT,           -- JSON array
                topics TEXT,               -- JSON array
                first_user_message TEXT,
                last_assistant_message TEXT,
                conversation_hash TEXT UNIQUE NOT NULL,
                file_size_bytes INTEGER NOT NULL,
                processed_at TEXT NOT NULL
            );

            CREATE INDEX IF NOT EXISTS idx_timestamp ON conversations(timestamp);
            CREATE INDEX IF NOT EXISTS idx_project ON conversations(project_path);
            CREATE INDEX IF NOT EXISTS idx_processed ON conversations(processed_at);

            CREATE TABLE IF NOT EXISTS file_interactions (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                conversation_id TEXT NOT NULL,
                file_path TEXT NOT NULL,
                interaction_type TEXT NOT NULL,  -- read, write, edit
                FOREIGN KEY (conversation_id) REFERENCES conversations(id)
            );

            CREATE INDEX IF NOT EXISTS idx_file_path ON file_interactions(file_path);
            CREATE INDEX IF NOT EXISTS idx_conversation ON file_interactions(conversation_id);

            CREATE TABLE IF NOT EXISTS tool_usage (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                conversation_id TEXT NOT NULL,
                tool_name TEXT NOT NULL,
                usage_count INTEGER NOT NULL,
                FOREIGN KEY (conversation_id) REFERENCES conversations(id)
            );

            CREATE INDEX IF NOT EXISTS idx_tool_name ON tool_usage(tool_name);

            CREATE TABLE IF NOT EXISTS processing_state (
                file_path TEXT PRIMARY KEY,
                last_modified TEXT NOT NULL,
                last_processed TEXT NOT NULL,
                file_hash TEXT NOT NULL
            );
        """)
        self.conn.commit()
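
    # Example: once populated, the database can be inspected directly with the
    # sqlite3 CLI (illustrative query; tables and columns are per the schema above):
    #   sqlite3 conversations.db \
    #     "SELECT id, message_count, timestamp FROM conversations
    #      ORDER BY timestamp DESC LIMIT 5;"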

    def _log(self, message: str):
        """Log if verbose mode is enabled"""
        if self.verbose:
            print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")

    def _compute_file_hash(self, file_path: Path) -> str:
        """Compute the SHA256 hash of a file for change detection"""
        sha256 = hashlib.sha256()
        with open(file_path, 'rb') as f:
            for chunk in iter(lambda: f.read(8192), b''):
                sha256.update(chunk)
        return sha256.hexdigest()

    def _needs_processing(self, file_path: Path, reindex: bool = False) -> bool:
        """Check whether a file needs (re)processing"""
        if reindex:
            return True

        file_hash = self._compute_file_hash(file_path)

        cursor = self.conn.execute(
            "SELECT file_hash FROM processing_state WHERE file_path = ?",
            (str(file_path),)
        )
        row = cursor.fetchone()

        if not row:
            return True  # Never processed

        return row['file_hash'] != file_hash  # Reprocess only if the file changed

    def _update_processing_state(self, file_path: Path):
        """Record that a file has been processed, with its mtime and hash"""
        file_hash = self._compute_file_hash(file_path)
        last_modified = datetime.fromtimestamp(file_path.stat().st_mtime).isoformat()

        self.conn.execute("""
            INSERT OR REPLACE INTO processing_state (file_path, last_modified, last_processed, file_hash)
            VALUES (?, ?, ?, ?)
        """, (str(file_path), last_modified, datetime.now().isoformat(), file_hash))
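
    # Illustrative processing_state row (values are examples, not real data; the
    # project directory name encoding is an assumption about Claude Code's layout):
    #   file_path:      /Users/me/.claude/projects/<encoded-project-dir>/<session-id>.jsonl
    #   last_modified:  2025-01-15T10:32:07
    #   last_processed: 2025-01-15T11:00:02
    #   file_hash:      64-character SHA256 hex digest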

    def _parse_jsonl_file(self, file_path: Path) -> List[Dict[str, Any]]:
        """Parse a JSONL conversation file, skipping malformed lines"""
        messages = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                try:
                    if line.strip():
                        data = json.loads(line)
                        messages.append(data)
                except json.JSONDecodeError as e:
                    self._log(f"Warning: Failed to parse line {line_num} in {file_path.name}: {e}")
        return messages
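
    # The rest of this class relies on only a few fields of each record. A minimal
    # sketch of the assumed shape (the real Claude Code schema carries more fields):
    #   {"type": "user", "message": {"content": "Fix the login bug"}}
    #   {"type": "assistant", "message": {"content": [
    #       {"type": "text", "text": "Reading auth.py..."},
    #       {"type": "tool_use", "name": "Read", "input": {"file_path": "auth.py"}}]}}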

    def _extract_tool_uses(self, content: str) -> List[str]:
        """Extract tool names from assistant messages"""
        tools = []
        # Look for tool use patterns in content
        tool_patterns = [
            r'"name":\s*"([A-Z][a-zA-Z]+)"',   # JSON tool calls
            r'<tool>([A-Z][a-zA-Z]+)</tool>',  # XML tool calls
        ]

        for pattern in tool_patterns:
            matches = re.findall(pattern, content)
            tools.extend(matches)

        return list(set(tools))  # Unique tools
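
    # Both patterns are heuristics over message text: they would match, e.g.,
    # '"name": "Read"' or '<tool>Bash</tool>', while the leading [A-Z] filters out
    # ordinary lowercase JSON keys that are unlikely to be tool names.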

    def _extract_file_paths(self, content: str) -> Dict[str, List[str]]:
        """Extract file paths and their interaction types from content"""
        files = {
            'read': [],
            'written': [],
            'edited': []
        }

        # Patterns for file operations
        read_patterns = [
            r'Reading\s+(.+\.(?:py|js|ts|tsx|jsx|md|json|yaml|yml))',
            r'Read\s+file:\s*(.+)',
            r'"file_path":\s*"([^"]+)"',  # Tool parameters
        ]

        write_patterns = [
            r'Writing\s+(.+\.(?:py|js|ts|tsx|jsx|md|json|yaml|yml))',
            r'Created\s+file:\s*(.+)',
            r'Write\s+(.+)',
        ]

        edit_patterns = [
            r'Editing\s+(.+\.(?:py|js|ts|tsx|jsx|md|json|yaml|yml))',
            r'Modified\s+file:\s*(.+)',
            r'Edit\s+(.+)',
        ]

        for pattern in read_patterns:
            files['read'].extend(re.findall(pattern, content, re.IGNORECASE))
        for pattern in write_patterns:
            files['written'].extend(re.findall(pattern, content, re.IGNORECASE))
        for pattern in edit_patterns:
            files['edited'].extend(re.findall(pattern, content, re.IGNORECASE))

        # Deduplicate and clean
        for key in files:
            files[key] = list(set(path.strip() for path in files[key]))

        return files
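
    # Caveat: the '"file_path": "..."' pattern files every match under 'read' even
    # when the surrounding tool call was a Write or Edit, so the interaction types
    # here are best-effort rather than exact.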

    def _extract_topics(self, messages: List[Dict[str, Any]]) -> List[str]:
        """Extract topic keywords from conversation"""
        # Combine the first few user messages with snippets of assistant responses
        text = ""
        user_count = 0
        for msg in messages:
            msg_type = msg.get('type', '')

            # Handle event-stream format
            if msg_type == 'user':
                message_dict = msg.get('message', {})
                content = message_dict.get('content', '') if isinstance(message_dict, dict) else ''

                # Handle content that's a list (content blocks)
                if isinstance(content, list):
                    message_content = ' '.join(
                        block.get('text', '') if isinstance(block, dict) and block.get('type') == 'text' else ''
                        for block in content
                    )
                else:
                    message_content = content

                if message_content:
                    text += message_content + " "
                    user_count += 1
                    if user_count >= 3:  # Only use the first few user messages
                        break
            elif msg_type == 'assistant' and user_count < 3:
                # Also include some assistant responses for context
                message_dict = msg.get('message', {})
                content = message_dict.get('content', '') if isinstance(message_dict, dict) else ''

                # Handle content that's a list (content blocks)
                if isinstance(content, list):
                    message_content = ' '.join(
                        block.get('text', '') if isinstance(block, dict) and block.get('type') == 'text' else ''
                        for block in content
                    )
                else:
                    message_content = content

                if message_content:
                    text += message_content[:200] + " "  # Just a snippet

        # Match against common programming keywords
        keywords = []
        common_topics = [
            'authentication', 'auth', 'login', 'jwt', 'oauth',
            'testing', 'test', 'unit test', 'integration test',
            'bug', 'fix', 'error', 'issue', 'debug',
            'performance', 'optimization', 'optimize', 'slow',
            'refactor', 'refactoring', 'cleanup',
            'feature', 'implement', 'add', 'create',
            'database', 'sql', 'query', 'schema',
            'api', 'endpoint', 'rest', 'graphql',
            'typescript', 'javascript', 'react', 'node',
            'css', 'style', 'styling', 'tailwind',
            'security', 'vulnerability', 'xss', 'csrf',
            'deploy', 'deployment', 'ci/cd', 'docker',
        ]

        text_lower = text.lower()
        for topic in common_topics:
            if topic in text_lower:
                keywords.append(topic)

        return list(set(keywords))[:10]  # Max 10 topics

    def _process_conversation(self, file_path: Path, messages: List[Dict[str, Any]]) -> ConversationMetadata:
        """Extract metadata from a parsed conversation"""
        # Generate conversation ID from filename
        conv_id = file_path.stem

        # Count messages by role
        user_messages = 0
        assistant_messages = 0
        first_user_msg = ""
        last_assistant_msg = ""
        all_tools = []
        all_files = {'read': [], 'written': [], 'edited': []}

        for msg in messages:
            msg_type = msg.get('type', '')

            # Handle event-stream format
            if msg_type == 'user':
                user_messages += 1
                message_dict = msg.get('message', {})
                content = message_dict.get('content', '') if isinstance(message_dict, dict) else ''

                # Handle content that's a list (content blocks)
                if isinstance(content, list):
                    message_content = ' '.join(
                        block.get('text', '') if isinstance(block, dict) and block.get('type') == 'text' else ''
                        for block in content
                    )
                else:
                    message_content = content

                if not first_user_msg and message_content:
                    first_user_msg = message_content[:500]  # First 500 chars

            elif msg_type == 'assistant':
                assistant_messages += 1
                message_dict = msg.get('message', {})
                content = message_dict.get('content', '') if isinstance(message_dict, dict) else ''

                # Handle content that's a list (content blocks)
                if isinstance(content, list):
                    message_content = ' '.join(
                        block.get('text', '') if isinstance(block, dict) and block.get('type') == 'text' else ''
                        for block in content
                    )
                    # Also extract tools from content blocks
                    for block in content:
                        if isinstance(block, dict) and block.get('type') == 'tool_use':
                            tool_name = block.get('name', '')
                            if tool_name:
                                all_tools.append(tool_name)
                else:
                    message_content = content

                if message_content:
                    last_assistant_msg = message_content[:500]

                    # Extract tools and files from assistant messages
                    tools = self._extract_tool_uses(message_content)
                    all_tools.extend(tools)

                    files = self._extract_file_paths(message_content)
                    for key in all_files:
                        all_files[key].extend(files[key])

        # Deduplicate
        all_tools = list(set(all_tools))
        for key in all_files:
            all_files[key] = list(set(all_files[key]))

        # Extract topics
        topics = self._extract_topics(messages)

        # File stats
        file_stat = file_path.stat()

        # Compute conversation hash
        conv_hash = self._compute_file_hash(file_path)

        # Timestamp from file modification time, falling back to now
        try:
            timestamp = datetime.fromtimestamp(file_stat.st_mtime)
        except Exception:
            timestamp = datetime.now()

        return ConversationMetadata(
            id=conv_id,
            project_path=str(file_path.parent),
            timestamp=timestamp,
            message_count=len(messages),
            user_messages=user_messages,
            assistant_messages=assistant_messages,
            files_read=all_files['read'],
            files_written=all_files['written'],
            files_edited=all_files['edited'],
            tools_used=all_tools,
            topics=topics,
            first_user_message=first_user_msg,
            last_assistant_message=last_assistant_msg,
            conversation_hash=conv_hash,
            file_size_bytes=file_stat.st_size,
            processed_at=datetime.now()
        )

    def _store_conversation(self, metadata: ConversationMetadata):
        """Store conversation metadata in the database"""
        # Store main conversation record
        self.conn.execute("""
            INSERT OR REPLACE INTO conversations
            (id, project_path, timestamp, message_count, user_messages, assistant_messages,
             files_read, files_written, files_edited, tools_used, topics,
             first_user_message, last_assistant_message, conversation_hash,
             file_size_bytes, processed_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (
            metadata.id,
            metadata.project_path,
            metadata.timestamp.isoformat(),
            metadata.message_count,
            metadata.user_messages,
            metadata.assistant_messages,
            json.dumps(metadata.files_read),
            json.dumps(metadata.files_written),
            json.dumps(metadata.files_edited),
            json.dumps(metadata.tools_used),
            json.dumps(metadata.topics),
            metadata.first_user_message,
            metadata.last_assistant_message,
            metadata.conversation_hash,
            metadata.file_size_bytes,
            metadata.processed_at.isoformat()
        ))

        # Replace file interactions for this conversation
        self.conn.execute(
            "DELETE FROM file_interactions WHERE conversation_id = ?",
            (metadata.id,)
        )
        for file_path in metadata.files_read:
            self.conn.execute(
                "INSERT INTO file_interactions (conversation_id, file_path, interaction_type) VALUES (?, ?, ?)",
                (metadata.id, file_path, 'read')
            )
        for file_path in metadata.files_written:
            self.conn.execute(
                "INSERT INTO file_interactions (conversation_id, file_path, interaction_type) VALUES (?, ?, ?)",
                (metadata.id, file_path, 'write')
            )
        for file_path in metadata.files_edited:
            self.conn.execute(
                "INSERT INTO file_interactions (conversation_id, file_path, interaction_type) VALUES (?, ?, ?)",
                (metadata.id, file_path, 'edit')
            )

        # Replace tool usage; tools_used is deduplicated upstream, so usage_count
        # records presence per conversation (always 1), not call frequency
        self.conn.execute(
            "DELETE FROM tool_usage WHERE conversation_id = ?",
            (metadata.id,)
        )
        for tool_name in metadata.tools_used:
            self.conn.execute(
                "INSERT INTO tool_usage (conversation_id, tool_name, usage_count) VALUES (?, ?, ?)",
                (metadata.id, tool_name, 1)
            )

    def process_file(self, file_path: Path, reindex: bool = False) -> bool:
        """Process a single conversation file; returns True if it was (re)processed"""
        if not self._needs_processing(file_path, reindex):
            self._log(f"Skipping {file_path.name} (already processed)")
            return False

        self._log(f"Processing {file_path.name}...")

        try:
            # Parse JSONL
            messages = self._parse_jsonl_file(file_path)

            if not messages:
                self._log(f"Warning: No messages found in {file_path.name}")
                return False

            # Extract metadata
            metadata = self._process_conversation(file_path, messages)

            # Store in database
            self._store_conversation(metadata)

            # Update processing state
            self._update_processing_state(file_path)

            self.conn.commit()

            self._log(f"✓ Processed {file_path.name}: {metadata.message_count} messages, "
                      f"{metadata.user_messages} user, {metadata.assistant_messages} assistant")

            return True

        except Exception as e:
            self._log(f"Error processing {file_path.name}: {e}")
            if self.verbose:
                import traceback
                traceback.print_exc()
            return False

    def process_project(self, project_name: str, reindex: bool = False) -> int:
        """Process all conversations for a project; returns the number processed"""
        # Find conversation files
        claude_projects = Path.home() / ".claude" / "projects"

        if not claude_projects.exists():
            self._log(f"Error: {claude_projects} does not exist")
            return 0

        # Find the project directory (its name may be an encoded project path)
        project_dirs = list(claude_projects.glob(f"*{project_name}*"))

        if not project_dirs:
            self._log(f"Error: No project directory found matching '{project_name}'")
            return 0

        if len(project_dirs) > 1:
            self._log(f"Warning: Multiple project directories found, using {project_dirs[0].name}")

        project_dir = project_dirs[0]
        self._log(f"Processing conversations from {project_dir}")

        # Find all JSONL files
        jsonl_files = list(project_dir.glob("*.jsonl"))

        if not jsonl_files:
            self._log(f"No conversation files found in {project_dir}")
            return 0

        self._log(f"Found {len(jsonl_files)} conversation files")

        # Process each file
        processed_count = 0
        for jsonl_file in jsonl_files:
            if self.process_file(jsonl_file, reindex):
                processed_count += 1

        self._log(f"\nProcessed {processed_count}/{len(jsonl_files)} conversations")
        return processed_count

    def get_stats(self) -> Dict[str, Any]:
        """Get aggregate processing statistics"""
        cursor = self.conn.execute("""
            SELECT
                COUNT(*) as total_conversations,
                SUM(message_count) as total_messages,
                SUM(user_messages) as total_user_messages,
                SUM(assistant_messages) as total_assistant_messages,
                MIN(timestamp) as earliest_conversation,
                MAX(timestamp) as latest_conversation
            FROM conversations
        """)
        row = cursor.fetchone()

        stats = {
            'total_conversations': row['total_conversations'],
            'total_messages': row['total_messages'],
            'total_user_messages': row['total_user_messages'],
            'total_assistant_messages': row['total_assistant_messages'],
            'earliest_conversation': row['earliest_conversation'],
            'latest_conversation': row['latest_conversation']
        }

        # Top files
        cursor = self.conn.execute("""
            SELECT file_path, COUNT(*) as interaction_count
            FROM file_interactions
            GROUP BY file_path
            ORDER BY interaction_count DESC
            LIMIT 10
        """)
        stats['top_files'] = [
            {'file': row['file_path'], 'count': row['interaction_count']}
            for row in cursor.fetchall()
        ]

        # Top tools
        cursor = self.conn.execute("""
            SELECT tool_name, SUM(usage_count) as total_usage
            FROM tool_usage
            GROUP BY tool_name
            ORDER BY total_usage DESC
            LIMIT 10
        """)
        stats['top_tools'] = [
            {'tool': row['tool_name'], 'count': row['total_usage']}
            for row in cursor.fetchall()
        ]

        return stats
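
    # Shape of the returned dict (values illustrative; note that SUM/MIN/MAX
    # yield None on an empty database):
    #   {'total_conversations': 12, 'total_messages': 340, ...,
    #    'top_files': [{'file': 'src/auth.py', 'count': 7}, ...],
    #    'top_tools': [{'tool': 'Read', 'count': 12}, ...]}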

    def close(self):
        """Close the database connection"""
        if self.conn:
            self.conn.close()


@click.command()
@click.option('--project-name', default='annex', help='Project name to process')
@click.option('--db-path', type=click.Path(), default='.claude/skills/cc-insights/.processed/conversations.db',
              help='SQLite database path')
@click.option('--reindex', is_flag=True, help='Reprocess all conversations (ignore cache)')
@click.option('--verbose', is_flag=True, help='Show detailed processing logs')
@click.option('--stats', is_flag=True, help='Show statistics after processing')
def main(project_name: str, db_path: str, reindex: bool, verbose: bool, stats: bool):
    """Process Claude Code conversations and store metadata"""
    db_path = Path(db_path)

    processor = ConversationProcessor(db_path, verbose=verbose)

    try:
        # Process conversations
        count = processor.process_project(project_name, reindex=reindex)

        print(f"\n✓ Processed {count} conversations")

        if stats:
            print("\n=== Statistics ===")
            stats_data = processor.get_stats()
            print(f"Total conversations: {stats_data['total_conversations']}")
            print(f"Total messages: {stats_data['total_messages']}")
            print(f"User messages: {stats_data['total_user_messages']}")
            print(f"Assistant messages: {stats_data['total_assistant_messages']}")
            print(f"Date range: {stats_data['earliest_conversation']} to {stats_data['latest_conversation']}")

            print("\nTop 10 Files:")
            for item in stats_data['top_files']:
                print(f"  {item['file']}: {item['count']} interactions")

            print("\nTop 10 Tools:")
            for item in stats_data['top_tools']:
                print(f"  {item['tool']}: {item['count']} uses")

    finally:
        processor.close()


if __name__ == '__main__':
    main()