Files
gh-overlord-z-claudeshack/skills/oracle/scripts/analyze_history.py
2025-11-30 08:46:50 +08:00

702 lines
23 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Oracle Conversation History Analyzer
Analyzes Claude Code conversation history from ~/.claude/projects/ and extracts:
- Patterns and repeated tasks
- Corrections and learnings
- User preferences and gotchas
- Automation opportunities
This script mines existing conversation data without requiring manual capture.
Usage:
python analyze_history.py [options]
python analyze_history.py --project-hash abc123 --auto-populate
python analyze_history.py --all-projects --recent-days 30
python analyze_history.py --analyze-only
Examples:
python analyze_history.py --auto-populate
python analyze_history.py --project-hash abc123def456
python analyze_history.py --all-projects --min-task-occurrences 5
"""
import os
import sys
import json
import argparse
from datetime import datetime, timedelta
from pathlib import Path
from collections import defaultdict, Counter
import re
import uuid
# Directory under the user's home where Claude Code conversation history is
# read from; each sub-directory is treated as one project.
CLAUDE_PROJECTS_PATH = Path.home().joinpath('.claude', 'projects')

# Tunable limits shared by the detectors, the report and the knowledge writer.
CONFIG = dict(
    MAX_TITLE_LENGTH=200,            # cap applied to knowledge-entry titles
    ACTION_CONTEXT_MIN_LEN=10,       # min chars captured after an action verb
    ACTION_CONTEXT_MAX_LEN=50,       # max chars captured after an action verb
    TOP_TOOLS_TO_REPORT=20,          # tools returned by analyze_tool_usage
    TOP_CORRECTIONS_TO_ADD=10,       # corrections considered per populate run
    TOP_GOTCHAS_TO_ADD=10,           # gotchas considered per populate run
    TOP_TASKS_TO_ADD=5,              # repeated tasks considered per populate run
    MAX_PREFERENCES_TO_ADD=10,       # preferences added per populate run
    DEFAULT_MIN_TASK_OCCURRENCES=3,  # default threshold for a "repeated" task
    SNIPPET_LENGTH=80,               # snippet width used in the text report
)
# Regexes are compiled once at import time so the detectors can reuse them
# for every message.
# Phrases that suggest the user is correcting something; all case-insensitive.
CORRECTION_PATTERNS = [
    re.compile(source, flags=re.IGNORECASE)
    for source in (
        r"(?:that's|thats)\s+(?:wrong|incorrect|not right)",
        r"(?:don't|dont|do not)\s+(?:do|use|implement)",
        r"(?:should|need to)\s+(?:use|do|implement).+(?:instead|not)",
        r"(?:actually|correction|fix)[:,]\s+",
        r"(?:no|nope),?\s+(?:use|do|try|implement)",
        r"(?:wrong|incorrect|mistake)[:,]",
        r"(?:better to|prefer to|should)\s+(?:use|do)",
    )
]
# Phrases that suggest a stated user preference. Each pattern has exactly one
# capturing group (the preference text), so findall() yields plain strings.
PREFERENCE_PATTERNS = [
    re.compile(r"(?:i prefer|i'd prefer|prefer to|i like)\s+(.+)", re.IGNORECASE),
    re.compile(r"(?:always|never)\s+(?:use|do|implement)\s+(.+)", re.IGNORECASE),
    re.compile(r"(?:i want|i'd like|i need)\s+(.+)", re.IGNORECASE),
    # The optional "to"/"that" carries its own trailing whitespace. Written as
    # `\s+(?:to|that)?\s+` the pattern demanded two whitespace runs whenever
    # the connector was absent, so "ensure tests pass" could never match.
    re.compile(r"(?:make sure|ensure|remember)\s+(?:(?:to|that)\s+)?(.+)", re.IGNORECASE),
    re.compile(r"(?:use|implement|do)\s+(.+)\s+(?:instead|not)", re.IGNORECASE),
]
# Phrases that suggest a reported problem or a warning. Every pattern captures
# the text following the trigger phrase in its single group.
GOTCHA_PATTERNS = [
    re.compile(source, flags=re.IGNORECASE)
    for source in (
        r"(?:error|issue|problem|bug|failing|broken)[:,]?\s+(.+)",
        r"(?:warning|careful|watch out)[:,]?\s+(.+)",
        r"(?:doesn't work|not working|fails when)\s+(.+)",
        r"(?:remember|don't forget)[:,]?\s+(.+)",
    )
]
def truncate_text(text, max_length=100, suffix='...'):
    """Shorten *text* to at most *max_length* characters, suffix included.

    Text that already fits is returned unchanged. Otherwise the text is cut
    so that the cut plus *suffix* fits in *max_length*, preferring to break
    at the last space inside the cut window.

    Args:
        text: String to shorten.
        max_length: Maximum length of the returned string.
        suffix: Marker appended when the text was shortened.

    Returns:
        The original text, or a shortened copy ending in *suffix*. When
        *max_length* is too small to hold the suffix, a plain hard cut of
        *max_length* characters is returned without the suffix.
    """
    if len(text) <= max_length:
        return text

    # Reserve room for the suffix so the result never exceeds max_length.
    budget = max_length - len(suffix)
    if budget <= 0:
        return text[:max(max_length, 0)]

    head = text[:budget]
    # Prefer a word boundary, but keep the hard cut when breaking at the last
    # space would leave nothing (e.g. the only space is the leading one).
    at_word_boundary = head.rsplit(' ', 1)[0].rstrip()
    if at_word_boundary:
        head = at_word_boundary
    return head + suffix
def ensure_knowledge_file(file_path, default_content=None):
    """Load a JSON knowledge file, creating it with default content if missing.

    Args:
        file_path: ``pathlib.Path`` of the JSON file.
        default_content: Value written when the file does not exist; a falsy
            value (including ``None``) means an empty list.

    Returns:
        The parsed JSON content of the file. A file that exists but is empty
        or whitespace-only is treated as holding the default content.

    Raises:
        json.JSONDecodeError: If the file holds non-empty invalid JSON. This
            is deliberately not swallowed: callers write the returned value
            back, so substituting a default would discard the file's data.
    """
    default = default_content or []

    if not file_path.exists():
        file_path.parent.mkdir(parents=True, exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(default, f, indent=2)

    with open(file_path, 'r', encoding='utf-8') as f:
        raw = f.read()

    if not raw.strip():
        # Zero-byte file: nothing to lose, so fall back to a fresh copy of
        # the default (round-tripped so the caller's object is not aliased).
        return json.loads(json.dumps(default))
    return json.loads(raw)
def find_oracle_root():
    """Locate the nearest ``.oracle`` directory at or above the working directory.

    The current working directory is checked first, then each ancestor up to
    and including the filesystem root.

    Returns:
        ``pathlib.Path`` of the first ``.oracle`` directory found, or ``None``
        when no ancestor contains one.
    """
    start = Path.cwd()
    # `parents` ends at the filesystem root, so the root itself is checked.
    for directory in (start, *start.parents):
        candidate = directory / '.oracle'
        # Require a directory: callers create sub-directories beneath it.
        if candidate.is_dir():
            return candidate
    return None
def find_project_hash(oracle_path):
    """Guess which Claude project directory belongs to the current project.

    Candidates are the sub-directories of ``CLAUDE_PROJECTS_PATH``. They are
    tried in this order, most recently modified first within each step:

    1. a directory whose name equals the project path with every
       non-alphanumeric character replaced by ``-``;
    2. a directory whose name equals, or ends with ``-`` plus, the project
       directory name encoded the same way;
    3. the most recently modified directory overall.

    Args:
        oracle_path: ``pathlib.Path`` of the project's ``.oracle`` directory;
            its parent is taken as the project root.

    Returns:
        The name of the chosen project directory, or ``None`` when the
        projects path does not exist or has no sub-directories.
    """
    if not CLAUDE_PROJECTS_PATH.exists():
        return None

    def modified_time(directory):
        # A directory can vanish between iterdir() and stat(); rank it last.
        try:
            return directory.stat().st_mtime
        except OSError:
            return 0.0

    project_dirs = sorted(
        (d for d in CLAUDE_PROJECTS_PATH.iterdir() if d.is_dir()),
        key=modified_time,
        reverse=True,
    )
    if not project_dirs:
        return None

    project_root = oracle_path.parent
    # NOTE(review): assumes Claude Code names each project directory after the
    # project's absolute path with non-alphanumeric characters replaced by
    # '-' -- confirm against a real ~/.claude/projects listing. If the
    # assumption is wrong, nothing matches and step 3 applies.
    encoded_path = re.sub(r'[^A-Za-z0-9]', '-', str(project_root))
    encoded_name = re.sub(r'[^A-Za-z0-9]', '-', project_root.name)

    for directory in project_dirs:
        if directory.name == encoded_path:
            return directory.name

    if encoded_name:
        for directory in project_dirs:
            if directory.name == encoded_name or directory.name.endswith('-' + encoded_name):
                return directory.name

    # Nothing matched the project: fall back to the most recent directory.
    return project_dirs[0].name
def _tool_names_in_entry(entry):
    """Return the names of ``tool_use`` items in one parsed JSONL entry."""
    message = entry.get('message') if isinstance(entry, dict) else None
    if not isinstance(message, dict):
        return []
    content = message.get('content', [])
    if not isinstance(content, list):
        return []
    return [
        item['name']
        for item in content
        if isinstance(item, dict)
        and item.get('type') == 'tool_use'
        and item.get('name') is not None
    ]


def load_conversation_history(project_hash, recent_days=None):
    """Load every conversation of one project from its ``*.jsonl`` files.

    Each file becomes one session dict with the keys ``session_id`` (file
    stem), ``file_path``, ``messages`` (parsed JSON objects, one per line),
    ``tools_used`` (names of ``tool_use`` content items) and ``created`` (the
    file's modification time as a ``datetime``).

    Lines that are blank, are not valid JSON, or do not decode to a JSON
    object are skipped individually, so one odd line no longer discards the
    whole session. A file that cannot be read is reported and skipped.

    Args:
        project_hash: Name of the project directory under
            ``CLAUDE_PROJECTS_PATH``.
        recent_days: When truthy, only files modified within this many days
            are loaded.

    Returns:
        A list of session dicts; empty when the project directory is missing.
    """
    project_path = CLAUDE_PROJECTS_PATH / project_hash
    if not project_path.exists():
        print(f"[ERROR] Project path not found: {project_path}")
        return []

    cutoff_date = None
    if recent_days:
        cutoff_date = datetime.now() - timedelta(days=recent_days)

    # Sorted so results do not depend on directory listing order.
    jsonl_files = sorted(project_path.glob('*.jsonl'))
    print(f"[INFO] Found {len(jsonl_files)} conversation files in project {project_hash[:8]}...")

    conversations = []
    for jsonl_file in jsonl_files:
        try:
            # stat() once, inside the try: the file may vanish mid-run.
            modified = datetime.fromtimestamp(jsonl_file.stat().st_mtime)
            if cutoff_date and modified < cutoff_date:
                continue

            session_data = {
                'session_id': jsonl_file.stem,
                'file_path': jsonl_file,
                'messages': [],
                'tools_used': [],
                'created': modified
            }
            with open(jsonl_file, 'r', encoding='utf-8') as f:
                for line in f:  # stream line by line instead of reading the whole file
                    if not line.strip():
                        continue
                    try:
                        entry = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    # Downstream code treats every message as a dict.
                    if not isinstance(entry, dict):
                        continue
                    session_data['messages'].append(entry)
                    session_data['tools_used'].extend(_tool_names_in_entry(entry))
        except (OSError, ValueError) as e:
            # ValueError covers UnicodeDecodeError from a badly encoded file.
            print(f"[WARNING] Failed to load {jsonl_file.name}: {e}")
            continue
        conversations.append(session_data)

    print(f"[OK] Loaded {len(conversations)} conversations")
    return conversations
def extract_messages_by_role(conversations, role='user'):
    """Collect the text of every message with the given role.

    String content is used as-is. List content is reduced to its ``text``
    items, joined with single spaces. Entries without a dict ``message``,
    messages of another role, and messages whose resulting text is empty or
    not a string are skipped rather than raising.

    Args:
        conversations: Session dicts carrying ``session_id``, ``created`` and
            ``messages`` keys.
        role: Value of the message ``role`` field to keep.

    Returns:
        A list of dicts with ``session_id``, ``content`` (str) and
        ``timestamp`` (the session's ``created`` value).
    """
    messages = []
    for session in conversations:
        for entry in session['messages']:
            message = entry.get('message') if isinstance(entry, dict) else None
            if not isinstance(message, dict) or message.get('role') != role:
                continue

            content = message.get('content', '')
            if isinstance(content, list):
                text_parts = []
                for item in content:
                    if isinstance(item, dict) and item.get('type') == 'text':
                        text = item.get('text', '')
                        # A null/non-string text part would break the join.
                        if isinstance(text, str):
                            text_parts.append(text)
                content = ' '.join(text_parts)

            # The detectors call str methods on content; keep only real text.
            if not isinstance(content, str) or not content:
                continue

            messages.append({
                'session_id': session['session_id'],
                'content': content,
                'timestamp': session['created']
            })
    return messages
def detect_corrections(user_messages):
    """Return the user messages that look like corrections.

    A message is reported at most once, tagged with the source text of the
    first entry in ``CORRECTION_PATTERNS`` that matches it.

    Args:
        user_messages: Dicts with ``session_id``, ``content`` and
            ``timestamp`` keys.

    Returns:
        A list of dicts with ``session_id``, ``content``, ``timestamp`` and
        ``pattern_matched``.
    """
    found = []
    for message in user_messages:
        text = message['content']
        # First matching pattern wins; later patterns are not tried.
        first_hit = next((rx for rx in CORRECTION_PATTERNS if rx.search(text)), None)
        if first_hit is None:
            continue
        found.append({
            'session_id': message['session_id'],
            'content': message['content'],
            'timestamp': message['timestamp'],
            'pattern_matched': first_hit.pattern
        })
    return found
def detect_preferences(user_messages):
    """Extract stated preferences from user messages.

    Every entry of ``PREFERENCE_PATTERNS`` is applied to every message, and
    each captured fragment longer than five characters becomes one result.
    The same message can therefore yield several results.

    Args:
        user_messages: Dicts with ``session_id``, ``content`` and
            ``timestamp`` keys.

    Returns:
        A list of dicts with ``session_id``, ``preference``,
        ``full_context`` and ``timestamp``.
    """
    results = []
    for message in user_messages:
        text = message['content']
        for rx in PREFERENCE_PATTERNS:
            for captured in rx.findall(text):
                candidate = captured.strip() if isinstance(captured, str) else captured
                # Very short captures carry no usable preference.
                if len(candidate) <= 5:
                    continue
                results.append({
                    'session_id': message['session_id'],
                    'preference': candidate,
                    'full_context': text,
                    'timestamp': message['timestamp']
                })
    return results
def detect_repeated_tasks(user_messages, min_occurrences=None):
    """Find action phrases that recur across user messages.

    For each action verb, the lower-cased message text is searched for the
    verb followed by a run of letters, whitespace and hyphens whose length
    lies between ``CONFIG['ACTION_CONTEXT_MIN_LEN']`` and
    ``CONFIG['ACTION_CONTEXT_MAX_LEN']``. The verb plus the
    whitespace-normalized phrase is the task key; keys seen at least
    *min_occurrences* times are reported.

    Args:
        user_messages: Dicts with ``session_id``, ``content`` and
            ``timestamp`` keys.
        min_occurrences: Minimum hit count for a task to be reported;
            defaults to ``CONFIG['DEFAULT_MIN_TASK_OCCURRENCES']``.

    Returns:
        A list of dicts with ``task``, ``occurrences`` and ``instances``,
        sorted by ``occurrences`` in descending order.
    """
    if min_occurrences is None:
        min_occurrences = CONFIG['DEFAULT_MIN_TASK_OCCURRENCES']

    action_verbs = (
        'create', 'add', 'update', 'delete', 'remove', 'fix', 'refactor',
        'implement', 'write', 'generate', 'build', 'run', 'test', 'deploy'
    )
    min_len = CONFIG['ACTION_CONTEXT_MIN_LEN']
    max_len = CONFIG['ACTION_CONTEXT_MAX_LEN']

    # Compiled once per call instead of once per message per verb.
    # Brace escaping: '{{' and '}}' are literal braces in an f-string, so
    # '{{{min_len},{max_len}}}' renders as the quantifier '{10,50}'. The
    # earlier string concatenation ended in a non-f-string '}})' and so
    # produced '{10,50}})', which required a literal '}' after the phrase.
    verb_patterns = [
        (verb, re.compile(rf'\b{verb}\b\s+([a-zA-Z\s-]{{{min_len},{max_len}}})'))
        for verb in action_verbs
    ]

    task_patterns = defaultdict(list)
    for msg in user_messages:
        content = msg['content'].lower()
        for verb, pattern in verb_patterns:
            for match in pattern.findall(content):
                # Collapse whitespace runs so the same phrase wrapped across
                # lines or spaced differently maps to one task key.
                clean_match = ' '.join(match.split())
                if len(clean_match) > 5:
                    task_patterns[f"{verb} {clean_match}"].append({
                        'session_id': msg['session_id'],
                        'full_content': msg['content'],
                        'timestamp': msg['timestamp']
                    })

    repeated_tasks = [
        {'task': task, 'occurrences': len(occurrences), 'instances': occurrences}
        for task, occurrences in task_patterns.items()
        if len(occurrences) >= min_occurrences
    ]
    repeated_tasks.sort(key=lambda item: item['occurrences'], reverse=True)
    return repeated_tasks
def detect_gotchas(user_messages, assistant_messages):
    """Collect problem reports and warnings voiced in user messages.

    Every entry of ``GOTCHA_PATTERNS`` is applied to every user message, and
    each captured fragment becomes one result.

    Args:
        user_messages: Dicts with ``session_id``, ``content`` and
            ``timestamp`` keys.
        assistant_messages: Accepted but not inspected by this function.

    Returns:
        A list of dicts with ``session_id``, ``gotcha``, ``context``,
        ``timestamp`` and ``source`` (always ``'user'``).
    """
    collected = []
    for message in user_messages:
        text = message['content']
        for rx in GOTCHA_PATTERNS:
            collected.extend(
                {
                    'session_id': message['session_id'],
                    'gotcha': hit.strip() if isinstance(hit, str) else hit,
                    'context': text,
                    'timestamp': message['timestamp'],
                    'source': 'user'
                }
                for hit in rx.findall(text)
            )
    return collected
def analyze_tool_usage(conversations):
    """Count tool invocations across all sessions.

    Args:
        conversations: Session dicts carrying a ``tools_used`` list.

    Returns:
        ``(tool_name, count)`` pairs for the most used tools, most frequent
        first, limited to ``CONFIG['TOP_TOOLS_TO_REPORT']`` entries.
    """
    usage = Counter()
    for conversation in conversations:
        # Counter.update tallies each element of the iterable.
        usage.update(conversation['tools_used'])
    return usage.most_common(CONFIG['TOP_TOOLS_TO_REPORT'])
def create_knowledge_entry(category, title, content, context='', priority='medium',
                           learned_from='conversation_history', tags=None):
    """Build one knowledge entry dict in Oracle format.

    Args:
        category: Entry category string.
        title: Entry title; shortened to ``CONFIG['MAX_TITLE_LENGTH']``.
        content: Main text of the entry.
        context: Optional supporting text.
        priority: Priority label.
        learned_from: Label naming the source of the entry.
        tags: Optional iterable of tag strings.

    Returns:
        A dict with a fresh UUID ``id``, the given fields, an empty
        ``examples`` list, ``use_count`` of 1, and identical ``created`` and
        ``last_used`` ISO timestamps.
    """
    # One clock read, so created and last_used are exactly equal.
    timestamp = datetime.now().isoformat()
    return {
        'id': str(uuid.uuid4()),
        'category': category,
        'priority': priority,
        'title': truncate_text(title, CONFIG['MAX_TITLE_LENGTH']),
        'content': content,
        'context': context,
        'examples': [],
        'learned_from': learned_from,
        'created': timestamp,
        'last_used': timestamp,
        'use_count': 1,
        # Copy so later edits to the entry cannot mutate the caller's list.
        'tags': list(tags) if tags else []
    }
def _append_unique_entries(knowledge_file, candidates, limit, key_field='content'):
    """Append up to *limit* new entries to a knowledge file; return the count added.

    An entry is new when the lower-cased string of its *key_field* matches
    neither an entry already in the file nor one added earlier in this call.
    """
    existing = ensure_knowledge_file(knowledge_file, [])
    seen = {
        str(item.get(key_field, '')).lower()
        for item in existing
        if isinstance(item, dict)
    }

    added = 0
    for entry in candidates:
        if added >= limit:
            break
        key = str(entry[key_field]).lower()
        if key in seen:
            continue
        seen.add(key)
        existing.append(entry)
        added += 1

    # Rewrite the file only when its content changed.
    if added:
        with open(knowledge_file, 'w', encoding='utf-8') as f:
            json.dump(existing, f, indent=2)
    return added


def populate_oracle_knowledge(oracle_path, corrections, preferences, gotchas, repeated_tasks):
    """Write extracted findings into the Oracle knowledge base.

    Findings go to ``corrections.json``, ``preferences.json``,
    ``gotchas.json`` and ``patterns.json`` under ``<oracle_path>/knowledge``.
    Entries that duplicate one already stored are skipped, so running the
    analyzer again on the same history does not add the same entries twice.
    A category with no findings leaves its file untouched.

    Args:
        oracle_path: ``pathlib.Path`` of the ``.oracle`` directory.
        corrections: Output of ``detect_corrections``; the first
            ``CONFIG['TOP_CORRECTIONS_TO_ADD']`` are considered.
        preferences: Output of ``detect_preferences``; all are considered
            until ``CONFIG['MAX_PREFERENCES_TO_ADD']`` have been added.
        gotchas: Output of ``detect_gotchas``; the first
            ``CONFIG['TOP_GOTCHAS_TO_ADD']`` are considered.
        repeated_tasks: Output of ``detect_repeated_tasks``; the first
            ``CONFIG['TOP_TASKS_TO_ADD']`` are considered.

    Returns:
        A dict mapping ``corrections``, ``preferences``, ``gotchas`` and
        ``patterns`` to the number of entries added.
    """
    knowledge_dir = oracle_path / 'knowledge'
    knowledge_dir.mkdir(parents=True, exist_ok=True)

    added_counts = {
        'corrections': 0,
        'preferences': 0,
        'gotchas': 0,
        'patterns': 0
    }

    if corrections:
        limit = CONFIG['TOP_CORRECTIONS_TO_ADD']
        added_counts['corrections'] = _append_unique_entries(
            knowledge_dir / 'corrections.json',
            (
                create_knowledge_entry(
                    category='correction',
                    title=f"Correction: {correction['content']}",
                    content=correction['content'],
                    context='Extracted from conversation history',
                    priority='high',
                    learned_from='conversation_history_analyzer',
                    tags=['auto-extracted', 'correction']
                )
                for correction in corrections[:limit]
            ),
            limit,
        )

    if preferences:
        # Not pre-sliced: duplicates are skipped until the cap is reached.
        added_counts['preferences'] = _append_unique_entries(
            knowledge_dir / 'preferences.json',
            (
                create_knowledge_entry(
                    category='preference',
                    title=f"Preference: {pref['preference']}",
                    content=pref['preference'],
                    context=truncate_text(pref['full_context'], 500),
                    priority='medium',
                    learned_from='conversation_history_analyzer',
                    tags=['auto-extracted', 'preference']
                )
                for pref in preferences
            ),
            CONFIG['MAX_PREFERENCES_TO_ADD'],
        )

    if gotchas:
        limit = CONFIG['TOP_GOTCHAS_TO_ADD']
        added_counts['gotchas'] = _append_unique_entries(
            knowledge_dir / 'gotchas.json',
            (
                create_knowledge_entry(
                    category='gotcha',
                    title=f"Gotcha: {gotcha['gotcha']}",
                    content=gotcha['gotcha'],
                    context=truncate_text(gotcha['context'], 500),
                    priority='high',
                    learned_from='conversation_history_analyzer',
                    tags=['auto-extracted', 'gotcha']
                )
                for gotcha in gotchas[:limit]
            ),
            limit,
        )

    if repeated_tasks:
        limit = CONFIG['TOP_TASKS_TO_ADD']
        added_counts['patterns'] = _append_unique_entries(
            knowledge_dir / 'patterns.json',
            (
                create_knowledge_entry(
                    category='pattern',
                    title=f"Repeated task: {task['task']}",
                    content=f"This task has been performed {task['occurrences']} times. Consider automating it.",
                    context='Detected from conversation history analysis',
                    priority='medium',
                    learned_from='conversation_history_analyzer',
                    tags=['auto-extracted', 'automation-candidate', 'repeated-task']
                )
                for task in repeated_tasks[:limit]
            ),
            limit,
            # Content only carries the count, so the title identifies the task.
            key_field='title',
        )

    return added_counts
def generate_analysis_report(conversations, corrections, preferences, gotchas,
                             repeated_tasks, tool_usage):
    """Render the analysis results as a plain-text report.

    The report has a banner, conversation and message totals, then one
    section each for corrections, preferences, gotchas and repeated tasks
    (a count plus up to five samples), and finally up to ten tool counts.

    Args:
        conversations: Session dicts carrying a ``messages`` list.
        corrections: Dicts with a ``content`` key.
        preferences: Dicts with a ``preference`` key.
        gotchas: Dicts with a ``gotcha`` key.
        repeated_tasks: Dicts with ``task`` and ``occurrences`` keys.
        tool_usage: Sequence of ``(tool_name, count)`` pairs.

    Returns:
        The report as a single newline-joined string.
    """
    def shorten(text):
        # Sample lines are clipped to the configured snippet width.
        return truncate_text(text, CONFIG['SNIPPET_LENGTH'])

    rule = "=" * 70
    lines = [rule, "Oracle Conversation History Analysis Report", rule, ""]

    # Summary
    message_total = sum(len(session['messages']) for session in conversations)
    lines += [
        f"Analyzed Conversations: {len(conversations)}",
        f"Total Messages: {message_total}",
        "",
    ]

    # Corrections
    lines.append(f"Corrections Detected: {len(corrections)}")
    if corrections:
        lines.append(" Top Corrections:")
        for rank, item in enumerate(corrections[:5], 1):
            # Flatten newlines so each sample stays on one report line.
            one_line = item['content'].replace('\n', ' ')
            lines.append(f" {rank}. {shorten(one_line)}")
    lines.append("")

    # Preferences
    lines.append(f"User Preferences Detected: {len(preferences)}")
    if preferences:
        lines.append(" Sample Preferences:")
        lines.extend(
            f" {rank}. {shorten(item['preference'])}"
            for rank, item in enumerate(preferences[:5], 1)
        )
    lines.append("")

    # Gotchas
    lines.append(f"Gotchas/Issues Detected: {len(gotchas)}")
    if gotchas:
        lines.append(" Sample Gotchas:")
        lines.extend(
            f" {rank}. {shorten(str(item['gotcha']))}"
            for rank, item in enumerate(gotchas[:5], 1)
        )
    lines.append("")

    # Repeated tasks
    lines.append(f"Repeated Tasks (Automation Candidates): {len(repeated_tasks)}")
    if repeated_tasks:
        lines.append(" Top Repeated Tasks:")
        lines.extend(
            f" {rank}. {item['task']} (x{item['occurrences']})"
            for rank, item in enumerate(repeated_tasks[:5], 1)
        )
    lines.append("")

    # Tool usage
    lines.append("Most Used Tools:")
    lines.extend(
        f" {rank}. {tool}: {count} times"
        for rank, (tool, count) in enumerate(tool_usage[:10], 1)
    )
    lines += ["", rule]

    return "\n".join(lines)
def main():
    """Command-line entry point: analyze history, report, optionally populate.

    Steps: parse arguments, locate the ``.oracle`` directory, choose the
    project(s) to analyze, load conversations, run the detectors, print the
    report, and with ``--auto-populate`` (and without ``--analyze-only``)
    write the findings into the Oracle knowledge base.

    Project selection: an explicit ``--project-hash`` wins; otherwise
    ``--all-projects`` loads every directory under ``CLAUDE_PROJECTS_PATH``;
    otherwise the project is guessed from the ``.oracle`` location.

    Exits with status 1 when the Oracle directory is required but missing,
    when no project can be determined, or when no conversations are found.
    """
    parser = argparse.ArgumentParser(
        description='Analyze Claude Code conversation history',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # The last example used a '--min-confidence' option that this parser
        # does not define; it now shows an option that exists.
        epilog="""
Examples:
python analyze_history.py --auto-populate
python analyze_history.py --project-hash abc123def456
python analyze_history.py --all-projects --recent-days 30
python analyze_history.py --analyze-only --min-task-occurrences 5
"""
    )
    parser.add_argument(
        '--project-hash',
        help='Specific project hash to analyze'
    )
    parser.add_argument(
        '--all-projects',
        action='store_true',
        help='Analyze all projects (not recommended - may be slow)'
    )
    parser.add_argument(
        '--recent-days',
        type=int,
        help='Only analyze conversations from last N days'
    )
    parser.add_argument(
        '--auto-populate',
        action='store_true',
        help='Automatically populate Oracle knowledge base'
    )
    parser.add_argument(
        '--analyze-only',
        action='store_true',
        help='Only analyze and report, do not populate Oracle'
    )
    parser.add_argument(
        '--min-task-occurrences',
        type=int,
        default=CONFIG['DEFAULT_MIN_TASK_OCCURRENCES'],
        help='Minimum occurrences to consider a task as repeated'
    )
    args = parser.parse_args()

    # Find Oracle
    oracle_path = find_oracle_root()
    if not oracle_path and not args.analyze_only:
        print("[ERROR] .oracle directory not found.")
        print(" Run: python .claude/skills/oracle/scripts/init_oracle.py")
        sys.exit(1)

    # Determine which project(s) to analyze
    if args.project_hash:
        project_hashes = [args.project_hash]
    elif args.all_projects:
        # Previously this flag was parsed but never acted upon.
        if not CLAUDE_PROJECTS_PATH.exists():
            print(f"[ERROR] Claude projects path not found: {CLAUDE_PROJECTS_PATH}")
            sys.exit(1)
        project_hashes = sorted(
            d.name for d in CLAUDE_PROJECTS_PATH.iterdir() if d.is_dir()
        )
    elif oracle_path:
        project_hash = find_project_hash(oracle_path)
        if not project_hash:
            print("[ERROR] Could not determine project hash.")
            print(" Use --project-hash to specify manually")
            sys.exit(1)
        project_hashes = [project_hash]
    else:
        print("[ERROR] Please specify --project-hash")
        sys.exit(1)

    if len(project_hashes) == 1:
        print(f"\n[INFO] Analyzing project: {project_hashes[0][:8]}...")
    else:
        print(f"\n[INFO] Analyzing {len(project_hashes)} projects...")
    print(f"[INFO] Claude projects path: {CLAUDE_PROJECTS_PATH}\n")

    # Load conversations
    conversations = []
    for project_hash in project_hashes:
        conversations.extend(load_conversation_history(project_hash, args.recent_days))
    if not conversations:
        print("[ERROR] No conversations found.")
        sys.exit(1)

    # Extract messages
    print("[INFO] Extracting user and assistant messages...")
    user_messages = extract_messages_by_role(conversations, role='user')
    assistant_messages = extract_messages_by_role(conversations, role='assistant')
    print(f"[OK] Found {len(user_messages)} user messages")
    print(f"[OK] Found {len(assistant_messages)} assistant messages\n")

    # Analyze
    print("[INFO] Detecting corrections...")
    corrections = detect_corrections(user_messages)
    print("[INFO] Detecting preferences...")
    preferences = detect_preferences(user_messages)
    print("[INFO] Detecting gotchas...")
    gotchas = detect_gotchas(user_messages, assistant_messages)
    print("[INFO] Detecting repeated tasks...")
    repeated_tasks = detect_repeated_tasks(user_messages, args.min_task_occurrences)
    print("[INFO] Analyzing tool usage...")
    tool_usage = analyze_tool_usage(conversations)
    print("")

    # Generate report
    report = generate_analysis_report(
        conversations, corrections, preferences, gotchas,
        repeated_tasks, tool_usage
    )
    print(report)

    # Populate Oracle if requested; --analyze-only always wins.
    if args.auto_populate and oracle_path and not args.analyze_only:
        print("\n[INFO] Populating Oracle knowledge base...")
        added_counts = populate_oracle_knowledge(
            oracle_path, corrections, preferences, gotchas, repeated_tasks
        )
        print("\n[OK] Knowledge base updated:")
        for category, count in added_counts.items():
            if count > 0:
                print(f" {category.capitalize()}: +{count} entries")
        print("\n[OK] Analysis complete! Knowledge base has been updated.")
        print(" Query knowledge: python .claude/skills/oracle/scripts/query_knowledge.py")
    elif args.analyze_only:
        print("\n[INFO] Analysis complete (no changes made to Oracle)")


if __name__ == '__main__':
    main()