#!/usr/bin/env python3
"""
Oracle Pattern Analysis Script

Analyze Oracle knowledge base and session logs to identify:
- Repeated tasks (candidates for automation)
- Common corrections (update defaults/documentation)
- Frequent queries (add to auto-inject context)
- Token-heavy operations (automate)

Usage:
    python analyze_patterns.py
    python analyze_patterns.py --generate-scripts
    python analyze_patterns.py --threshold 3

Examples:
    python analyze_patterns.py
    python analyze_patterns.py --generate-scripts --threshold 5
"""

import argparse
import json
import os
import re
import sys
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path


def find_oracle_root():
    """Find the nearest .oracle directory, walking up from the current working directory."""
    current = Path.cwd()

    while current != current.parent:
        oracle_path = current / '.oracle'
        if oracle_path.exists():
            return oracle_path
        current = current.parent

    return None


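# Note: session logs are plain Markdown files under .oracle/sessions/. Each entry
# returned below is a dict of the form {'id': <file stem>, 'file': <Path>,
# 'content': <raw text>}, which the analyze_* helpers further down consume.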
def load_all_sessions(oracle_path):
    """Load all session logs."""
    sessions_dir = oracle_path / 'sessions'
    sessions = []

    for session_file in sessions_dir.glob('*.md'):
        try:
            with open(session_file, 'r') as f:
                content = f.read()
            sessions.append({
                'id': session_file.stem,
                'file': session_file,
                'content': content
            })
        except Exception as e:
            print(f"Warning: Could not read {session_file}: {e}")

    return sessions


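# The parser below assumes each session log records its work under an
# "## Activities" heading as a bulleted list, roughly like this illustrative
# snippet (not taken from a real log):
#
#   ## Activities
#   - Run tests
#   - Update dependencies
#
# Only "- " lines up to the first blank line after the heading are counted.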
def analyze_repeated_activities(sessions):
    """Find repeated activities across sessions."""
    all_activities = []

    for session in sessions:
        # Extract activities from session log
        content = session['content']
        if '## Activities' in content:
            activities_section = content.split('## Activities')[1].split('\n\n')[0]
            activities = re.findall(r'^- (.+)$', activities_section, re.MULTILINE)
            all_activities.extend(activities)

    # Count occurrences
    activity_counts = Counter(all_activities)

    return activity_counts


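# corrections.json is assumed to contain a list of correction records, each with
# at least a "content" field, e.g. (illustrative, not a real record):
#   [{"content": "Prefer async/await for IO-bound work"}]
# Records are bucketed into coarse themes by keyword matching on that field.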
def analyze_corrections(oracle_path):
    """Analyze correction patterns."""
    knowledge_dir = oracle_path / 'knowledge'
    corrections_file = knowledge_dir / 'corrections.json'

    if not corrections_file.exists():
        return {}

    with open(corrections_file, 'r') as f:
        corrections = json.load(f)

    # Group by common themes
    themes = defaultdict(list)

    for correction in corrections:
        content = correction.get('content', '').lower()

        # Try to identify theme
        if 'async' in content or 'await' in content:
            themes['async-programming'].append(correction)
        elif 'security' in content or 'xss' in content or 'injection' in content:
            themes['security'].append(correction)
        elif 'performance' in content or 'optimization' in content:
            themes['performance'].append(correction)
        elif 'test' in content:
            themes['testing'].append(correction)
        else:
            themes['general'].append(correction)

    return themes


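# File-change counting assumes session logs contain a "## Changes Made" section
# and record edited files with lines like this illustrative example:
#   **File**: `src/app.py`
# Every such **File** entry in the log is counted.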
def analyze_file_patterns(sessions):
    """Analyze which files are changed most often."""
    file_changes = Counter()

    for session in sessions:
        content = session['content']
        if '## Changes Made' in content:
            # Extract file paths
            files = re.findall(r'\*\*File\*\*: `([^`]+)`', content)
            file_changes.update(files)

    return file_changes


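# Each candidate returned below is a dict shaped like this illustrative example:
#   {'activity': 'Run tests', 'count': 5, 'automation_score': 2, 'confidence': 'high'}
# sorted by automation_score, then count, in descending order.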
def identify_automation_candidates(activity_counts, threshold=3):
    """Identify tasks that are repeated enough to warrant automation."""
    candidates = []

    for activity, count in activity_counts.items():
        if count >= threshold:
            # Analyze if it's automatable
            automation_score = 0

            # Keyword-based scoring
            deterministic_keywords = ['run tests', 'build', 'lint', 'format', 'deploy', 'update dependencies']
            for keyword in deterministic_keywords:
                if keyword in activity.lower():
                    automation_score += 2

            if automation_score > 0 or count >= threshold * 2:
                candidates.append({
                    'activity': activity,
                    'count': count,
                    'automation_score': automation_score,
                    'confidence': 'high' if automation_score >= 2 else 'medium'
                })

    return sorted(candidates, key=lambda x: (x['automation_score'], x['count']), reverse=True)


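# Script names are derived from the activity text, e.g. (illustrative):
#   "Run tests for API" -> "auto_run_tests_for_api.sh"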
def generate_automation_script(activity):
    """Generate a basic automation script for an activity."""
    activity_lower = activity.lower()

    script_name = re.sub(r'[^a-z0-9]+', '_', activity_lower).strip('_')
    script_name = f"auto_{script_name}.sh"

    # Basic script template
    script = f"""#!/bin/bash
# Auto-generated by Oracle Pattern Analysis
# Purpose: {activity}
# Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

set -e  # Exit on error

echo "Automated task: {activity}"
echo "---"

# TODO: Implement automation logic
# Based on the activity pattern, add appropriate commands here

"""

    # Add common commands based on keywords
    if 'test' in activity_lower:
        script += """# Run tests
# npm test
# pytest
# cargo test
"""
    elif 'build' in activity_lower:
        script += """# Build project
# npm run build
# cargo build
# make
"""
    elif 'lint' in activity_lower:
        script += """# Run linter
# npm run lint
# cargo clippy
# pylint
"""
    elif 'format' in activity_lower:
        script += """# Format code
# npm run format
# cargo fmt
# black .
"""

    script += f"""
echo "---"
echo "[OK] Completed: {activity}"
"""

    return script_name, script


def generate_report(oracle_path, sessions, threshold):
    """Generate analysis report."""
    print("=" * 70)
    print("[SEARCH] Oracle Pattern Analysis Report")
    print("=" * 70)
    print(f"\nAnalyzing {len(sessions)} sessions\n")

    # Repeated activities
    print("## Repeated Activities\n")
    activity_counts = analyze_repeated_activities(sessions)

    if activity_counts:
        print("Top repeated tasks:\n")
        for activity, count in activity_counts.most_common(10):
            # Flag activities that meet the automation threshold
            marker = "[*]" if count >= threshold else "[ ]"
            print(f" {marker} [{count}x] {activity}")
    else:
        print(" No repeated activities found\n")

    print()

    # Automation candidates
    print("## Automation Opportunities\n")
    candidates = identify_automation_candidates(activity_counts, threshold)

    if candidates:
        print(f"Found {len(candidates)} automation candidates:\n")
        for candidate in candidates:
            # Flag high-confidence candidates
            confidence_marker = "[*]" if candidate['confidence'] == 'high' else "[ ]"
            print(f" {confidence_marker} [{candidate['count']}x] {candidate['activity']}")
            print(f" Confidence: {candidate['confidence']}, Score: {candidate['automation_score']}\n")
    else:
        print(f" No automation candidates (threshold: {threshold} occurrences)\n")

    print()

    # Correction patterns
    print("## Correction Patterns\n")
    correction_themes = analyze_corrections(oracle_path)

    if correction_themes:
        print("Corrections by theme:\n")
        for theme, corrections in sorted(correction_themes.items(), key=lambda x: len(x[1]), reverse=True):
            print(f" {theme.capitalize()}: {len(corrections)} corrections")

        print("\n[WARNING] Consider updating documentation or creating safeguards for common themes\n")
    else:
        print(" No corrections recorded yet\n")

    print()

    # File change patterns
    print("## Frequently Modified Files\n")
    file_changes = analyze_file_patterns(sessions)

    if file_changes:
        print("Most frequently changed files:\n")
        for file_path, count in file_changes.most_common(10):
            print(f" [{count}x] {file_path}")

        print("\n[TIP] Consider if these files need refactoring or better structure\n")
    else:
        print(" No file change patterns found\n")

    print()

    # Recommendations
    print("=" * 70)
    print("[INFO] Recommendations")
    print("=" * 70)
    print()

    if candidates:
        print(f"1. **Automate {len(candidates)} repeated tasks**")
        print(" Run with --generate-scripts to create automation scripts\n")

    if correction_themes:
        most_common_theme = max(correction_themes.items(), key=lambda x: len(x[1]))[0]
        print(f"2. **Address {most_common_theme} corrections**")
        print(" Review and create guidelines or linting rules\n")

    if file_changes:
        top_file = file_changes.most_common(1)[0]
        print(f"3. **Review frequently changed file: {top_file[0]}**")
        print(f" Changed {top_file[1]} times - may need refactoring\n")

    print("=" * 70)


def save_automation_scripts(oracle_path, candidates):
    """Generate and save automation scripts."""
    scripts_dir = oracle_path / 'scripts'
    scripts_dir.mkdir(parents=True, exist_ok=True)  # Ensure the scripts directory exists
    scripts_generated = []

    for candidate in candidates:
        script_name, script_content = generate_automation_script(candidate['activity'])
        script_path = scripts_dir / script_name

        with open(script_path, 'w') as f:
            f.write(script_content)

        # Make executable
        os.chmod(script_path, 0o755)

        scripts_generated.append(script_path)

        print(f"[OK] Generated: {script_path}")

    # Create README in scripts dir
    readme_path = scripts_dir / 'README.md'
    readme_content = f"""# Auto-Generated Automation Scripts

These scripts were generated by Oracle pattern analysis on {datetime.now().strftime('%Y-%m-%d')}.

## Scripts

"""

    for candidate in candidates:
        script_name = re.sub(r'[^a-z0-9]+', '_', candidate['activity'].lower()).strip('_')
        readme_content += f"- `auto_{script_name}.sh` - {candidate['activity']} (used {candidate['count']}x)\n"

    readme_content += """
## Usage

Each script is executable:

```bash
./auto_script_name.sh
```

## Customization

These scripts are templates. Review and customize them for your specific needs.
"""

    with open(readme_path, 'w') as f:
        f.write(readme_content)

    print(f"\nCreated README: {readme_path}")

    return scripts_generated


def main():
    parser = argparse.ArgumentParser(
        description='Analyze Oracle patterns and identify automation opportunities',
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument(
        '--threshold',
        type=int,
        default=3,
        help='Minimum occurrences to consider for automation (default: 3)'
    )

    parser.add_argument(
        '--generate-scripts',
        action='store_true',
        help='Generate automation scripts for candidates'
    )

    args = parser.parse_args()

    # Find Oracle
    oracle_path = find_oracle_root()

    if not oracle_path:
        print("[ERROR] .oracle directory not found.")
        sys.exit(1)

    # Load sessions
    sessions = load_all_sessions(oracle_path)

    if not sessions:
        print("[WARNING] No sessions found. Start recording sessions to enable pattern analysis.")
        sys.exit(0)

    # Generate report
    generate_report(oracle_path, sessions, args.threshold)

    # Generate scripts if requested
    if args.generate_scripts:
        activity_counts = analyze_repeated_activities(sessions)
        candidates = identify_automation_candidates(activity_counts, args.threshold)

        if candidates:
            print("\n" + "=" * 70)
            print("Generating Automation Scripts")
            print("=" * 70 + "\n")

            scripts = save_automation_scripts(oracle_path, candidates)

            print(f"\nGenerated {len(scripts)} automation scripts!")
            print(f"Location: {oracle_path / 'scripts'}")
            print("\n[WARNING] Review and customize these scripts before use.\n")
        else:
            print(f"\n[WARNING] No automation candidates found (threshold: {args.threshold})\n")


if __name__ == '__main__':
    main()