gh-dhofheinz-open-plugins-p…/commands/history-analysis/.scripts/pattern-detector.py

#!/usr/bin/env python3

"""
============================================================================
SCRIPT: pattern-detector.py
PURPOSE: Detect commit message patterns and conventions from git history
VERSION: 1.0.0
USAGE: ./pattern-detector.py [--count N] [--branch BRANCH] [--detailed]
RETURNS: JSON format with pattern detection results
EXIT CODES:
  0 - Success
  1 - Not a git repository
  2 - No commit history
  3 - Git command failed
DEPENDENCIES: git, python3
============================================================================
"""

import subprocess
import sys
import json
import re
import argparse
from collections import defaultdict
from typing import Dict, List, Tuple


def run_git_command(cmd: List[str]) -> Tuple[int, str]:
    """Execute a git command and return its exit code and output.

    Args:
        cmd: Full argument vector, e.g. ``['git', 'log', '-1']``. It is handed
            to ``subprocess.run`` as a list, so no shell is involved.

    Returns:
        ``(returncode, stdout)`` with surrounding whitespace stripped from
        stdout. stderr is captured and discarded. If the process cannot be
        started, ``(1, message)`` is returned, where ``message`` is the text
        of the raised exception.
    """
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            # Decode explicitly instead of relying on the platform locale:
            # with the locale default, a byte sequence the locale cannot
            # decode raised inside subprocess and made a successful git call
            # look like a failure. Undecodable bytes become U+FFFD instead.
            encoding='utf-8',
            errors='replace',
            check=False
        )
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # OSError: executable missing / not runnable; ValueError: invalid
        # arguments (e.g. embedded NUL); SubprocessError: subprocess failures.
        return 1, str(e)
    return result.returncode, result.stdout.strip()


def is_git_repository() -> bool:
    """Report whether the current directory is inside a git repository.

    Runs ``git rev-parse --git-dir`` and treats a zero exit status as "yes".
    """
    exit_status = run_git_command(['git', 'rev-parse', '--git-dir'])[0]
    return exit_status == 0


def has_commits() -> bool:
    """Report whether the repository has at least one commit.

    Runs ``git log -1`` and treats a zero exit status as "yes".
    """
    exit_status = run_git_command(['git', 'log', '-1'])[0]
    return exit_status == 0


def get_commits(count: int, branch: str) -> List[Dict[str, str]]:
    """Fetch commit messages from git log.

    Args:
        count: Maximum number of commits to read (passed to git as
            ``-<count>``).
        branch: Revision to start from (branch name, tag, ``HEAD``, ...).

    Returns:
        One dict per commit, in the order ``git log`` emits them, with keys:

        * ``hash``    - full commit hash (``%H``)
        * ``subject`` - subject line (``%s``)
        * ``body``    - message body (``%b``), stripped; ``''`` when absent.
          Blank lines inside the body are preserved.
        * ``full``    - subject, a blank line, then body; just the subject
          when there is no body.

        An empty list is returned when the git command fails or produces no
        parsable commits.
    """
    # Fields are separated by NUL and records by the ASCII record separator
    # rather than by a textual sentinel line, so body text can never be
    # mistaken for a delimiter.
    field_sep = '\x00'
    record_sep = '\x1e'

    code, output = run_git_command([
        'git', 'log',
        f'-{count}',
        '--format=%H%x00%s%x00%b%x1e',
        branch,
        # Trailing "--" tells git that `branch` is a revision, not a path,
        # which avoids the "ambiguous argument" error when a file of the same
        # name exists in the working tree.
        '--',
    ])

    if code != 0:
        return []

    commits = []
    for record in output.split(record_sep):
        # git terminates each formatted entry with a newline, so every record
        # after the first starts with one.
        record = record.strip('\n')
        if not record:
            continue

        fields = record.split(field_sep, 2)
        if len(fields) != 3:
            # Malformed record; skip it rather than guess at its layout.
            continue

        commit_hash, subject, raw_body = fields
        body = raw_body.strip()

        commits.append({
            'hash': commit_hash,
            'subject': subject,
            'body': body,
            'full': subject + '\n\n' + body if body else subject
        })

    return commits


def is_conventional_commit(subject: str) -> bool:
    """Check if a subject line follows the Conventional Commits header format.

    Accepted shape: ``type: description`` with an optional ``(scope)`` and an
    optional ``!`` breaking-change marker before the colon, e.g.
    ``feat: add x``, ``fix(parser): handle EOF``, ``feat(api)!: drop v1``.
    The type must be lowercase letters and the colon must be followed by a
    space and at least one more character.

    Args:
        subject: The commit subject line.

    Returns:
        True if the subject matches the format, False otherwise.
    """
    # "!?" admits the breaking-change marker, which was previously rejected.
    pattern = r'^[a-z]+(\([^)]+\))?!?: .+'
    return bool(re.match(pattern, subject))


def has_prefix(subject: str) -> bool:
    """Tell whether the subject opens with a bracketed prefix such as ``[WIP]``.

    The brackets must contain at least one character and must sit at the very
    start of the subject.
    """
    bracket_prefix = re.compile(r'^\[[^\]]+\]')
    return bracket_prefix.match(subject) is not None


def has_tag(subject: str) -> bool:
    """Tell whether the subject begins with a ``#`` character (e.g. ``#tag``)."""
    leading_char = subject[:1]
    return leading_char == '#'


def is_imperative_mood(subject: str) -> bool:
    """Check if a subject line uses the imperative mood.

    Simple heuristic: after removing a leading ``type(scope)!:`` prefix, the
    first word must be one of a fixed set of common imperative verbs. Past
    tense or gerund forms ("added", "fixing") are not in that set and are
    therefore rejected like any other unknown word.

    Args:
        subject: The commit subject line.

    Returns:
        True if the first meaningful word is a known imperative verb.
    """
    imperative_verbs = frozenset((
        'add', 'fix', 'update', 'remove', 'delete', 'create', 'implement',
        'change', 'improve', 'optimize', 'refactor', 'enhance', 'correct',
        'resolve', 'merge', 'bump', 'revert', 'document', 'upgrade',
        'downgrade', 'rename', 'move', 'replace', 'extract', 'simplify',
    ))

    text = subject.lower()

    # Strip only a genuine leading "type(scope)!:" prefix. Splitting on the
    # first colon anywhere in the subject misjudged messages such as
    # "Fix crash in parser: handle EOF", which were evaluated on "handle".
    text = re.sub(r'^[a-z]+(\([^)]+\))?!?:\s*', '', text, count=1)

    tokens = text.split()
    if not tokens:
        return False

    return tokens[0] in imperative_verbs


def is_capitalized(subject: str) -> bool:
    """Check if the subject's description starts with an uppercase letter.

    A leading ``type(scope)!:`` prefix is removed first, so ``feat: Add x``
    is judged on "Add x". Subjects without such a prefix are judged on their
    first character as written.

    Args:
        subject: The commit subject line.

    Returns:
        True if the first character of the description is uppercase; False
        for an empty description.
    """
    # Strip only a leading "type(scope)!:" prefix. Splitting on the first
    # colon anywhere misjudged subjects that merely contain a colon, e.g.
    # "Update docs at 10:30" was evaluated on "30".
    text = re.sub(r'^[A-Za-z]+(\([^)]+\))?!?:\s*', '', subject, count=1)

    # Slicing keeps the empty-string case safe: ''.isupper() is False.
    return text[:1].isupper()


def has_no_period_end(subject: str) -> bool:
    """Tell whether the subject's final character is something other than ``.``.

    An empty subject counts as having no trailing period.
    """
    final_char = subject[-1:]
    return final_char != '.'


def has_blank_line_before_body(full_message: str) -> bool:
    """Tell whether the second line of the message is blank.

    Messages with fewer than three lines are accepted unconditionally: they
    are too short to hold a subject, a separator and a body.
    """
    message_lines = full_message.split('\n')

    if len(message_lines) >= 3:
        # Whitespace-only counts as blank.
        return not message_lines[1].strip()

    return True


def is_body_wrapped(body: str, max_width: int = 72) -> bool:
    """Tell whether every body line fits within ``max_width`` characters.

    Lines whose stripped text starts with a bullet marker (``-``, ``*``,
    ``•``) or a URL scheme (``http://``, ``https://``) are exempt from the
    limit. An empty body is considered wrapped.
    """
    if not body:
        return True

    exempt_starts = ('-', '*', '•', 'http://', 'https://')

    return all(
        len(row) <= max_width or row.strip().startswith(exempt_starts)
        for row in body.split('\n')
    )


def has_footer(full_message: str) -> bool:
    """Check if a commit message contains a recognised footer marker.

    Recognised markers: ``BREAKING CHANGE:`` / ``BREAKING-CHANGE:``,
    ``Closes #N``, ``Fixes #N``, ``Refs #N``, ``Co-authored-by:`` and
    ``Signed-off-by:``. The two trailer keys are matched case-insensitively
    because they are commonly written with other capitalisation (e.g.
    ``Co-Authored-By:``). Markers are searched for anywhere in the message,
    not only in its final paragraph.

    Args:
        full_message: Subject and body of the commit.

    Returns:
        True if any marker is found.
    """
    footer_patterns = (
        # Both spellings, matching what mentions_breaking() accepts.
        r'BREAKING[ -]CHANGE:',
        r'Closes #\d+',
        r'Fixes #\d+',
        r'Refs #\d+',
        r'(?i:Co-authored-by:)',
        r'(?i:Signed-off-by:)',
    )

    return any(re.search(pattern, full_message) for pattern in footer_patterns)


def references_issues(full_message: str) -> bool:
    """Check if a commit message references an issue.

    A message counts as referencing an issue when it contains ``#<digits>``
    or one of the keywords ``closes`` / ``fixes`` / ``refs`` (first letter in
    either case) as a whole word. A bare keyword without a number still
    counts.

    Args:
        full_message: Subject and body of the commit.

    Returns:
        True if a reference or keyword is found.
    """
    # Word boundaries stop the keywords from matching inside unrelated words
    # such as "prefixes", "discloses" or "prefs".
    pattern = r'#\d+|\b(?:[Cc]loses|[Ff]ixes|[Rr]efs)\b'
    return bool(re.search(pattern, full_message))


def mentions_breaking(full_message: str) -> bool:
    """Check if a commit message flags a breaking change.

    A breaking change is recognised from either:

    * a ``BREAKING CHANGE:`` or ``BREAKING-CHANGE:`` marker anywhere in the
      message, or
    * a ``!`` directly before the colon of a ``type(scope)!: ...`` header on
      the first line.

    Args:
        full_message: Subject and body of the commit.

    Returns:
        True if either form is present.
    """
    if 'BREAKING CHANGE:' in full_message or 'BREAKING-CHANGE:' in full_message:
        return True

    # Only the first line can carry the header "!" marker.
    header = full_message.split('\n', 1)[0]
    return bool(re.match(r'^[a-z]+(\([^)]+\))?!: ', header))


def has_co_authors(full_message: str) -> bool:
    """Check if a commit message carries a ``Co-authored-by:`` trailer.

    The key is matched case-insensitively so that other capitalisations of
    the trailer (e.g. ``Co-Authored-By:``) are counted as well.

    Args:
        full_message: Subject and body of the commit.

    Returns:
        True if the trailer key occurs anywhere in the message.
    """
    return bool(re.search(r'co-authored-by:', full_message, re.IGNORECASE))


def is_signed_off(full_message: str) -> bool:
    """Tell whether the message contains the exact text ``Signed-off-by:``."""
    marker_position = full_message.find('Signed-off-by:')
    return marker_position >= 0


def includes_rationale(body: str) -> bool:
    """Check if a commit body contains wording that suggests a rationale.

    Heuristic: the body contains one of the words/phrases ``because``,
    ``to``, ``for``, ``why``, ``since``, ``as`` or ``in order to`` as a whole
    word, in any letter case.

    Args:
        body: The commit body (may be empty).

    Returns:
        True if a rationale word is found; False for an empty body.
    """
    if not body:
        return False

    # Whole-word matching: plain substring tests such as 'as ' also fired on
    # "was ", "has " or "alias ", and missed a keyword at the end of a line.
    pattern = r'\b(?:because|to|for|why|since|as|in order to)\b'
    return bool(re.search(pattern, body, re.IGNORECASE))


def mentions_impact(body: str) -> bool:
    """Check if a commit body contains wording that suggests an impact.

    Heuristic: the body contains a word that starts with one of the stems
    ``affect``, ``impact``, ``change``, ``improve``, ``break`` or ``fix``
    (so "fixes", "improved" and "breaking" count), in any letter case.

    Args:
        body: The commit body (may be empty).

    Returns:
        True if such a word is found; False for an empty body.
    """
    if not body:
        return False

    # Anchor the stem to the start of a word: plain substring tests also
    # fired on unrelated words such as "prefix", "suffix" or "exchange".
    pattern = r'\b(?:affect|impact|change|improve|break|fix)'
    return bool(re.search(pattern, body, re.IGNORECASE))


def _percentage(count: int, denominator: int) -> float:
    """Return count/denominator as a percentage rounded to one decimal; 0 when denominator is not positive."""
    return round((count / denominator * 100), 1) if denominator > 0 else 0


def _strength_label(percentage: float) -> str:
    """Map a percentage to the qualitative strength label used in the report."""
    thresholds = (
        (95, "perfect"),
        (80, "strong"),
        (65, "dominant"),
        (45, "common"),
        (25, "moderate"),
        (10, "occasional"),
        (1, "rare"),
    )
    for minimum, label in thresholds:
        if percentage >= minimum:
            return label
    return "absent"


def analyze_patterns(commits: List[Dict[str, str]]) -> Dict:
    """Analyze commit patterns and return results.

    Args:
        commits: Commit dicts with the keys ``subject``, ``body`` and
            ``full`` (the shape produced by ``get_commits``).

    Returns:
        A dict with:

        * ``commits_analyzed`` - number of commits examined.
        * ``patterns`` - ``{'format': ..., 'conventions': ..., 'content': ...}``
          where each category maps every known pattern name to
          ``{'count', 'percentage', 'strength'}``. Every pattern is always
          present, including those with a count of zero, so the output has
          the same keys on every run.
        * ``consistency_score`` - integer, weighted 40% conventional-commit
          share, 40% mean of imperative / capitalized / no-period shares and
          20% mean of issue-reference / rationale shares.
        * ``dominant_pattern`` - the format pattern with the highest count
          (first one encountered wins ties), or ``"unknown"`` when there are
          no commits.
    """
    total = len(commits)

    # Fixed catalogue of reported patterns. Reporting from this list rather
    # than from whatever happened to be counted keeps zero-count patterns in
    # the output (with strength "absent") instead of omitting them.
    pattern_names = {
        'format': (
            'conventional_commits', 'prefixed', 'tagged', 'simple_subject',
        ),
        'conventions': (
            'imperative_mood', 'capitalized_subject', 'no_period_end',
            'blank_line_before_body', 'wrapped_body', 'has_footer',
        ),
        'content': (
            'references_issues', 'mentions_breaking', 'has_co_authors',
            'signed_off', 'includes_rationale', 'mentions_impact',
        ),
    }

    # These patterns only make sense for commits that have a body, so their
    # percentage is taken relative to commits_with_body rather than total.
    body_only_patterns = ('blank_line_before_body', 'wrapped_body')

    counters = {category: defaultdict(int) for category in pattern_names}
    commits_with_body = 0

    for commit in commits:
        subject = commit['subject']
        body = commit['body']
        full = commit['full']

        # Format patterns: mutually exclusive, first match wins.
        if is_conventional_commit(subject):
            counters['format']['conventional_commits'] += 1
        elif has_prefix(subject):
            counters['format']['prefixed'] += 1
        elif has_tag(subject):
            counters['format']['tagged'] += 1
        else:
            counters['format']['simple_subject'] += 1

        # Convention patterns
        if is_imperative_mood(subject):
            counters['conventions']['imperative_mood'] += 1

        if is_capitalized(subject):
            counters['conventions']['capitalized_subject'] += 1

        if has_no_period_end(subject):
            counters['conventions']['no_period_end'] += 1

        if body:
            commits_with_body += 1
            if has_blank_line_before_body(full):
                counters['conventions']['blank_line_before_body'] += 1

            if is_body_wrapped(body):
                counters['conventions']['wrapped_body'] += 1

        if has_footer(full):
            counters['conventions']['has_footer'] += 1

        # Content patterns
        if references_issues(full):
            counters['content']['references_issues'] += 1

        if mentions_breaking(full):
            counters['content']['mentions_breaking'] += 1

        if has_co_authors(full):
            counters['content']['has_co_authors'] += 1

        if is_signed_off(full):
            counters['content']['signed_off'] += 1

        if includes_rationale(body):
            counters['content']['includes_rationale'] += 1

        if mentions_impact(body):
            counters['content']['mentions_impact'] += 1

    # Build results in catalogue order.
    results = {}
    for category, names in pattern_names.items():
        category_results = {}
        for pattern_name in names:
            # .get() so that reading a missing name does not insert it into
            # the defaultdict (insertion order is used for tie-breaking below).
            count = counters[category].get(pattern_name, 0)
            denominator = (
                commits_with_body if pattern_name in body_only_patterns else total
            )
            percentage = _percentage(count, denominator)
            category_results[pattern_name] = {
                'count': count,
                'percentage': percentage,
                'strength': _strength_label(percentage)
            }
        results[category] = category_results

    # Calculate consistency score
    # Weight: format(40), conventions(40), content(20)
    format_score = results['format']['conventional_commits']['percentage']
    convention_scores = [
        results['conventions'][name]['percentage']
        for name in ('imperative_mood', 'capitalized_subject', 'no_period_end')
    ]
    avg_convention = sum(convention_scores) / len(convention_scores)
    content_scores = [
        results['content'][name]['percentage']
        for name in ('references_issues', 'includes_rationale')
    ]
    avg_content = sum(content_scores) / len(content_scores)

    # int() truncates toward zero; the score is reported as a whole number.
    consistency_score = int(format_score * 0.4 + avg_convention * 0.4 + avg_content * 0.2)

    # Determine dominant pattern from the observed counts only. The counter
    # keeps first-seen order, so on a tie the format seen first wins.
    format_counts = counters['format']
    dominant_pattern = (
        max(format_counts, key=format_counts.get) if format_counts else "unknown"
    )

    return {
        'commits_analyzed': total,
        'patterns': results,
        'consistency_score': consistency_score,
        'dominant_pattern': dominant_pattern
    }


def main():
    """Command-line entry point.

    Parses ``--count``, ``--branch`` and ``--detailed``, verifies that the
    working directory is a git repository with history, analyzes the
    requested commits and prints the report as indented JSON on stdout.

    Errors are printed as a JSON object on stderr and the process exits with
    1 (not a git repository), 2 (no commit history) or 3 (no commits could
    be fetched). On success the process exits with 0.
    """
    cli = argparse.ArgumentParser(description='Detect commit message patterns')
    cli.add_argument('--count', type=int, default=50, help='Number of commits to analyze')
    cli.add_argument('--branch', default='HEAD', help='Branch to analyze')
    cli.add_argument('--detailed', action='store_true', help='Include detailed breakdown')
    options = cli.parse_args()

    def fail(message, exit_code):
        # Report the problem as JSON on stderr, then stop with exit_code.
        print(json.dumps({'error': message}), file=sys.stderr)
        sys.exit(exit_code)

    # Preconditions: inside a repository that has at least one commit.
    if not is_git_repository():
        fail('Not in a git repository', 1)

    if not has_commits():
        fail('No commit history found', 2)

    commits = get_commits(options.count, options.branch)
    if not commits:
        fail('Failed to fetch commits', 3)

    report = analyze_patterns(commits)
    report['branch'] = options.branch
    report['detailed'] = options.detailed

    print(json.dumps(report, indent=2))
    sys.exit(0)


# Run the CLI only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()