#!/usr/bin/env python3
"""
Split Analyzer - Determine if changes should be split

Purpose: Analyze git changes to determine if they should be split into multiple commits
Version: 1.0.0
Usage: ./split-analyzer.py [--verbose] [--threshold N] [--json]
Returns:
  Exit 0: Should split
  Exit 1: Already atomic
  Exit 2: Error occurred

Dependencies: git, python3
"""

import argparse
import json
import re
import subprocess
import sys
from collections import defaultdict
from typing import Dict, List, Optional, Tuple

# Conventional commit types
COMMIT_TYPES = ['feat', 'fix', 'docs', 'style', 'refactor', 'test', 'chore', 'perf', 'ci', 'build']


class SplitAnalyzer:
    """Heuristically decide whether pending git changes should be split.

    Each changed file is assigned a commit type and a scope; the change set
    is flagged for splitting when it spans several types or scopes, touches
    more files than ``threshold``, or mixes concerns.
    """

    def __init__(self, threshold: int = 10, verbose: bool = False):
        """Store the file-count threshold and verbosity; start with empty state."""
        self.threshold = threshold
        self.verbose = verbose
        self.files = []
        self.types = defaultdict(list)   # commit type -> file paths
        self.scopes = defaultdict(list)  # scope name -> file paths
        self.concerns = []

    def log(self, message: str):
        """Print message if verbose mode enabled"""
        if self.verbose:
            print(f"[DEBUG] {message}", file=sys.stderr)

    def _run_git(self, args: List[str]) -> str:
        """Run ``git <args>`` and return stdout; raise on failure."""
        result = subprocess.run(
            ['git'] + args,
            capture_output=True,
            text=True,
            # Diffs may contain bytes that are not valid in the locale
            # encoding; replace them instead of raising UnicodeDecodeError.
            errors='replace',
            check=True,
        )
        return result.stdout

    def run_git_command(self, args: List[str]) -> str:
        """Execute git command and return output.

        Prints an error and exits with status 2 when git fails or cannot be
        started (e.g. the ``git`` executable is missing).
        """
        try:
            return self._run_git(args)
        except (subprocess.CalledProcessError, OSError) as e:
            print(f"Error running git command: {e}", file=sys.stderr)
            sys.exit(2)

    def get_changed_files(self) -> List[str]:
        """Get sorted list of changed files (staged and unstaged)."""
        # -z makes git emit raw, NUL-terminated paths instead of quoting
        # paths that contain special or non-ASCII characters.
        staged = self.run_git_command(['diff', '--cached', '--name-only', '-z'])
        unstaged = self.run_git_command(['diff', '--name-only', '-z'])

        files = set()
        files.update(filter(None, staged.split('\0')))
        files.update(filter(None, unstaged.split('\0')))
        # Sorted so that logs and reported metrics are deterministic.
        return sorted(files)

    def get_file_diff(self, file_path: str) -> str:
        """Get diff for a specific file.

        The staged diff is preferred; the unstaged diff is used when the
        staged one is empty. Best effort: a git failure is reported on
        stderr and an empty string is returned.
        """
        try:
            # '--' keeps git from interpreting the path as a revision
            # (matters for deleted files and ambiguous names).
            diff = self._run_git(['diff', '--cached', '--', file_path])
            if not diff:
                diff = self._run_git(['diff', '--', file_path])
            return diff
        except (subprocess.CalledProcessError, OSError) as e:
            print(f"Error running git command: {e}", file=sys.stderr)
            return ""

    def detect_type_from_diff(self, file_path: str, diff: str) -> str:
        """Detect commit type from file path and diff content.

        Path-based rules (docs, test, ci, build) are checked first, in that
        order. Otherwise keyword heuristics are applied to the added lines
        of the diff. An empty diff yields 'chore'.
        """
        # Documentation files
        if file_path.endswith(('.md', '.txt', '.rst', '.adoc')):
            return 'docs'

        # Test files
        if any(pattern in file_path for pattern in
               ['test/', 'tests/', 'spec/', '__tests__', '.test.', '.spec.']):
            return 'test'

        # CI/CD files
        if any(pattern in file_path for pattern in
               ['.github/', '.gitlab-ci', 'jenkins', '.circleci']):
            return 'ci'

        # Build files
        if file_path.endswith(('package.json', 'pom.xml', 'build.gradle',
                               'Makefile', 'CMakeLists.txt')):
            return 'build'

        # Analyze diff content
        if not diff:
            return 'chore'

        diff_lines = diff.split('\n')
        # '+++' lines are file headers, not added content.
        added_lines = [line for line in diff_lines
                       if line.startswith('+') and not line.startswith('+++')]
        added_text = ' '.join(added_lines)
        added_lower = added_text.lower()

        # Check for function/class additions (new features)
        if any(keyword in added_text for keyword in
               ['function ', 'class ', 'def ', 'const ', 'let ', 'var ']):
            if any(keyword in added_text for keyword in
                   ['new ', 'add', 'implement', 'create']):
                return 'feat'

        # Check for bug fix patterns
        if any(keyword in added_lower for keyword in
               ['fix', 'bug', 'error', 'issue', 'null', 'undefined']):
            return 'fix'

        # Check for refactoring
        if any(keyword in added_lower for keyword in
               ['refactor', 'rename', 'move', 'extract']):
            return 'refactor'

        # Check for performance
        if any(keyword in added_lower for keyword in
               ['performance', 'optimize', 'cache', 'memoize']):
            return 'perf'

        # Check for style changes (formatting only)
        removed_lines = [line for line in diff_lines
                         if line.startswith('-') and not line.startswith('---')]
        if len(added_lines) == len(removed_lines):
            # Equal numbers of additions and deletions might indicate formatting
            return 'style'

        # Default to feat for new code, chore for modifications
        if len(added_lines) > len(removed_lines) * 2:
            return 'feat'

        return 'chore'

    def extract_scope_from_path(self, file_path: str) -> str:
        """Extract scope from file path.

        Returns the first path component that is not a common prefix
        (src, lib, ...), without its extension and without leading dots,
        or 'root' when no component qualifies.
        """
        # Skip common prefixes
        skip_prefixes = ['src', 'lib', 'app', 'packages', 'tests', 'test']

        for part in file_path.split('/'):
            if part in skip_prefixes or part in ('.', '..'):
                continue
            # Strip leading dots first so '.github' / '.gitignore' give
            # 'github' / 'gitignore' instead of an empty scope.
            scope = part.lstrip('.').split('.')[0]
            if scope:
                return scope

        return 'root'

    def detect_mixed_concerns(self) -> List[str]:
        """Detect mixed concerns in changes, based on self.types and self.scopes."""
        concerns = []

        # Check for feature + unrelated changes
        has_feature = 'feat' in self.types
        has_refactor = 'refactor' in self.types
        has_style = 'style' in self.types

        if has_feature and has_refactor:
            concerns.append("Feature implementation mixed with refactoring")

        if has_feature and has_style:
            concerns.append("Feature implementation mixed with style changes")

        # Check for test + implementation in separate modules
        if 'test' in self.types:
            # Compare scope *names*: the per-scope file lists are unhashable
            # and cannot be placed in a set.
            test_scopes = {scope for scope in self.scopes if 'test' in scope}
            impl_scopes = {scope for scope in self.scopes if 'test' not in scope}

            if test_scopes != impl_scopes and len(impl_scopes) > 1:
                concerns.append("Tests for multiple unrelated implementations")

        return concerns

    def _build_metrics(self) -> Dict:
        """Summarize the current analysis state as a JSON-serializable dict."""
        return {
            'file_count': len(self.files),
            'types_detected': list(self.types.keys()),
            'type_counts': {t: len(files) for t, files in self.types.items()},
            'scopes_detected': list(self.scopes.keys()),
            'scope_counts': {s: len(files) for s, files in self.scopes.items()},
            'concerns': self.concerns,
            'threshold': self.threshold,
        }

    def analyze(self) -> Tuple[bool, str, Dict]:
        """Analyze changes and determine if should split.

        Returns:
            (should_split, reason, metrics). ``metrics`` always carries the
            same keys, including when no changes are detected.
        """
        # Start from a clean slate so repeated calls do not accumulate.
        self.types = defaultdict(list)
        self.scopes = defaultdict(list)
        self.concerns = []

        # Get changed files
        self.files = self.get_changed_files()
        self.log(f"Found {len(self.files)} changed files")

        if not self.files:
            return False, "No changes detected", self._build_metrics()

        # Analyze each file
        for file_path in self.files:
            self.log(f"Analyzing: {file_path}")

            diff = self.get_file_diff(file_path)
            file_type = self.detect_type_from_diff(file_path, diff)
            scope = self.extract_scope_from_path(file_path)

            self.types[file_type].append(file_path)
            self.scopes[scope].append(file_path)

            self.log(f"  Type: {file_type}, Scope: {scope}")

        # Check splitting criteria
        reasons = []

        # Check 1: Multiple types
        if len(self.types) > 1:
            type_list = ', '.join(self.types.keys())
            reasons.append(f"Multiple types detected: {type_list}")
            self.log(f"SPLIT REASON: Multiple types: {type_list}")

        # Check 2: Multiple scopes
        if len(self.scopes) > 1:
            scope_list = ', '.join(self.scopes.keys())
            reasons.append(f"Multiple scopes detected: {scope_list}")
            self.log(f"SPLIT REASON: Multiple scopes: {scope_list}")

        # Check 3: Too many files
        if len(self.files) > self.threshold:
            reasons.append(f"Large change: {len(self.files)} files (threshold: {self.threshold})")
            self.log(f"SPLIT REASON: Too many files: {len(self.files)} > {self.threshold}")

        # Check 4: Mixed concerns
        self.concerns = self.detect_mixed_concerns()
        if self.concerns:
            reasons.append(f"Mixed concerns: {'; '.join(self.concerns)}")
            self.log("SPLIT REASON: Mixed concerns detected")

        # Prepare detailed metrics
        metrics = self._build_metrics()

        # Determine result
        if reasons:
            should_split = True
            reason = '; '.join(reasons)
            self.log(f"RECOMMENDATION: Should split - {reason}")
        else:
            should_split = False
            reason = "Changes are atomic - single logical unit"
            self.log("RECOMMENDATION: Already atomic")

        return should_split, reason, metrics


def main():
    """Command-line entry point.

    Exits with 0 when the changes should be split, 1 when they are atomic
    (or there are no changes), and 2 when an error occurred.
    """
    parser = argparse.ArgumentParser(description='Analyze if git changes should be split')
    parser.add_argument('--verbose', action='store_true', help='Verbose output')
    parser.add_argument('--threshold', type=int, default=10, help='File count threshold')
    parser.add_argument('--json', action='store_true', help='Output JSON format')

    args = parser.parse_args()

    analyzer = SplitAnalyzer(threshold=args.threshold, verbose=args.verbose)
    try:
        should_split, reason, metrics = analyzer.analyze()
    except Exception as e:
        # An uncaught exception would exit with status 1, which callers
        # read as "already atomic"; report it as an error instead.
        print(f"Error analyzing changes: {e}", file=sys.stderr)
        sys.exit(2)

    if args.json:
        # Output JSON format
        result = {
            'should_split': should_split,
            'reason': reason,
            'metrics': metrics,
            'recommendation': 'split' if should_split else 'atomic'
        }
        print(json.dumps(result, indent=2))
    else:
        # Output human-readable format
        print(f"Should split: {'YES' if should_split else 'NO'}")
        print(f"Reason: {reason}")
        print("\nMetrics:")
        print(f"  Files: {metrics['file_count']}")
        print(f"  Types: {', '.join(metrics['types_detected'])}")
        print(f"  Scopes: {', '.join(metrics['scopes_detected'])}")
        if metrics['concerns']:
            print(f"  Concerns: {', '.join(metrics['concerns'])}")

    # Exit code: 0 = should split, 1 = atomic
    sys.exit(0 if should_split else 1)


if __name__ == '__main__':
    main()