Initial commit

This commit is contained in:
Zhongwei Li
2025-11-29 18:16:40 +08:00
commit f125e90b9f
370 changed files with 67769 additions and 0 deletions

View File

@@ -0,0 +1,411 @@
"""
Code Quality Analyzer
Analyzes code for:
- Cyclomatic complexity
- Code duplication
- Code smells
- File/function length
- Language-specific issues (TypeScript/JavaScript)
"""
import re
from pathlib import Path
from typing import Dict, List
def analyze(codebase_path: Path, metadata: Dict) -> List[Dict]:
    """
    Analyze codebase for code quality issues.

    Args:
        codebase_path: Path to codebase
        metadata: Project metadata from discovery phase

    Returns:
        List of findings with severity, location, and remediation info
    """
    stack = metadata.get('tech_stack', {})
    results: List[Dict] = []
    # Language-specific passes, driven by the discovered tech stack.
    if stack.get('javascript') or stack.get('typescript'):
        results += analyze_javascript_typescript(codebase_path)
    if stack.get('python'):
        results += analyze_python(codebase_path)
    # Language-agnostic passes run unconditionally.
    results += analyze_file_sizes(codebase_path)
    results += analyze_dead_code(codebase_path, stack)
    return results
def analyze_javascript_typescript(codebase_path: Path) -> List[Dict]:
    """Analyze JavaScript/TypeScript specific quality issues.

    Walks every .js/.jsx/.ts/.tsx file under ``codebase_path`` (skipping
    vendored and build directories) and runs all per-file checkers on it.

    Args:
        codebase_path: Root of the codebase to scan.

    Returns:
        Combined list of finding dicts from all checkers.
    """
    findings: List[Dict] = []
    extensions = {'.js', '.jsx', '.ts', '.tsx'}
    exclude_dirs = {'node_modules', '.git', 'dist', 'build', '.next', 'coverage'}
    for file_path in codebase_path.rglob('*'):
        # is_file() guards against directories that happen to carry a
        # source-like suffix (e.g. a folder named "utils.js").
        if (file_path.is_file() and
                file_path.suffix in extensions and
                not any(excluded in file_path.parts for excluded in exclude_dirs)):
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
                lines = content.split('\n')
                # 'any'-type check only makes sense for TypeScript sources.
                if file_path.suffix in {'.ts', '.tsx'}:
                    findings.extend(check_any_usage(file_path, content, lines))
                findings.extend(check_var_usage(file_path, content, lines))
                findings.extend(check_console_log(file_path, content, lines))
                findings.extend(check_loose_equality(file_path, content, lines))
                findings.extend(check_complexity(file_path, content, lines))
                findings.extend(check_function_length(file_path, content, lines))
            except Exception:
                # Best-effort scan: an unreadable file or a checker error on
                # one file must not abort the whole analysis.
                continue
    return findings
def check_any_usage(file_path: Path, content: str, lines: List[str]) -> List[Dict]:
    """Check for TypeScript 'any' type usage.

    Args:
        file_path: File being scanned (used for the finding's location).
        content: Full file text (unused; kept for checker-signature parity).
        lines: File text split into lines.

    Returns:
        One 'medium' finding per non-comment line that uses the ``any`` type.
    """
    findings: List[Dict] = []
    # ':<ws>any' annotations, generic '<any>' (which subsumes 'Array<any>'),
    # 'any[]' arrays, and 'as any' casts; \b avoids identifiers like 'company'.
    any_pattern = re.compile(r':\s*any\b|<any>|\bany\[\]|\bas\s+any\b')
    # Path shown in findings.  The original code anchored on an 'annex' path
    # component and raised ValueError (silently skipping the file upstream)
    # whenever that component was absent; fall back to the raw path.
    parts = file_path.parts
    if 'annex' in parts:
        root = file_path.parents[len(parts) - parts.index('annex') - 2]
        rel_file = str(file_path.relative_to(root))
    else:
        rel_file = str(file_path)
    for line_num, line in enumerate(lines, start=1):
        stripped = line.strip()
        # Skip single-line comments, block-comment openers, and JSDoc
        # continuation lines.
        if stripped.startswith(('//', '/*', '*')):
            continue
        if any_pattern.search(line):
            findings.append({
                'severity': 'medium',
                'category': 'code_quality',
                'subcategory': 'typescript_strict_mode',
                'title': "Use of 'any' type violates TypeScript strict mode",
                'description': f"Found 'any' type on line {line_num}",
                'file': rel_file,
                'line': line_num,
                'code_snippet': stripped,
                'impact': 'Reduces type safety and defeats the purpose of TypeScript',
                'remediation': 'Replace "any" with specific types or use "unknown" with type guards',
                'effort': 'low',
            })
    return findings
def check_var_usage(file_path: Path, content: str, lines: List[str]) -> List[Dict]:
    """Check for 'var' keyword usage.

    Args:
        file_path: File being scanned (used for the finding's location).
        content: Full file text (unused; kept for checker-signature parity).
        lines: File text split into lines.

    Returns:
        One 'low' finding per non-comment line declaring with 'var'.
    """
    findings: List[Dict] = []
    var_pattern = re.compile(r'\bvar\s+\w+')
    # Path shown in findings.  The original code anchored on an 'annex' path
    # component and raised ValueError (silently skipping the file upstream)
    # whenever that component was absent; fall back to the raw path.
    parts = file_path.parts
    if 'annex' in parts:
        root = file_path.parents[len(parts) - parts.index('annex') - 2]
        rel_file = str(file_path.relative_to(root))
    else:
        rel_file = str(file_path)
    for line_num, line in enumerate(lines, start=1):
        stripped = line.strip()
        # Skip single-line comments and block-comment openers.
        if stripped.startswith('//') or stripped.startswith('/*'):
            continue
        if var_pattern.search(line):
            findings.append({
                'severity': 'low',
                'category': 'code_quality',
                'subcategory': 'modern_javascript',
                'title': "Use of 'var' keyword is deprecated",
                'description': f"Found 'var' keyword on line {line_num}",
                'file': rel_file,
                'line': line_num,
                'code_snippet': stripped,
                'impact': 'Function-scoped variables can lead to bugs; block-scoped (let/const) is preferred',
                'remediation': "Replace 'var' with 'const' (for values that don't change) or 'let' (for values that change)",
                'effort': 'low',
            })
    return findings
def check_console_log(file_path: Path, content: str, lines: List[str]) -> List[Dict]:
    """Check for console statements in production code.

    Test files (name containing 'test' or 'spec', or under '__tests__')
    are exempt.

    Args:
        file_path: File being scanned (used for test-file detection and the
            finding's location).
        content: Full file text (unused; kept for checker-signature parity).
        lines: File text split into lines.

    Returns:
        One 'medium' finding per non-comment line calling console.*.
    """
    findings: List[Dict] = []
    # Console statements are expected in tests; skip those files entirely.
    if 'test' in file_path.name or 'spec' in file_path.name or '__tests__' in str(file_path):
        return findings
    console_pattern = re.compile(r'\bconsole\.(log|debug|info|warn|error)\(')
    # Path shown in findings.  The original code anchored on an 'annex' path
    # component and raised ValueError (silently skipping the file upstream)
    # whenever that component was absent; fall back to the raw path.
    parts = file_path.parts
    if 'annex' in parts:
        root = file_path.parents[len(parts) - parts.index('annex') - 2]
        rel_file = str(file_path.relative_to(root))
    else:
        rel_file = str(file_path)
    for line_num, line in enumerate(lines, start=1):
        stripped = line.strip()
        if stripped.startswith('//'):
            continue
        if console_pattern.search(line):
            findings.append({
                'severity': 'medium',
                'category': 'code_quality',
                'subcategory': 'production_code',
                'title': 'Console statement in production code',
                'description': f"Found console statement on line {line_num}",
                'file': rel_file,
                'line': line_num,
                'code_snippet': stripped,
                'impact': 'Console statements should not be in production code; use proper logging',
                'remediation': 'Remove console statement or replace with proper logging framework',
                'effort': 'low',
            })
    return findings
def check_loose_equality(file_path: Path, content: str, lines: List[str]) -> List[Dict]:
    """Check for loose equality operators (== / != instead of === / !==).

    Args:
        file_path: File being scanned (used for the finding's location).
        content: Full file text (unused; kept for checker-signature parity).
        lines: File text split into lines.

    Returns:
        One 'low' finding per non-comment line using '==' or '!='.
    """
    findings: List[Dict] = []
    # Lookarounds exclude strict operators: the original character-class
    # pattern matched the *second* '==' inside '===' (preceded by '=') and
    # therefore flagged strict equality, and it also missed '==' at column 0.
    loose_eq_pattern = re.compile(r'(?<![=!<>])==(?!=)|(?<![=<>])!=(?!=)')
    # Path shown in findings.  The original code anchored on an 'annex' path
    # component and raised ValueError (silently skipping the file upstream)
    # whenever that component was absent; fall back to the raw path.
    parts = file_path.parts
    if 'annex' in parts:
        root = file_path.parents[len(parts) - parts.index('annex') - 2]
        rel_file = str(file_path.relative_to(root))
    else:
        rel_file = str(file_path)
    for line_num, line in enumerate(lines, start=1):
        stripped = line.strip()
        if stripped.startswith('//') or stripped.startswith('/*'):
            continue
        if loose_eq_pattern.search(line):
            findings.append({
                'severity': 'low',
                'category': 'code_quality',
                'subcategory': 'code_smell',
                'title': 'Loose equality operator used',
                'description': f"Found '==' or '!=' on line {line_num}, should use '===' or '!=='",
                'file': rel_file,
                'line': line_num,
                'code_snippet': stripped,
                'impact': 'Loose equality can lead to unexpected type coercion bugs',
                'remediation': "Replace '==' with '===' and '!=' with '!=='",
                'effort': 'low',
            })
    return findings
def check_complexity(file_path: Path, content: str, lines: List[str]) -> List[Dict]:
    """
    Check cyclomatic complexity (simplified, line-based heuristic).

    Counts decision points per function: if / else-if, while, for, case,
    catch, &&, || and the ternary '?'.  This is not a parser; results are
    approximate.

    Args:
        file_path: File being scanned (used for the finding's location).
        content: Full file text (unused; kept for checker-signature parity).
        lines: File text split into lines.

    Returns:
        Findings for every function whose estimated complexity exceeds 10.
    """
    findings: List[Dict] = []
    # Function starts: declarations, arrow-function consts, or 'name(...) {'.
    # The negative lookahead keeps control-flow keywords ('if (...) {',
    # 'for (...) {', ...) from being mistaken for a new function, which in the
    # original reset the complexity counter at every braced conditional.
    func_pattern = re.compile(
        r'function\s+\w+'
        r'|const\s+\w+\s*=\s*\([^)]*\)\s*=>'
        r'|\b(?!(?:if|for|while|switch|catch|return)\b)\w+\s*\([^)]*\)\s*{')
    # Ternary '?' only: skip optional chaining '?.', nullish '??' and
    # TypeScript optional markers '?:'.
    ternary_pattern = re.compile(r'(?<!\?)\?(?![?.:])')
    # Path shown in findings.  The original code anchored on an 'annex' path
    # component and raised ValueError (silently skipping the file upstream)
    # whenever that component was absent; fall back to the raw path.
    parts = file_path.parts
    if 'annex' in parts:
        root = file_path.parents[len(parts) - parts.index('annex') - 2]
        rel_file = str(file_path.relative_to(root))
    else:
        rel_file = str(file_path)

    current_function = None
    current_function_line = 0
    brace_depth = 0
    complexity = 0

    def _flush() -> None:
        # Report the function just finished if it crossed the threshold.
        if current_function and complexity > 10:
            severity = ('critical' if complexity > 20
                        else 'high' if complexity > 15 else 'medium')
            findings.append({
                'severity': severity,
                'category': 'code_quality',
                'subcategory': 'complexity',
                'title': f'High cyclomatic complexity ({complexity})',
                'description': f'Function has complexity of {complexity}',
                'file': rel_file,
                'line': current_function_line,
                'code_snippet': current_function,
                'impact': 'High complexity makes code difficult to understand, test, and maintain',
                'remediation': 'Refactor into smaller functions, extract complex conditions',
                'effort': 'medium' if complexity < 20 else 'high',
            })

    for line_num, line in enumerate(lines, start=1):
        stripped = line.strip()
        # Track braces to stay inside function boundaries.
        brace_depth += stripped.count('{') - stripped.count('}')
        if func_pattern.search(line) and brace_depth >= 1:
            _flush()
            current_function = stripped
            current_function_line = line_num
            complexity = 1  # base complexity
        if current_function:
            # NOTE: 'else if' already contains 'if ', so counting 'if '
            # alone avoids the original's double-count of else-if branches.
            complexity += stripped.count('if ')
            complexity += stripped.count('while ')
            complexity += stripped.count('for ')
            complexity += stripped.count('case ')
            complexity += stripped.count('catch ')
            complexity += stripped.count('&&')
            complexity += stripped.count('||')
            complexity += len(ternary_pattern.findall(stripped))
    # The last function in the file was previously never reported because
    # findings were only flushed when a *new* function started.
    _flush()
    return findings
def check_function_length(file_path: Path, content: str, lines: List[str]) -> List[Dict]:
    """Check for overly long functions (> 50 lines, 'high' above 100).

    Length is measured from the declaration line to the line on which the
    brace depth returns to zero.  The original only ever emitted a finding
    when a new function started while the previous one was still open, but
    it also cleared the tracked function once its braces closed — so a
    normally terminated long function was never reported.  Findings are now
    flushed when a function ends and once more at end-of-file.

    Args:
        file_path: File being scanned (used for the finding's location).
        content: Full file text (unused; kept for checker-signature parity).
        lines: File text split into lines.

    Returns:
        One finding per function longer than 50 lines.
    """
    findings: List[Dict] = []
    # Negative lookahead keeps 'if (...) {' / 'for (...) {' etc. from being
    # mistaken for a new function declaration.
    func_pattern = re.compile(
        r'function\s+\w+'
        r'|const\s+\w+\s*=\s*\([^)]*\)\s*=>'
        r'|\b(?!(?:if|for|while|switch|catch|return)\b)\w+\s*\([^)]*\)\s*{')
    # Path shown in findings.  The original code anchored on an 'annex' path
    # component and raised ValueError (silently skipping the file upstream)
    # whenever that component was absent; fall back to the raw path.
    parts = file_path.parts
    if 'annex' in parts:
        root = file_path.parents[len(parts) - parts.index('annex') - 2]
        rel_file = str(file_path.relative_to(root))
    else:
        rel_file = str(file_path)

    current_function = None
    current_function_line = 0
    function_lines = 0
    brace_depth = 0

    def _flush() -> None:
        # Record a finding for the function that just ended, if too long.
        if current_function and function_lines > 50:
            severity = 'high' if function_lines > 100 else 'medium'
            findings.append({
                'severity': severity,
                'category': 'code_quality',
                'subcategory': 'function_length',
                'title': f'Long function ({function_lines} lines)',
                'description': f'Function is {function_lines} lines long (recommended: < 50)',
                'file': rel_file,
                'line': current_function_line,
                'code_snippet': current_function,
                'impact': 'Long functions are harder to understand, test, and maintain',
                'remediation': 'Extract smaller functions for distinct responsibilities',
                'effort': 'medium',
            })

    for line_num, line in enumerate(lines, start=1):
        stripped = line.strip()
        # Start tracking at the outermost declaration; nested matches while a
        # function is open are part of its body and do not reset the count.
        if current_function is None and func_pattern.search(line):
            current_function = stripped
            current_function_line = line_num
            function_lines = 0
            brace_depth = 0
        if current_function:
            function_lines += 1
            brace_depth += stripped.count('{') - stripped.count('}')
            if brace_depth == 0 and function_lines > 1:
                # Function body closed: report it (if long) and reset.
                _flush()
                current_function = None
    _flush()  # report a function still open at end-of-file
    return findings
def analyze_python(codebase_path: Path) -> List[Dict]:
    """Analyze Python-specific quality issues.

    Placeholder: checks for PEP 8 violations, complexity, and missing type
    hints are planned but not implemented, so this always returns an empty
    list of findings.
    """
    # Intentionally empty until the Python checks are written.
    return []
def analyze_file_sizes(codebase_path: Path) -> List[Dict]:
    """Check for overly large source files (> 500 lines, 'high' above 1000).

    Args:
        codebase_path: Root of the codebase to scan.

    Returns:
        One finding per oversized source file.
    """
    findings: List[Dict] = []
    exclude_dirs = {'node_modules', '.git', 'dist', 'build', '__pycache__'}
    code_extensions = {'.js', '.jsx', '.ts', '.tsx', '.py', '.java', '.go', '.rs'}
    for file_path in codebase_path.rglob('*'):
        if (file_path.is_file() and
                file_path.suffix in code_extensions and
                not any(excluded in file_path.parts for excluded in exclude_dirs)):
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    # Count lazily instead of materializing all lines.
                    line_count = sum(1 for _ in f)
            except OSError:
                # Unreadable file (permissions, broken symlink, ...): skip.
                continue
            if line_count > 500:
                severity = 'high' if line_count > 1000 else 'medium'
                findings.append({
                    'severity': severity,
                    'category': 'code_quality',
                    'subcategory': 'file_length',
                    'title': f'Large file ({line_count} lines)',
                    'description': f'File has {line_count} lines (recommended: < 500)',
                    # Relative to the scanned root; the original 'annex'-
                    # anchored computation raised ValueError for any path
                    # without an 'annex' component.
                    'file': str(file_path.relative_to(codebase_path)),
                    'line': 1,
                    'code_snippet': None,
                    'impact': 'Large files are difficult to navigate and understand',
                    'remediation': 'Split into multiple smaller, focused modules',
                    'effort': 'high',
                })
    return findings
def analyze_dead_code(codebase_path: Path, tech_stack: Dict) -> List[Dict]:
    """Detect potential dead code (commented-out code blocks of 5+ lines).

    Args:
        codebase_path: Root of the codebase to scan.
        tech_stack: Discovery-phase flags; selects which extensions to scan.

    Returns:
        One 'low' finding per run of at least 5 consecutive comment lines
        that look like code.
    """
    findings: List[Dict] = []
    exclude_dirs = {'node_modules', '.git', 'dist', 'build'}
    extensions = set()
    if tech_stack.get('javascript') or tech_stack.get('typescript'):
        extensions.update({'.js', '.jsx', '.ts', '.tsx'})
    if tech_stack.get('python'):
        extensions.add('.py')
    # Heuristic "looks like code" tokens; 'def ' / 'import ' cover Python.
    code_keywords = ('function', 'const', 'let', 'var', 'if', 'for', 'while',
                     '{', '}', ';', 'def ', 'import ')

    def _finding(rel_file: str, size: int, start: int) -> Dict:
        # Build one dead-code finding for a comment block.
        return {
            'severity': 'low',
            'category': 'code_quality',
            'subcategory': 'dead_code',
            'title': f'Commented-out code block ({size} lines)',
            'description': f'Found {size} lines of commented code',
            'file': rel_file,
            'line': start,
            'code_snippet': None,
            'impact': 'Commented code clutters codebase and reduces readability',
            'remediation': 'Remove commented code (it\'s in version control if needed)',
            'effort': 'low',
        }

    for file_path in codebase_path.rglob('*'):
        if (file_path.is_file() and
                file_path.suffix in extensions and
                not any(excluded in file_path.parts for excluded in exclude_dirs)):
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    lines = f.readlines()
            except OSError:
                # Unreadable file (permissions, broken symlink, ...): skip.
                continue
            # Python uses '#' comments; the original only recognized '//'
            # and therefore never found dead code in .py files.
            comment_prefix = '#' if file_path.suffix == '.py' else '//'
            # Relative to the scanned root; the original 'annex'-anchored
            # computation raised ValueError for paths without an 'annex'
            # component (silently skipping the file).
            rel_file = str(file_path.relative_to(codebase_path))
            comment_block_size = 0
            block_start_line = 0
            for line_num, line in enumerate(lines, start=1):
                stripped = line.strip()
                if (stripped.startswith(comment_prefix) and
                        any(keyword in stripped for keyword in code_keywords)):
                    if comment_block_size == 0:
                        block_start_line = line_num
                    comment_block_size += 1
                    continue
                # End of a comment block: report if 5+ lines of commented code.
                if comment_block_size >= 5:
                    findings.append(_finding(rel_file, comment_block_size, block_start_line))
                comment_block_size = 0
            # A block running to end-of-file was previously never reported.
            if comment_block_size >= 5:
                findings.append(_finding(rel_file, comment_block_size, block_start_line))
    return findings