Initial commit

Zhongwei Li committed on 2025-11-29 18:16:40 +08:00
commit f125e90b9f
370 changed files with 67769 additions and 0 deletions

View File

@@ -0,0 +1,8 @@
"""
Analyzer modules for codebase auditing.
Each analyzer implements an analyze(codebase_path, metadata) function
that returns a list of findings.
"""
__version__ = '1.0.0'
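
As a quick illustration of the contract described in the docstring above, here is a minimal sketch of how an orchestrator might invoke an analyzer; the `analyzers.code_quality` import path and the metadata shape are assumptions for illustration:

from pathlib import Path
from analyzers import code_quality  # assumed module name for the analyzer below

# Hypothetical metadata as produced by a discovery phase
metadata = {'tech_stack': {'typescript': True, 'javascript': True}}
for finding in code_quality.analyze(Path('/path/to/project'), metadata):
    print(finding['severity'], finding['title'], finding.get('file'))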

View File

@@ -0,0 +1,411 @@
"""
Code Quality Analyzer
Analyzes code for:
- Cyclomatic complexity
- Code duplication
- Code smells
- File/function length
- Language-specific issues (TypeScript/JavaScript)
"""
import re
from pathlib import Path
from typing import Dict, List
def analyze(codebase_path: Path, metadata: Dict) -> List[Dict]:
"""
Analyze codebase for code quality issues.
Args:
codebase_path: Path to codebase
metadata: Project metadata from discovery phase
Returns:
List of findings with severity, location, and remediation info
"""
findings = []
# Determine which languages to analyze
tech_stack = metadata.get('tech_stack', {})
if tech_stack.get('javascript') or tech_stack.get('typescript'):
findings.extend(analyze_javascript_typescript(codebase_path))
if tech_stack.get('python'):
findings.extend(analyze_python(codebase_path))
# General analysis (language-agnostic)
findings.extend(analyze_file_sizes(codebase_path))
findings.extend(analyze_dead_code(codebase_path, tech_stack))
return findings
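
For example, metadata shaped like the following (inferred from the lookups above; values illustrative) triggers the JavaScript/TypeScript checks plus the two language-agnostic passes:

metadata = {'tech_stack': {'javascript': True, 'typescript': True, 'python': False}}
findings = analyze(Path('path/to/project'), metadata)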
def analyze_javascript_typescript(codebase_path: Path) -> List[Dict]:
"""Analyze JavaScript/TypeScript specific quality issues."""
findings = []
extensions = {'.js', '.jsx', '.ts', '.tsx'}
exclude_dirs = {'node_modules', '.git', 'dist', 'build', '.next', 'coverage'}
for file_path in codebase_path.rglob('*'):
if (file_path.suffix in extensions and
not any(excluded in file_path.parts for excluded in exclude_dirs)):
try:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
lines = content.split('\n')
                # Check for TypeScript 'any' type
                if file_path.suffix in {'.ts', '.tsx'}:
                    findings.extend(check_any_usage(file_path, codebase_path, lines))
                # Check for 'var' keyword
                findings.extend(check_var_usage(file_path, codebase_path, lines))
                # Check for console.log statements
                findings.extend(check_console_log(file_path, codebase_path, lines))
                # Check for loose equality
                findings.extend(check_loose_equality(file_path, codebase_path, lines))
                # Check cyclomatic complexity (simplified)
                findings.extend(check_complexity(file_path, codebase_path, lines))
                # Check function length
                findings.extend(check_function_length(file_path, codebase_path, lines))
            except OSError:
                # Skip files that can't be read
                pass
return findings
def check_any_usage(file_path: Path, codebase_path: Path, lines: List[str]) -> List[Dict]:
"""Check for TypeScript 'any' type usage."""
findings = []
    # Pattern to match 'any' in annotations, generics, casts, and arrays (comments skipped below)
    any_pattern = re.compile(r':\s*any\b|<any>|\bas\s+any\b|\bany\[\]')
for line_num, line in enumerate(lines, start=1):
# Skip comments
        if line.strip().startswith(('//', '/*', '*')):
continue
if any_pattern.search(line):
findings.append({
'severity': 'medium',
'category': 'code_quality',
'subcategory': 'typescript_strict_mode',
'title': "Use of 'any' type violates TypeScript strict mode",
'description': f"Found 'any' type on line {line_num}",
                'file': str(file_path.relative_to(codebase_path)),
'line': line_num,
'code_snippet': line.strip(),
'impact': 'Reduces type safety and defeats the purpose of TypeScript',
'remediation': 'Replace "any" with specific types or use "unknown" with type guards',
'effort': 'low',
})
return findings
def check_var_usage(file_path: Path, codebase_path: Path, lines: List[str]) -> List[Dict]:
"""Check for 'var' keyword usage."""
findings = []
var_pattern = re.compile(r'\bvar\s+\w+')
for line_num, line in enumerate(lines, start=1):
        if line.strip().startswith(('//', '/*')):
continue
if var_pattern.search(line):
findings.append({
'severity': 'low',
'category': 'code_quality',
'subcategory': 'modern_javascript',
'title': "Use of 'var' keyword is deprecated",
'description': f"Found 'var' keyword on line {line_num}",
                'file': str(file_path.relative_to(codebase_path)),
'line': line_num,
'code_snippet': line.strip(),
'impact': 'Function-scoped variables can lead to bugs; block-scoped (let/const) is preferred',
'remediation': "Replace 'var' with 'const' (for values that don't change) or 'let' (for values that change)",
'effort': 'low',
})
return findings
def check_console_log(file_path: Path, codebase_path: Path, lines: List[str]) -> List[Dict]:
"""Check for console.log statements in production code."""
findings = []
# Skip if it's in a test file
if 'test' in file_path.name or 'spec' in file_path.name or '__tests__' in str(file_path):
return findings
console_pattern = re.compile(r'\bconsole\.(log|debug|info|warn|error)\(')
for line_num, line in enumerate(lines, start=1):
if line.strip().startswith('//'):
continue
if console_pattern.search(line):
findings.append({
'severity': 'medium',
'category': 'code_quality',
'subcategory': 'production_code',
'title': 'Console statement in production code',
'description': f"Found console statement on line {line_num}",
                'file': str(file_path.relative_to(codebase_path)),
'line': line_num,
'code_snippet': line.strip(),
'impact': 'Console statements should not be in production code; use proper logging',
'remediation': 'Remove console statement or replace with proper logging framework',
'effort': 'low',
})
return findings
def check_loose_equality(file_path: Path, codebase_path: Path, lines: List[str]) -> List[Dict]:
"""Check for loose equality operators (== instead of ===)."""
findings = []
    # Match '==' / '!=' but not the strict operators '===' / '!=='
    loose_eq_pattern = re.compile(r'(?<![=!<>])==(?!=)|(?<!=)!=(?!=)')
for line_num, line in enumerate(lines, start=1):
        if line.strip().startswith(('//', '/*')):
continue
if loose_eq_pattern.search(line):
findings.append({
'severity': 'low',
'category': 'code_quality',
'subcategory': 'code_smell',
'title': 'Loose equality operator used',
'description': f"Found '==' or '!=' on line {line_num}, should use '===' or '!=='",
                'file': str(file_path.relative_to(codebase_path)),
'line': line_num,
'code_snippet': line.strip(),
'impact': 'Loose equality can lead to unexpected type coercion bugs',
'remediation': "Replace '==' with '===' and '!=' with '!=='",
'effort': 'low',
})
return findings
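
A few illustrative lines showing what the corrected pattern does and does not flag:

# Flagged (loose equality):
#   if (a == b) { ... }
#   if (value != null) { ... }
# Not flagged (strict equality):
#   if (a === b) { ... }
#   if (a !== b) { ... }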
def check_complexity(file_path: Path, codebase_path: Path, lines: List[str]) -> List[Dict]:
    """
    Check cyclomatic complexity (simplified).
    Counts decision points: if, while, for, case, catch, &&, ||, ?
    """
    findings = []
    # Find function declarations; the lookahead keeps control-flow keywords
    # (if/for/while/...) from being mistaken for function declarations
    func_pattern = re.compile(
        r'(function\s+\w+'
        r'|const\s+\w+\s*=\s*\([^)]*\)\s*=>'
        r'|\b(?!(?:if|for|while|switch|catch|return)\b)\w+\s*\([^)]*\)\s*{)'
    )

    def make_finding(snippet: str, line_num: int, complexity: int) -> Dict:
        severity = 'critical' if complexity > 20 else 'high' if complexity > 15 else 'medium'
        return {
            'severity': severity,
            'category': 'code_quality',
            'subcategory': 'complexity',
            'title': f'High cyclomatic complexity ({complexity})',
            'description': f'Function has complexity of {complexity}',
            'file': str(file_path.relative_to(codebase_path)),
            'line': line_num,
            'code_snippet': snippet,
            'impact': 'High complexity makes code difficult to understand, test, and maintain',
            'remediation': 'Refactor into smaller functions, extract complex conditions',
            'effort': 'medium' if complexity < 20 else 'high',
        }

    current_function = None
    current_function_line = 0
    complexity = 0
    for line_num, line in enumerate(lines, start=1):
        stripped = line.strip()
        # New function started: report the previous one if it was too complex
        if func_pattern.search(line):
            if current_function and complexity > 10:
                findings.append(make_finding(current_function, current_function_line, complexity))
            current_function = stripped
            current_function_line = line_num
            complexity = 1  # Base complexity
        # Count complexity contributors ('else if' is already covered by 'if ')
        if current_function:
            complexity += stripped.count('if ')
            complexity += stripped.count('while ')
            complexity += stripped.count('for ')
            complexity += stripped.count('case ')
            complexity += stripped.count('catch ')
            complexity += stripped.count('&&')
            complexity += stripped.count('||')
            complexity += stripped.count('?')
    # Report the last function in the file as well
    if current_function and complexity > 10:
        findings.append(make_finding(current_function, current_function_line, complexity))
    return findings
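
A worked example of the heuristic on a small function; the counts come from the substring matching above, not a real parser:

# function demo(a, b) {           -> new function, complexity = 1
#   if (a && b) { return 1; }     -> 'if ' +1, '&&' +1  (complexity = 3)
#   return a ? b : 0;             -> '?' +1             (complexity = 4)
# }
# Final complexity is 4, below the reporting threshold of 10.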
def check_function_length(file_path: Path, codebase_path: Path, lines: List[str]) -> List[Dict]:
    """Check for overly long functions (recommended: < 50 lines)."""
    findings = []
    func_pattern = re.compile(
        r'(function\s+\w+'
        r'|const\s+\w+\s*=\s*\([^)]*\)\s*=>'
        r'|\b(?!(?:if|for|while|switch|catch|return)\b)\w+\s*\([^)]*\)\s*{)'
    )

    def make_finding(snippet: str, line_num: int, length: int) -> Dict:
        return {
            'severity': 'high' if length > 100 else 'medium',
            'category': 'code_quality',
            'subcategory': 'function_length',
            'title': f'Long function ({length} lines)',
            'description': f'Function is {length} lines long (recommended: < 50)',
            'file': str(file_path.relative_to(codebase_path)),
            'line': line_num,
            'code_snippet': snippet,
            'impact': 'Long functions are harder to understand, test, and maintain',
            'remediation': 'Extract smaller functions for distinct responsibilities',
            'effort': 'medium',
        }

    current_function = None
    current_function_line = 0
    function_lines = 0
    brace_depth = 0
    for line_num, line in enumerate(lines, start=1):
        stripped = line.strip()
        if func_pattern.search(line):
            # Report the previous function if it never closed its braces
            if current_function and function_lines > 50:
                findings.append(make_finding(current_function, current_function_line, function_lines))
            current_function = stripped
            current_function_line = line_num
            function_lines = 0
            brace_depth = 0
        if current_function:
            function_lines += 1
            brace_depth += stripped.count('{') - stripped.count('}')
            if brace_depth == 0 and function_lines > 1:
                # Function ended: report it if it was too long
                if function_lines > 50:
                    findings.append(make_finding(current_function, current_function_line, function_lines))
                current_function = None
    # Report a function still open at end of file
    if current_function and function_lines > 50:
        findings.append(make_finding(current_function, current_function_line, function_lines))
    return findings
def analyze_python(codebase_path: Path) -> List[Dict]:
"""Analyze Python-specific quality issues."""
findings = []
# Python analysis to be implemented
# Would check: PEP 8 violations, complexity, type hints, etc.
return findings
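
Until that lands, a minimal sketch of one possible complexity check using only the standard library's ast module; the helper name and the set of decision nodes counted are illustrative choices:

import ast

def python_complexity(source: str) -> List[Dict]:
    """Rough per-function cyclomatic complexity via the ast module (illustrative)."""
    results = []
    for node in ast.walk(ast.parse(source)):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            branches = (ast.If, ast.For, ast.While, ast.ExceptHandler, ast.BoolOp)
            complexity = 1 + sum(isinstance(child, branches) for child in ast.walk(node))
            results.append({'name': node.name, 'line': node.lineno, 'complexity': complexity})
    return results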
def analyze_file_sizes(codebase_path: Path) -> List[Dict]:
"""Check for overly large files."""
findings = []
exclude_dirs = {'node_modules', '.git', 'dist', 'build', '__pycache__'}
code_extensions = {'.js', '.jsx', '.ts', '.tsx', '.py', '.java', '.go', '.rs'}
for file_path in codebase_path.rglob('*'):
if (file_path.is_file() and
file_path.suffix in code_extensions and
not any(excluded in file_path.parts for excluded in exclude_dirs)):
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    line_count = len(f.readlines())
            except OSError:
                continue
            if line_count > 500:
                severity = 'high' if line_count > 1000 else 'medium'
                findings.append({
                    'severity': severity,
                    'category': 'code_quality',
                    'subcategory': 'file_length',
                    'title': f'Large file ({line_count} lines)',
                    'description': f'File has {line_count} lines (recommended: < 500)',
                    'file': str(file_path.relative_to(codebase_path)),
                    'line': 1,
                    'code_snippet': None,
                    'impact': 'Large files are difficult to navigate and understand',
                    'remediation': 'Split into multiple smaller, focused modules',
                    'effort': 'high',
                })
return findings
def analyze_dead_code(codebase_path: Path, tech_stack: Dict) -> List[Dict]:
"""Detect potential dead code (commented-out code blocks)."""
findings = []
exclude_dirs = {'node_modules', '.git', 'dist', 'build'}
extensions = set()
if tech_stack.get('javascript') or tech_stack.get('typescript'):
extensions.update({'.js', '.jsx', '.ts', '.tsx'})
if tech_stack.get('python'):
extensions.add('.py')
    for file_path in codebase_path.rglob('*'):
        if (file_path.suffix in extensions and
                not any(excluded in file_path.parts for excluded in exclude_dirs)):
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    lines = f.readlines()
            except OSError:
                continue
            # Use the comment marker and code keywords appropriate for the language
            if file_path.suffix == '.py':
                comment_marker = '#'
                code_keywords = ['def ', 'class ', 'import ', 'return', 'for ', 'while ', '=']
            else:
                comment_marker = '//'
                code_keywords = ['function', 'const', 'let', 'var', 'if', 'for', 'while', '{', '}', ';']
            # Count consecutive commented lines with code-like content; the appended
            # empty sentinel line flushes a block that runs to the end of the file
            comment_block_size = 0
            block_start_line = 0
            for line_num, line in enumerate(lines + [''], start=1):
                stripped = line.strip()
                # Check if line is commented code
                if (stripped.startswith(comment_marker) and
                        any(keyword in stripped for keyword in code_keywords)):
                    if comment_block_size == 0:
                        block_start_line = line_num
                    comment_block_size += 1
                else:
                    # End of comment block
                    if comment_block_size >= 5:  # 5+ lines of commented code
                        findings.append({
                            'severity': 'low',
                            'category': 'code_quality',
                            'subcategory': 'dead_code',
                            'title': f'Commented-out code block ({comment_block_size} lines)',
                            'description': f'Found {comment_block_size} lines of commented code',
                            'file': str(file_path.relative_to(codebase_path)),
                            'line': block_start_line,
                            'code_snippet': None,
                            'impact': 'Commented code clutters codebase and reduces readability',
                            'remediation': "Remove commented code (it's in version control if needed)",
                            'effort': 'low',
                        })
                    comment_block_size = 0
return findings

View File

@@ -0,0 +1,31 @@
"""
Dependencies Analyzer
Analyzes:
- Outdated dependencies
- Vulnerable dependencies
- License compliance
- Dependency health
"""
from pathlib import Path
from typing import Dict, List
def analyze(codebase_path: Path, metadata: Dict) -> List[Dict]:
"""
Analyze dependencies for issues.
Args:
codebase_path: Path to codebase
metadata: Project metadata
Returns:
List of dependency-related findings
"""
findings = []
# Placeholder implementation
# In production, this would integrate with npm audit, pip-audit, etc.
return findings
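
As a sketch of that production direction: shelling out to "npm audit --json" and mapping its report to the common finding shape. The report schema varies across npm versions, so the key handling here is simplified and illustrative:

import json
import subprocess

def npm_audit_findings(codebase_path: Path) -> List[Dict]:
    """Illustrative: map `npm audit --json` output to finding dicts."""
    try:
        result = subprocess.run(
            ['npm', 'audit', '--json'], cwd=codebase_path,
            capture_output=True, text=True, timeout=120,
        )
        report = json.loads(result.stdout or '{}')
    except (OSError, subprocess.TimeoutExpired, json.JSONDecodeError):
        return []
    findings = []
    # npm 7+ keys entries under 'vulnerabilities'; adjust for older report formats
    for name, vuln in report.get('vulnerabilities', {}).items():
        findings.append({
            'severity': vuln.get('severity', 'medium'),
            'category': 'dependencies',
            'subcategory': 'vulnerability',
            'title': f'Vulnerable dependency: {name}',
            'file': 'package.json',
            'remediation': f'Run `npm audit fix` or update {name} manually',
            'effort': 'low',
        })
    return findings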

View File

@@ -0,0 +1,30 @@
"""
Performance Analyzer
Analyzes:
- Bundle sizes
- Build times
- Runtime performance indicators
"""
from pathlib import Path
from typing import Dict, List
def analyze(codebase_path: Path, metadata: Dict) -> List[Dict]:
"""
Analyze performance issues.
Args:
codebase_path: Path to codebase
metadata: Project metadata
Returns:
List of performance-related findings
"""
findings = []
# Placeholder implementation
# In production, this would analyze bundle sizes, check build configs, etc.
return findings
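
As one concrete starting point for the bundle-size analysis mentioned above; the output directories and the 500 KB threshold are assumptions:

def check_bundle_sizes(codebase_path: Path, max_kb: int = 500) -> List[Dict]:
    """Illustrative: flag oversized JavaScript artifacts in common build output dirs."""
    findings = []
    for out_dir in ('dist', 'build', '.next'):
        build_dir = codebase_path / out_dir
        if not build_dir.is_dir():
            continue
        for bundle in build_dir.rglob('*.js'):
            size_kb = bundle.stat().st_size / 1024
            if size_kb > max_kb:
                findings.append({
                    'severity': 'medium',
                    'category': 'performance',
                    'subcategory': 'bundle_size',
                    'title': f'Large bundle: {bundle.name} ({size_kb:.0f} KB)',
                    'file': str(bundle.relative_to(codebase_path)),
                    'remediation': 'Code-split, lazy-load, or trim dependencies',
                    'effort': 'medium',
                })
    return findings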

View File

@@ -0,0 +1,235 @@
"""
Security Scanner
Analyzes codebase for:
- Secrets in code (API keys, tokens, passwords)
- Dependency vulnerabilities
- Common security anti-patterns
- OWASP Top 10 issues
"""
import re
import json
from pathlib import Path
from typing import Dict, List
# Common patterns for secrets
SECRET_PATTERNS = {
'api_key': re.compile(r'(api[_-]?key|apikey)\s*[=:]\s*["\']([a-zA-Z0-9_-]{20,})["\']', re.IGNORECASE),
'aws_key': re.compile(r'AKIA[0-9A-Z]{16}'),
'generic_secret': re.compile(r'(secret|password|passwd|pwd)\s*[=:]\s*["\']([^"\'\s]{8,})["\']', re.IGNORECASE),
    'private_key': re.compile(r'-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----'),
'jwt': re.compile(r'eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+'),
'github_token': re.compile(r'gh[pousr]_[A-Za-z0-9_]{36}'),
'slack_token': re.compile(r'xox[baprs]-[0-9]{10,12}-[0-9]{10,12}-[a-zA-Z0-9]{24,32}'),
}
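
Illustrative, deliberately fake strings the patterns above are meant to catch:

# api_key:      api_key = "a1b2c3d4e5f6g7h8i9j0"          (20+ chars in quotes)
# aws_key:      AKIAIOSFODNN7EXAMPLE                       (AWS's documented example key ID)
# private_key:  -----BEGIN RSA PRIVATE KEY-----
# jwt:          eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxIn0.abc   (three base64url segments)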
def analyze(codebase_path: Path, metadata: Dict) -> List[Dict]:
"""
Analyze codebase for security issues.
Args:
codebase_path: Path to codebase
metadata: Project metadata from discovery phase
Returns:
List of security findings
"""
findings = []
# Scan for secrets
findings.extend(scan_for_secrets(codebase_path))
    # Scan dependencies for vulnerabilities (npm is used by both JS and TS projects)
    tech_stack = metadata.get('tech_stack', {})
    if tech_stack.get('javascript') or tech_stack.get('typescript'):
        findings.extend(scan_npm_dependencies(codebase_path))
# Check for common security anti-patterns
findings.extend(scan_security_antipatterns(codebase_path, metadata))
return findings
def scan_for_secrets(codebase_path: Path) -> List[Dict]:
"""Scan for hardcoded secrets in code."""
findings = []
exclude_dirs = {'node_modules', '.git', 'dist', 'build', '__pycache__', '.venv', 'venv'}
exclude_files = {'.env.example', 'package-lock.json', 'yarn.lock'}
# File extensions to scan
code_extensions = {'.js', '.jsx', '.ts', '.tsx', '.py', '.java', '.go', '.rb', '.php', '.yml', '.yaml', '.json', '.env'}
for file_path in codebase_path.rglob('*'):
if (file_path.is_file() and
file_path.suffix in code_extensions and
file_path.name not in exclude_files and
not any(excluded in file_path.parts for excluded in exclude_dirs)):
try:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
lines = content.split('\n')
for pattern_name, pattern in SECRET_PATTERNS.items():
matches = pattern.finditer(content)
for match in matches:
# Find line number
line_num = content[:match.start()].count('\n') + 1
# Skip if it's clearly a placeholder or example
matched_text = match.group(0)
if is_placeholder(matched_text):
continue
findings.append({
'severity': 'critical',
'category': 'security',
'subcategory': 'secrets',
'title': f'Potential {pattern_name.replace("_", " ")} found in code',
'description': f'Found potential secret on line {line_num}',
'file': str(file_path.relative_to(codebase_path)),
'line': line_num,
'code_snippet': lines[line_num - 1].strip() if line_num <= len(lines) else '',
'impact': 'Exposed secrets can lead to unauthorized access and data breaches',
'remediation': 'Remove secret from code and use environment variables or secret management tools',
'effort': 'low',
})
            except OSError:
                pass
return findings
def is_placeholder(text: str) -> bool:
"""Check if a potential secret is actually a placeholder."""
placeholders = [
'your_api_key', 'your_secret', 'example', 'placeholder', 'test',
'dummy', 'sample', 'xxx', '000', 'abc123', 'changeme', 'replace_me',
'my_api_key', 'your_key_here', 'insert_key_here'
]
text_lower = text.lower()
return any(placeholder in text_lower for placeholder in placeholders)
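
A quick sanity check of the heuristic:

assert is_placeholder('api_key = "your_api_key_here"') is True
assert is_placeholder('api_key = "a1b2c3d4e5f6g7h8i9j0"') is False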
def scan_npm_dependencies(codebase_path: Path) -> List[Dict]:
"""Scan npm dependencies for known vulnerabilities."""
findings = []
package_json = codebase_path / 'package.json'
if not package_json.exists():
return findings
try:
with open(package_json, 'r') as f:
pkg = json.load(f)
deps = {**pkg.get('dependencies', {}), **pkg.get('devDependencies', {})}
# Check for commonly vulnerable packages (simplified - in production use npm audit)
vulnerable_packages = {
'lodash': ('< 4.17.21', 'Prototype pollution vulnerability'),
'axios': ('< 0.21.1', 'SSRF vulnerability'),
'node-fetch': ('< 2.6.7', 'Information exposure vulnerability'),
}
for pkg_name, (vulnerable_version, description) in vulnerable_packages.items():
if pkg_name in deps:
findings.append({
'severity': 'high',
'category': 'security',
'subcategory': 'dependencies',
'title': f'Potentially vulnerable dependency: {pkg_name}',
'description': f'{description} (version: {deps[pkg_name]})',
'file': 'package.json',
'line': None,
'code_snippet': f'"{pkg_name}": "{deps[pkg_name]}"',
'impact': 'Vulnerable dependencies can be exploited by attackers',
                'remediation': f'Update {pkg_name} to version {vulnerable_version.replace("< ", "")} or later',
'effort': 'low',
})
    except (OSError, json.JSONDecodeError):
        pass
return findings
def scan_security_antipatterns(codebase_path: Path, metadata: Dict) -> List[Dict]:
"""Scan for common security anti-patterns."""
findings = []
    tech_stack = metadata.get('tech_stack', {})
    if tech_stack.get('javascript') or tech_stack.get('typescript'):
        findings.extend(scan_js_security_issues(codebase_path))
return findings
def scan_js_security_issues(codebase_path: Path) -> List[Dict]:
"""Scan JavaScript/TypeScript for security anti-patterns."""
findings = []
extensions = {'.js', '.jsx', '.ts', '.tsx'}
exclude_dirs = {'node_modules', '.git', 'dist', 'build'}
# Dangerous patterns
patterns = {
'eval': (
re.compile(r'\beval\s*\('),
'Use of eval() is dangerous',
'eval() can execute arbitrary code and is a security risk',
            'Refactor to avoid eval(); parse data with JSON.parse or dispatch through a lookup table instead'
),
'dangerouslySetInnerHTML': (
re.compile(r'dangerouslySetInnerHTML'),
'Use of dangerouslySetInnerHTML without sanitization',
'Can lead to XSS attacks if not properly sanitized',
'Sanitize HTML content or use safer alternatives'
),
'innerHTML': (
re.compile(r'\.innerHTML\s*='),
'Direct assignment to innerHTML',
'Can lead to XSS attacks if content is not sanitized',
'Use textContent for text or sanitize HTML before assigning'
),
'document.write': (
re.compile(r'document\.write\s*\('),
'Use of document.write()',
'Can be exploited for XSS and causes page reflow',
'Use DOM manipulation methods instead'
),
}
for file_path in codebase_path.rglob('*'):
if (file_path.suffix in extensions and
not any(excluded in file_path.parts for excluded in exclude_dirs)):
try:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
lines = content.split('\n')
for pattern_name, (pattern, title, impact, remediation) in patterns.items():
for line_num, line in enumerate(lines, start=1):
if pattern.search(line):
findings.append({
'severity': 'high',
'category': 'security',
'subcategory': 'code_security',
'title': title,
'description': f'Found on line {line_num}',
'file': str(file_path.relative_to(codebase_path)),
'line': line_num,
'code_snippet': line.strip(),
'impact': impact,
'remediation': remediation,
'effort': 'medium',
})
            except OSError:
                pass
return findings

View File

@@ -0,0 +1,76 @@
"""
Technical Debt Calculator
Calculates:
- SQALE rating (A-E)
- Remediation effort estimates
- Debt categorization
"""
from pathlib import Path
from typing import Dict, List
def analyze(codebase_path: Path, metadata: Dict) -> List[Dict]:
"""
Calculate technical debt metrics.
Args:
codebase_path: Path to codebase
metadata: Project metadata
Returns:
List of technical debt findings
"""
findings = []
# Placeholder implementation
# In production, this would calculate SQALE rating based on all findings
return findings
def calculate_sqale_rating(all_findings: List[Dict], total_loc: int) -> str:
"""
Calculate SQALE rating (A-E) based on findings.
Args:
all_findings: All findings from all analyzers
total_loc: Total lines of code
Returns:
SQALE rating (A, B, C, D, or E)
"""
# Estimate remediation time in hours
severity_hours = {
'critical': 8,
'high': 4,
'medium': 2,
'low': 0.5
}
total_remediation_hours = sum(
severity_hours.get(finding.get('severity', 'low'), 0.5)
for finding in all_findings
)
# Estimate development time (1 hour per 50 LOC is conservative)
development_hours = total_loc / 50
# Calculate debt ratio
if development_hours == 0:
debt_ratio = 0
else:
debt_ratio = (total_remediation_hours / development_hours) * 100
# Assign SQALE rating
if debt_ratio <= 5:
return 'A'
elif debt_ratio <= 10:
return 'B'
elif debt_ratio <= 20:
return 'C'
elif debt_ratio <= 50:
return 'D'
else:
return 'E'
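
A worked example of the rating math above (numbers chosen for illustration):

findings = (
    [{'severity': 'critical'}] * 1
    + [{'severity': 'high'}] * 5
    + [{'severity': 'medium'}] * 10
    + [{'severity': 'low'}] * 20
)
# Remediation: 1*8 + 5*4 + 10*2 + 20*0.5 = 58 hours
# Development: 10000 LOC / 50 = 200 hours; debt ratio = 58 / 200 = 29%
assert calculate_sqale_rating(findings, total_loc=10000) == 'D'  # 20 < 29 <= 50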

View File

@@ -0,0 +1,184 @@
"""
Test Coverage Analyzer
Analyzes:
- Test coverage percentage
- Testing Trophy distribution
- Test quality
- Untested critical paths
"""
import json
from pathlib import Path
from typing import Dict, List
def analyze(codebase_path: Path, metadata: Dict) -> List[Dict]:
"""
Analyze test coverage and quality.
Args:
codebase_path: Path to codebase
metadata: Project metadata
Returns:
List of testing-related findings
"""
findings = []
# Check for test files existence
test_stats = analyze_test_presence(codebase_path, metadata)
if test_stats:
findings.extend(test_stats)
# Analyze coverage if coverage reports exist
coverage_findings = analyze_coverage_reports(codebase_path, metadata)
if coverage_findings:
findings.extend(coverage_findings)
return findings
def analyze_test_presence(codebase_path: Path, metadata: Dict) -> List[Dict]:
"""Check for test file presence and basic test hygiene."""
findings = []
# Count test files
test_extensions = {'.test.js', '.test.ts', '.test.jsx', '.test.tsx', '.spec.js', '.spec.ts'}
test_dirs = {'__tests__', 'tests', 'test', 'spec'}
test_file_count = 0
source_file_count = 0
exclude_dirs = {'node_modules', '.git', 'dist', 'build', '__pycache__'}
source_extensions = {'.js', '.jsx', '.ts', '.tsx', '.py'}
for file_path in codebase_path.rglob('*'):
if file_path.is_file() and not any(excluded in file_path.parts for excluded in exclude_dirs):
# Check if it's a test file
is_test = (
any(file_path.name.endswith(ext) for ext in test_extensions) or
any(test_dir in file_path.parts for test_dir in test_dirs)
)
if is_test:
test_file_count += 1
elif file_path.suffix in source_extensions:
source_file_count += 1
# Calculate test ratio
if source_file_count > 0:
test_ratio = (test_file_count / source_file_count) * 100
if test_ratio < 20:
findings.append({
'severity': 'high',
'category': 'testing',
'subcategory': 'test_coverage',
'title': f'Low test file ratio ({test_ratio:.1f}%)',
'description': f'Only {test_file_count} test files for {source_file_count} source files',
'file': None,
'line': None,
'code_snippet': None,
'impact': 'Insufficient testing leads to bugs and difficult refactoring',
'remediation': 'Add tests for untested modules, aim for at least 80% coverage',
'effort': 'high',
})
elif test_ratio < 50:
findings.append({
'severity': 'medium',
'category': 'testing',
'subcategory': 'test_coverage',
'title': f'Moderate test file ratio ({test_ratio:.1f}%)',
'description': f'{test_file_count} test files for {source_file_count} source files',
'file': None,
'line': None,
'code_snippet': None,
'impact': 'More tests needed to achieve recommended 80% coverage',
'remediation': 'Continue adding tests, focus on critical paths first',
'effort': 'medium',
})
return findings
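
For example, 12 test files against 80 source files gives a ratio of 15%, which lands in the 'high' severity bucket above:

test_ratio = (12 / 80) * 100  # 15.0 -> below 20, so the 'high' severity finding fires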
def analyze_coverage_reports(codebase_path: Path, metadata: Dict) -> List[Dict]:
"""Analyze coverage reports if they exist."""
findings = []
# Look for coverage reports (Istanbul/c8 format)
coverage_files = [
codebase_path / 'coverage' / 'coverage-summary.json',
codebase_path / 'coverage' / 'coverage-final.json',
codebase_path / '.nyc_output' / 'coverage-summary.json',
]
    coverage_found = False
    for coverage_file in coverage_files:
        if coverage_file.exists():
            try:
                with open(coverage_file, 'r') as f:
                    coverage_data = json.load(f)
                coverage_found = True
# Extract total coverage
total = coverage_data.get('total', {})
line_coverage = total.get('lines', {}).get('pct', 0)
branch_coverage = total.get('branches', {}).get('pct', 0)
function_coverage = total.get('functions', {}).get('pct', 0)
statement_coverage = total.get('statements', {}).get('pct', 0)
# Check against 80% threshold
if line_coverage < 80:
severity = 'high' if line_coverage < 50 else 'medium'
findings.append({
'severity': severity,
'category': 'testing',
'subcategory': 'test_coverage',
'title': f'Line coverage below target ({line_coverage:.1f}%)',
'description': f'Current coverage is {line_coverage:.1f}%, target is 80%',
'file': 'coverage/coverage-summary.json',
'line': None,
'code_snippet': None,
'impact': 'Low coverage means untested code paths and higher bug risk',
'remediation': f'Add tests to increase coverage by {80 - line_coverage:.1f}%',
'effort': 'high',
})
if branch_coverage < 75:
findings.append({
'severity': 'medium',
'category': 'testing',
'subcategory': 'test_coverage',
'title': f'Branch coverage below target ({branch_coverage:.1f}%)',
'description': f'Current branch coverage is {branch_coverage:.1f}%, target is 75%',
'file': 'coverage/coverage-summary.json',
'line': None,
'code_snippet': None,
'impact': 'Untested branches can hide bugs in conditional logic',
'remediation': 'Add tests for edge cases and conditional branches',
'effort': 'medium',
})
                break  # Found coverage, don't check other files
            except (OSError, json.JSONDecodeError):
                pass
    # Report when no coverage data could be found or parsed
    if not coverage_found:
findings.append({
'severity': 'medium',
'category': 'testing',
'subcategory': 'test_infrastructure',
'title': 'No coverage report found',
'description': 'Could not find coverage-summary.json',
'file': None,
'line': None,
'code_snippet': None,
'impact': 'Cannot measure test effectiveness without coverage reports',
'remediation': 'Configure test runner to generate coverage reports (Jest: --coverage, Vitest: --coverage)',
'effort': 'low',
})
return findings
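
For reference, the minimal shape of an Istanbul-style coverage summary that the parser above reads (values illustrative):

coverage_data = {
    'total': {
        'lines': {'pct': 70.0},
        'statements': {'pct': 70.0},
        'functions': {'pct': 75.0},
        'branches': {'pct': 60.0},
    }
}
# With these numbers the analyzer emits a 'medium' line-coverage finding (70 < 80)
# and a 'medium' branch-coverage finding (60 < 75).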