""" Security Scanner Analyzes codebase for: - Secrets in code (API keys, tokens, passwords) - Dependency vulnerabilities - Common security anti-patterns - OWASP Top 10 issues """ import re import json from pathlib import Path from typing import Dict, List # Common patterns for secrets SECRET_PATTERNS = { 'api_key': re.compile(r'(api[_-]?key|apikey)\s*[=:]\s*["\']([a-zA-Z0-9_-]{20,})["\']', re.IGNORECASE), 'aws_key': re.compile(r'AKIA[0-9A-Z]{16}'), 'generic_secret': re.compile(r'(secret|password|passwd|pwd)\s*[=:]\s*["\']([^"\'\s]{8,})["\']', re.IGNORECASE), 'private_key': re.compile(r'-----BEGIN (RSA |)PRIVATE KEY-----'), 'jwt': re.compile(r'eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+'), 'github_token': re.compile(r'gh[pousr]_[A-Za-z0-9_]{36}'), 'slack_token': re.compile(r'xox[baprs]-[0-9]{10,12}-[0-9]{10,12}-[a-zA-Z0-9]{24,32}'), } def analyze(codebase_path: Path, metadata: Dict) -> List[Dict]: """ Analyze codebase for security issues. Args: codebase_path: Path to codebase metadata: Project metadata from discovery phase Returns: List of security findings """ findings = [] # Scan for secrets findings.extend(scan_for_secrets(codebase_path)) # Scan dependencies for vulnerabilities if metadata.get('tech_stack', {}).get('javascript'): findings.extend(scan_npm_dependencies(codebase_path)) # Check for common security anti-patterns findings.extend(scan_security_antipatterns(codebase_path, metadata)) return findings def scan_for_secrets(codebase_path: Path) -> List[Dict]: """Scan for hardcoded secrets in code.""" findings = [] exclude_dirs = {'node_modules', '.git', 'dist', 'build', '__pycache__', '.venv', 'venv'} exclude_files = {'.env.example', 'package-lock.json', 'yarn.lock'} # File extensions to scan code_extensions = {'.js', '.jsx', '.ts', '.tsx', '.py', '.java', '.go', '.rb', '.php', '.yml', '.yaml', '.json', '.env'} for file_path in codebase_path.rglob('*'): if (file_path.is_file() and file_path.suffix in code_extensions and file_path.name not in exclude_files and not any(excluded in file_path.parts for excluded in exclude_dirs)): try: with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: content = f.read() lines = content.split('\n') for pattern_name, pattern in SECRET_PATTERNS.items(): matches = pattern.finditer(content) for match in matches: # Find line number line_num = content[:match.start()].count('\n') + 1 # Skip if it's clearly a placeholder or example matched_text = match.group(0) if is_placeholder(matched_text): continue findings.append({ 'severity': 'critical', 'category': 'security', 'subcategory': 'secrets', 'title': f'Potential {pattern_name.replace("_", " ")} found in code', 'description': f'Found potential secret on line {line_num}', 'file': str(file_path.relative_to(codebase_path)), 'line': line_num, 'code_snippet': lines[line_num - 1].strip() if line_num <= len(lines) else '', 'impact': 'Exposed secrets can lead to unauthorized access and data breaches', 'remediation': 'Remove secret from code and use environment variables or secret management tools', 'effort': 'low', }) except: pass return findings def is_placeholder(text: str) -> bool: """Check if a potential secret is actually a placeholder.""" placeholders = [ 'your_api_key', 'your_secret', 'example', 'placeholder', 'test', 'dummy', 'sample', 'xxx', '000', 'abc123', 'changeme', 'replace_me', 'my_api_key', 'your_key_here', 'insert_key_here' ] text_lower = text.lower() return any(placeholder in text_lower for placeholder in placeholders) def scan_npm_dependencies(codebase_path: Path) -> List[Dict]: """Scan npm dependencies for known vulnerabilities.""" findings = [] package_json = codebase_path / 'package.json' if not package_json.exists(): return findings try: with open(package_json, 'r') as f: pkg = json.load(f) deps = {**pkg.get('dependencies', {}), **pkg.get('devDependencies', {})} # Check for commonly vulnerable packages (simplified - in production use npm audit) vulnerable_packages = { 'lodash': ('< 4.17.21', 'Prototype pollution vulnerability'), 'axios': ('< 0.21.1', 'SSRF vulnerability'), 'node-fetch': ('< 2.6.7', 'Information exposure vulnerability'), } for pkg_name, (vulnerable_version, description) in vulnerable_packages.items(): if pkg_name in deps: findings.append({ 'severity': 'high', 'category': 'security', 'subcategory': 'dependencies', 'title': f'Potentially vulnerable dependency: {pkg_name}', 'description': f'{description} (version: {deps[pkg_name]})', 'file': 'package.json', 'line': None, 'code_snippet': f'"{pkg_name}": "{deps[pkg_name]}"', 'impact': 'Vulnerable dependencies can be exploited by attackers', 'remediation': f'Update {pkg_name} to version {vulnerable_version.replace("< ", ">= ")} or later', 'effort': 'low', }) except: pass return findings def scan_security_antipatterns(codebase_path: Path, metadata: Dict) -> List[Dict]: """Scan for common security anti-patterns.""" findings = [] if metadata.get('tech_stack', {}).get('javascript') or metadata.get('tech_stack', {}).get('typescript'): findings.extend(scan_js_security_issues(codebase_path)) return findings def scan_js_security_issues(codebase_path: Path) -> List[Dict]: """Scan JavaScript/TypeScript for security anti-patterns.""" findings = [] extensions = {'.js', '.jsx', '.ts', '.tsx'} exclude_dirs = {'node_modules', '.git', 'dist', 'build'} # Dangerous patterns patterns = { 'eval': ( re.compile(r'\beval\s*\('), 'Use of eval() is dangerous', 'eval() can execute arbitrary code and is a security risk', 'Refactor to avoid eval(), use safer alternatives like Function constructor with specific scope' ), 'dangerouslySetInnerHTML': ( re.compile(r'dangerouslySetInnerHTML'), 'Use of dangerouslySetInnerHTML without sanitization', 'Can lead to XSS attacks if not properly sanitized', 'Sanitize HTML content or use safer alternatives' ), 'innerHTML': ( re.compile(r'\.innerHTML\s*='), 'Direct assignment to innerHTML', 'Can lead to XSS attacks if content is not sanitized', 'Use textContent for text or sanitize HTML before assigning' ), 'document.write': ( re.compile(r'document\.write\s*\('), 'Use of document.write()', 'Can be exploited for XSS and causes page reflow', 'Use DOM manipulation methods instead' ), } for file_path in codebase_path.rglob('*'): if (file_path.suffix in extensions and not any(excluded in file_path.parts for excluded in exclude_dirs)): try: with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: content = f.read() lines = content.split('\n') for pattern_name, (pattern, title, impact, remediation) in patterns.items(): for line_num, line in enumerate(lines, start=1): if pattern.search(line): findings.append({ 'severity': 'high', 'category': 'security', 'subcategory': 'code_security', 'title': title, 'description': f'Found on line {line_num}', 'file': str(file_path.relative_to(codebase_path)), 'line': line_num, 'code_snippet': line.strip(), 'impact': impact, 'remediation': remediation, 'effort': 'medium', }) except: pass return findings