#!/usr/bin/env python3
"""
URL Validator - Check URL safety and detect malicious patterns
"""

import sys
import os
import re
import json
from pathlib import Path
from urllib.parse import urlparse
from typing import List, Dict, Tuple, Set

# ============================================================================
# Configuration
# ============================================================================
|
|
|
|
class Config:
    """Configuration for URL validation: host/TLD deny- and allow-lists."""

    # Free TLDs that are frequently registered for phishing/malware hosting.
    SUSPICIOUS_TLDS = {'.tk', '.ml', '.ga', '.cf', '.gq'}

    # Link shorteners hide the real destination, so it cannot be audited.
    URL_SHORTENERS = {'bit.ly', 'tinyurl.com', 'goo.gl', 't.co', 'ow.ly'}

    # Well-known package registries / code hosts treated as trustworthy.
    TRUSTED_REGISTRIES = {
        'registry.npmjs.org',
        'pypi.org',
        'registry.hub.docker.com',
        'github.com',
        'gitlab.com'
    }
|
|
|
|
# ============================================================================
# URL Pattern Definitions
# ============================================================================
|
|
|
|
# Comprehensive URL pattern: a scheme (http/https/ftp/file) or a bare
# "www." / "ftp." prefix, optional user[:password]@ credentials, then either
# a dotted IPv4 address or an (internationalized) hostname, an optional
# port, and an optional path/query/fragment.
URL_PATTERN = re.compile(
    r'(?:(?:https?|ftp|file)://|www\.|ftp\.)'                     # scheme or bare www./ftp. prefix
    r'(?:\S+(?::\S*)?@)?'                                         # optional user[:password]@
    r'(?:'
    r'(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])'                         # IPv4: first octet (1-223)
    r'(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}'                    # IPv4: two middle octets (0-255)
    r'(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))'                  # IPv4: last octet (1-254)
    r'|'
    r'(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)'     # hostname: first label
    r'(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*'  # hostname: further dot-separated labels
    r'(?:\.(?:[a-z\u00a1-\uffff]{2,}))'                           # TLD (2+ letters)
    r')'
    r'(?::\d{2,5})?'                                              # optional port
    r'(?:[/?#]\S*)?',                                             # optional path/query/fragment
    re.IGNORECASE
)
|
|
|
|
# Dangerous code execution patterns: shell one-liners that download and
# immediately execute remote code, or eval/exec over fetched content.
DANGEROUS_PATTERNS = {
    # curl/wget output piped straight into a shell
    'curl_pipe_sh': re.compile(r'curl\s+[^|]+\|\s*(sh|bash)', re.IGNORECASE),
    'wget_pipe_sh': re.compile(r'wget\s+[^|]+\|\s*(sh|bash)', re.IGNORECASE),
    # curl with a flag bundle containing -s (silent) piped into a shell
    'curl_silent_pipe': re.compile(r'curl\s+-[a-zA-Z]*s[a-zA-Z]*\s+[^|]+\|\s*(sh|bash)', re.IGNORECASE),
    # process substitution: bash <(curl ...)
    'bash_redirect': re.compile(r'bash\s+<\s*\(\s*curl', re.IGNORECASE),
    # eval/exec over remotely fetched content
    'eval_fetch': re.compile(r'eval.*fetch\s*\(', re.IGNORECASE),
    'eval_curl': re.compile(r'eval.*curl', re.IGNORECASE),
    'exec_wget': re.compile(r'exec\s*\(.*wget', re.IGNORECASE),
    # destructive rm -rf commands that mention a URL
    'rm_rf_url': re.compile(r'rm\s+-rf.*https?://', re.IGNORECASE),
}
|
|
|
|
# Obfuscation patterns: URLs hidden behind base64/hex/unicode escapes —
# a common trick to evade plain-text URL scanners.
OBFUSCATION_PATTERNS = {
    'base64_url': re.compile(r'(?:atob|base64|Buffer\.from)\s*\([^)]*https?:', re.IGNORECASE),
    'hex_encoded': re.compile(r'\\x[0-9a-f]{2}.*https?:', re.IGNORECASE),
    'unicode_escape': re.compile(r'\\u[0-9a-f]{4}.*https?:', re.IGNORECASE),
}
|
|
|
|
# ============================================================================
# Severity Classification
# ============================================================================
|
|
|
|
class Severity:
    """String constants ranking finding severity (most to least severe)."""

    CRITICAL = 'critical'
    HIGH = 'high'
    MEDIUM = 'medium'
    LOW = 'low'
|
|
|
|
# ============================================================================
# Finding Class
# ============================================================================
|
|
|
|
class Finding:
    """A single URL security finding, serializable via ``to_dict()``."""

    def __init__(self, file_path: str, line_num: int, url: str, issue: str,
                 severity: str, risk: str, remediation: str):
        # Attribute names deliberately match the JSON keys emitted by to_dict().
        self.file = file_path
        self.line = line_num
        self.url = url
        self.issue = issue
        self.severity = severity
        self.risk = risk
        self.remediation = remediation

    def to_dict(self) -> Dict:
        """Return the finding as a plain dict (one key per attribute)."""
        # The instance dict holds exactly the seven fields set in __init__,
        # in insertion order, so a shallow copy is the serialized form.
        return dict(vars(self))
|
|
|
|
# ============================================================================
# URL Validator
# ============================================================================
|
|
|
|
class URLValidator:
    """Scan a file or directory tree for URLs and flag unsafe ones.

    Detects non-HTTPS links, insecure protocols (ftp/telnet/file), raw IP
    hosts, suspicious TLDs, URL shorteners, and dangerous remote-code
    execution / obfuscation patterns.  Findings accumulate in
    ``self.findings``; call ``scan()`` then ``report()`` for an exit code.
    """

    def __init__(self, path: str, https_only: bool = False,
                 allow_localhost: bool = True, check_code_patterns: bool = True):
        self.path = Path(path)
        self.https_only = https_only                    # flag every http:// URL regardless of context
        self.allow_localhost = allow_localhost          # permit plain http for loopback hosts
        self.check_code_patterns = check_code_patterns  # also scan for RCE/obfuscation patterns
        self.findings: List[Finding] = []
        self.urls_checked = 0
        self.files_scanned = 0
        # One-entry cache for get_context(): previously the whole file was
        # re-opened and re-read once per URL found in it (quadratic work).
        self._ctx_path = None   # str(path) of the cached file
        self._ctx_lines = None  # its lines, or None if unreadable

    def is_text_file(self, file_path: Path) -> bool:
        """Heuristic text check: a NUL byte in the first 512 bytes means binary.

        Unreadable files are reported as non-text (False).
        """
        try:
            with open(file_path, 'rb') as f:
                chunk = f.read(512)
            return b'\0' not in chunk
        except Exception:
            return False

    def should_exclude(self, file_path: Path) -> bool:
        """Skip VCS metadata, dependency directories, and build artifacts."""
        exclude_patterns = {'.git', 'node_modules', 'vendor', 'dist', 'build', '__pycache__'}
        return any(part in exclude_patterns for part in file_path.parts)

    def _read_lines(self, file_path: Path):
        """Return the file's lines via a one-entry cache, or None if unreadable."""
        key = str(file_path)
        if self._ctx_path != key:
            self._ctx_path = key
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    self._ctx_lines = f.readlines()
            except Exception:
                self._ctx_lines = None
        return self._ctx_lines

    def get_context(self, file_path: Path, line_num: int) -> str:
        """Classify where a URL appears.

        Returns 'documentation' when the line is a comment, 'test' or
        'example' based on the file path, 'production' otherwise, and
        'unknown' when the file cannot be read.
        """
        lines = self._read_lines(file_path)
        if lines is None:
            return 'unknown'
        if 0 <= line_num - 1 < len(lines):
            line = lines[line_num - 1].strip()
            # '#' (shell/Python), '//' (C-style), '*' (block-comment body)
            if line.startswith(('#', '//', '*')):
                return 'documentation'
        path_str = str(file_path).lower()
        if 'test' in path_str or 'spec' in path_str:
            return 'test'
        if 'example' in path_str or 'mock' in path_str:
            return 'example'
        return 'production'

    def check_url_safety(self, url: str, file_path: Path, line_num: int) -> None:
        """Validate a single URL and append findings for any issues.

        Checks, in order: plain-http usage, ftp/telnet, file://, raw IPv4
        hosts, suspicious TLDs, and URL shorteners.
        """
        try:
            parsed = urlparse(url)
        except Exception:
            # Regex-matched text that urlparse rejects; nothing to check.
            return

        context = self.get_context(file_path, line_num)

        # --- protocol checks ------------------------------------------------
        if parsed.scheme == 'http':
            # Plain http to loopback hosts is acceptable during development.
            if self.allow_localhost and parsed.hostname in ('localhost', '127.0.0.1', '0.0.0.0'):
                return

            # Enforce HTTPS when configured, or whenever the URL looks like
            # production code (not docs/tests/examples).
            if self.https_only or context == 'production':
                severity = Severity.HIGH if context == 'production' else Severity.MEDIUM
                self.findings.append(Finding(
                    str(file_path), line_num, url,
                    'Non-HTTPS URL',
                    severity,
                    'Man-in-the-middle attacks, data interception',
                    # count=1: rewrite only the scheme, never an embedded
                    # 'http://' later in the URL (e.g. inside a query string).
                    'Change to HTTPS: ' + url.replace('http://', 'https://', 1)
                ))
                return

        if parsed.scheme in ('ftp', 'telnet'):
            self.findings.append(Finding(
                str(file_path), line_num, url,
                'Insecure protocol',
                Severity.HIGH,
                'Unencrypted data transmission',
                'Use secure alternatives (HTTPS, SFTP, SSH)'
            ))
            return

        if parsed.scheme == 'file':
            self.findings.append(Finding(
                str(file_path), line_num, url,
                'File protocol detected',
                Severity.MEDIUM,
                'Potential security risk, path disclosure',
                'Review necessity of file:// protocol'
            ))
            # Deliberately falls through to the host checks below.

        # --- host checks ----------------------------------------------------
        if parsed.hostname and re.match(r'^\d+\.\d+\.\d+\.\d+$', parsed.hostname):
            self.findings.append(Finding(
                str(file_path), line_num, url,
                'IP address instead of domain',
                Severity.LOW,
                'Harder to verify legitimacy, no certificate validation',
                'Use domain name instead of IP address'
            ))

        if parsed.hostname:
            for tld in Config.SUSPICIOUS_TLDS:
                if parsed.hostname.endswith(tld):
                    self.findings.append(Finding(
                        str(file_path), line_num, url,
                        'Suspicious TLD',
                        Severity.MEDIUM,
                        'Often used for malicious purposes',
                        'Verify domain legitimacy before use'
                    ))
                    break  # one TLD finding per URL is enough

        if parsed.hostname in Config.URL_SHORTENERS:
            self.findings.append(Finding(
                str(file_path), line_num, url,
                'Shortened URL',
                Severity.LOW,
                'Cannot verify destination',
                'Expand URL and use full destination'
            ))

    def check_dangerous_patterns(self, content: str, file_path: Path) -> None:
        """Scan raw file content for RCE and URL-obfuscation patterns."""
        if not self.check_code_patterns:
            return

        for pattern_name, pattern in DANGEROUS_PATTERNS.items():
            for match in pattern.finditer(content):
                # 1-based line number of the match start; str.count with
                # bounds avoids slicing a copy of the content per match.
                line_num = content.count('\n', 0, match.start()) + 1
                self.findings.append(Finding(
                    str(file_path), line_num, match.group(0),
                    'Remote code execution pattern',
                    Severity.CRITICAL,
                    f'Executes arbitrary code from remote source ({pattern_name})',
                    'Download, verify checksum, review code, then execute'
                ))

        for pattern_name, pattern in OBFUSCATION_PATTERNS.items():
            for match in pattern.finditer(content):
                line_num = content.count('\n', 0, match.start()) + 1
                self.findings.append(Finding(
                    str(file_path), line_num, match.group(0)[:50] + '...',
                    'Obfuscated URL',
                    Severity.HIGH,
                    f'URL obfuscation detected ({pattern_name})',
                    'Review obfuscated content for malicious intent'
                ))

    def scan_file(self, file_path: Path) -> None:
        """Scan one file: dangerous patterns first, then per-line URLs."""
        if self.should_exclude(file_path) or not self.is_text_file(file_path):
            return

        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

            self.files_scanned += 1

            # Whole-content pass catches multi-token shell commands even when
            # the per-line URL regex would miss them.
            self.check_dangerous_patterns(content, file_path)

            for line_num, line in enumerate(content.split('\n'), 1):
                for match in URL_PATTERN.finditer(line):
                    self.urls_checked += 1
                    self.check_url_safety(match.group(0), file_path, line_num)

        except Exception as e:
            # Best-effort: warn and continue with the remaining files.
            print(f"Warning: Could not scan {file_path}: {e}", file=sys.stderr)

    def scan(self) -> None:
        """Scan ``self.path`` — a single file, or a directory recursively."""
        if self.path.is_file():
            self.scan_file(self.path)
        elif self.path.is_dir():
            for file_path in self.path.rglob('*'):
                if file_path.is_file():
                    self.scan_file(file_path)

    def report(self) -> int:
        """Print a human-readable report; return 0 if clean, 1 otherwise."""
        print("URL Safety Scan Results")
        print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
        print(f"Path: {self.path}")
        print(f"Files Scanned: {self.files_scanned}")
        print(f"URLs Checked: {self.urls_checked}")
        print()

        if not self.findings:
            print("✅ SUCCESS: All URLs safe")
            print("No unsafe URLs or malicious patterns detected")
            return 0

        # Group findings by severity for the report sections.
        critical = [f for f in self.findings if f.severity == Severity.CRITICAL]
        high = [f for f in self.findings if f.severity == Severity.HIGH]
        medium = [f for f in self.findings if f.severity == Severity.MEDIUM]
        low = [f for f in self.findings if f.severity == Severity.LOW]

        print(f"⚠️ UNSAFE URLS DETECTED: {len(self.findings)}")
        print()

        if critical:
            print(f"CRITICAL Issues ({len(critical)}):")
            for finding in critical:
                print(f"  ❌ {finding.file}:{finding.line}")
                print(f"     Pattern: {finding.url}")
                print(f"     Risk: {finding.risk}")
                print(f"     Remediation: {finding.remediation}")
                print()

        if high:
            print(f"HIGH Issues ({len(high)}):")
            for finding in high:
                print(f"  ⚠️ {finding.file}:{finding.line}")
                print(f"     URL: {finding.url}")
                print(f"     Issue: {finding.issue}")
                print(f"     Remediation: {finding.remediation}")
                print()

        if medium:
            print(f"MEDIUM Issues ({len(medium)}):")
            for finding in medium:
                print(f"  💡 {finding.file}:{finding.line}")
                print(f"     Issue: {finding.issue}")
                print()

        # Low findings are counted in the summary but not itemized.
        print("Summary:")
        print(f"  Critical: {len(critical)}")
        print(f"  High: {len(high)}")
        print(f"  Medium: {len(medium)}")
        print(f"  Low: {len(low)}")
        print()
        print("Action Required: YES" if (critical or high) else "Review Recommended")

        return 1
|
|
|
|
# ============================================================================
# Main
# ============================================================================
|
|
|
|
def main():
    """CLI entry point.

    Usage: url-validator.py <path> [https_only] [allow_localhost] [check_code_patterns]

    The optional arguments are positional booleans spelled 'true'/'false'
    (case-insensitive).  Exits 0 when all URLs are safe, 1 when findings
    exist, 2 on usage or path errors.
    """
    argv = sys.argv
    if len(argv) < 2:
        print("Usage: url-validator.py <path> [https_only] [allow_localhost] [check_code_patterns]")
        sys.exit(2)

    target = argv[1]

    def positional_flag(index: int, default: bool) -> bool:
        # A supplied argument counts as True only when it equals 'true'.
        return argv[index].lower() == 'true' if len(argv) > index else default

    https_only = positional_flag(2, False)
    allow_localhost = positional_flag(3, True)
    check_code_patterns = positional_flag(4, True)

    if not os.path.exists(target):
        print(f"ERROR: Path does not exist: {target}", file=sys.stderr)
        sys.exit(2)

    validator = URLValidator(target, https_only, allow_localhost, check_code_patterns)
    validator.scan()
    sys.exit(validator.report())


if __name__ == '__main__':
    main()
|