#!/usr/bin/env python3
"""
URL Validator - Check URL safety and detect malicious patterns
"""

import sys
import os
import re
import json
from pathlib import Path
from urllib.parse import urlparse
from typing import List, Dict, Tuple, Set

# ============================================================================
# Configuration
# ============================================================================

class Config:
    """Static lookup tables consulted while validating URLs."""

    # Hostname suffixes reported as "Suspicious TLD"; the leading dot is part
    # of each entry because they are compared with str.endswith().
    SUSPICIOUS_TLDS = {
        '.tk',
        '.ml',
        '.ga',
        '.cf',
        '.gq',
    }

    # Hostnames reported as "Shortened URL" (compared for exact equality).
    URL_SHORTENERS = {
        'bit.ly',
        'tinyurl.com',
        'goo.gl',
        't.co',
        'ow.ly',
    }

    # Package/source registries regarded as trusted.
    # NOTE(review): not referenced by any check visible in this file.
    TRUSTED_REGISTRIES = {
        'registry.npmjs.org',
        'pypi.org',
        'registry.hub.docker.com',
        'github.com',
        'gitlab.com',
    }

# ============================================================================
# URL Pattern Definitions
# ============================================================================

# Comprehensive URL pattern
# Matches URL-like text inside a line (used with finditer, so it is not
# anchored). A host is mandatory: either a dotted-quad IPv4 address or a name
# ending in a dot plus a label of two or more letters. Consequently
# single-label hosts such as "localhost" and host-less forms such as
# "file:///path" are not matched.
URL_PATTERN = re.compile(
    # Prefix: explicit http/https/ftp/file scheme, or a bare "www." / "ftp."
    r'(?:(?:https?|ftp|file)://|www\.|ftp\.)'
    # Optional "user@" or "user:password@" credentials
    r'(?:\S+(?::\S*)?@)?'
    r'(?:'
    # IPv4 alternative, first octet: 1-223
    r'(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])'
    # IPv4 second and third octets: 0-255
    r'(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}'
    # IPv4 last octet: 1-254
    r'(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))'
    r'|'
    # Host name alternative, first label: letters, digits and non-ASCII
    # characters (U+00A1-U+FFFF), each optionally followed by one hyphen,
    # ending on a non-hyphen character
    r'(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)'
    # Zero or more further dot-separated labels of the same shape
    r'(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*'
    # Mandatory final label: a dot plus at least two letters (no digits)
    r'(?:\.(?:[a-z\u00a1-\uffff]{2,}))'
    r')'
    # Optional port of 2 to 5 digits
    r'(?::\d{2,5})?'
    # Optional path, query or fragment, running up to the next whitespace
    r'(?:[/?#]\S*)?',
    re.IGNORECASE
)

# Dangerous code execution patterns
#
# Building blocks for the "download | shell" patterns:
#   _PIPE_SOURCE - the command text between the downloader and the pipe. It
#       may not contain a pipe or a bare newline, so a `curl` on one line is
#       never paired with an unrelated `| sh` further down the file. A
#       backslash-newline continuation is allowed so multi-line commands are
#       still caught.
#   _PIPE_SHELL - the pipe into sh/bash, optionally through sudo. The trailing
#       \b keeps `| sha256sum`, `| shasum`, `| shows` etc. from matching.
#       The shell name remains capture group 1.
_PIPE_SOURCE = r'(?:[^|\n]|\\\n)+'
_PIPE_SHELL = r'\|\s*(?:sudo\s+)?(sh|bash)\b'

DANGEROUS_PATTERNS = {
    'curl_pipe_sh': re.compile(
        r'curl[ \t]+' + _PIPE_SOURCE + _PIPE_SHELL, re.IGNORECASE),
    'wget_pipe_sh': re.compile(
        r'wget[ \t]+' + _PIPE_SOURCE + _PIPE_SHELL, re.IGNORECASE),
    # curl with a short-option cluster containing "s" (e.g. -s, -sL, -fsSL)
    'curl_silent_pipe': re.compile(
        r'curl[ \t]+-[a-zA-Z]*s[a-zA-Z]*[ \t]+' + _PIPE_SOURCE + _PIPE_SHELL,
        re.IGNORECASE),
    # bash <(curl ...) process substitution
    'bash_redirect': re.compile(r'bash\s+<\s*\(\s*curl', re.IGNORECASE),
    'eval_fetch': re.compile(r'eval.*fetch\s*\(', re.IGNORECASE),
    'eval_curl': re.compile(r'eval.*curl', re.IGNORECASE),
    'exec_wget': re.compile(r'exec\s*\(.*wget', re.IGNORECASE),
    'rm_rf_url': re.compile(r'rm\s+-rf.*https?://', re.IGNORECASE),
}

# Obfuscation patterns
# Each entry pairs a short name with a case-insensitive regex. All three look
# for an encoding hint (a decode call, a \xNN escape, a \uNNNN escape)
# followed by "http:" or "https:".
OBFUSCATION_PATTERNS = {
    label: re.compile(source, re.IGNORECASE)
    for label, source in (
        ('base64_url', r'(?:atob|base64|Buffer\.from)\s*\([^)]*https?:'),
        ('hex_encoded', r'\\x[0-9a-f]{2}.*https?:'),
        ('unicode_escape', r'\\u[0-9a-f]{4}.*https?:'),
    )
}

# ============================================================================
# Severity Classification
# ============================================================================

class Severity:
    """String labels stored in Finding.severity, listed most severe first."""

    CRITICAL, HIGH, MEDIUM, LOW = 'critical', 'high', 'medium', 'low'

# ============================================================================
# Finding Class
# ============================================================================

class Finding:
    """One reported problem: where it was found, what it is, how to fix it."""

    # Attribute names in the order to_dict() emits them.
    _FIELDS = ('file', 'line', 'url', 'issue', 'severity', 'risk', 'remediation')

    def __init__(self, file_path: str, line_num: int, url: str, issue: str,
                 severity: str, risk: str, remediation: str):
        """Store the finding's details.

        Note that ``file_path`` and ``line_num`` are stored under the shorter
        attribute names ``file`` and ``line``.
        """
        # Location
        self.file, self.line = file_path, line_num
        # What was matched and why it was reported
        self.url, self.issue = url, issue
        # Assessment and suggested fix
        self.severity, self.risk, self.remediation = severity, risk, remediation

    def to_dict(self) -> Dict:
        """Return the finding as a plain dict keyed by attribute name."""
        return {field: getattr(self, field) for field in self._FIELDS}

# ============================================================================
# URL Validator
# ============================================================================

class URLValidator:
    """Main URL validation class"""

    def __init__(self, path: str, https_only: bool = False,
                 allow_localhost: bool = True, check_code_patterns: bool = True):
        self.path = Path(path)
        self.https_only = https_only
        self.allow_localhost = allow_localhost
        self.check_code_patterns = check_code_patterns
        self.findings: List[Finding] = []
        self.urls_checked = 0
        self.files_scanned = 0

    def is_text_file(self, file_path: Path) -> bool:
        """Check if file is text"""
        try:
            with open(file_path, 'rb') as f:
                chunk = f.read(512)
                if b'\0' in chunk:
                    return False
                return True
        except Exception:
            return False

    def should_exclude(self, file_path: Path) -> bool:
        """Check if file should be excluded"""
        exclude_patterns = {'.git', 'node_modules', 'vendor', 'dist', 'build', '__pycache__'}
        return any(part in exclude_patterns for part in file_path.parts)

    def get_context(self, file_path: Path, line_num: int) -> str:
        """Get context around a line"""
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                lines = f.readlines()
                if 0 <= line_num - 1 < len(lines):
                    # Check if in comment or documentation
                    line = lines[line_num - 1].strip()
                    if line.startswith('#') or line.startswith('//') or line.startswith('*'):
                        return 'documentation'
                    if 'test' in str(file_path).lower() or 'spec' in str(file_path).lower():
                        return 'test'
                    if 'example' in str(file_path).lower() or 'mock' in str(file_path).lower():
                        return 'example'
                    return 'production'
        except Exception:
            pass
        return 'unknown'

    def check_url_safety(self, url: str, file_path: Path, line_num: int) -> None:
        """Check if URL is safe"""
        try:
            parsed = urlparse(url)
        except Exception:
            return

        context = self.get_context(file_path, line_num)

        # Check protocol
        if parsed.scheme == 'http':
            # Allow localhost in development
            if self.allow_localhost and parsed.hostname in ('localhost', '127.0.0.1', '0.0.0.0'):
                return

            # Enforce HTTPS
            if self.https_only or context == 'production':
                severity = Severity.HIGH if context == 'production' else Severity.MEDIUM
                self.findings.append(Finding(
                    str(file_path), line_num, url,
                    'Non-HTTPS URL',
                    severity,
                    'Man-in-the-middle attacks, data interception',
                    'Change to HTTPS: ' + url.replace('http://', 'https://')
                ))
                return

        # Check for FTP/Telnet
        if parsed.scheme in ('ftp', 'telnet'):
            self.findings.append(Finding(
                str(file_path), line_num, url,
                'Insecure protocol',
                Severity.HIGH,
                'Unencrypted data transmission',
                'Use secure alternatives (HTTPS, SFTP, SSH)'
            ))
            return

        # Check for file:// protocol
        if parsed.scheme == 'file':
            self.findings.append(Finding(
                str(file_path), line_num, url,
                'File protocol detected',
                Severity.MEDIUM,
                'Potential security risk, path disclosure',
                'Review necessity of file:// protocol'
            ))

        # Check for IP addresses
        if parsed.hostname and re.match(r'^\d+\.\d+\.\d+\.\d+$', parsed.hostname):
            self.findings.append(Finding(
                str(file_path), line_num, url,
                'IP address instead of domain',
                Severity.LOW,
                'Harder to verify legitimacy, no certificate validation',
                'Use domain name instead of IP address'
            ))

        # Check for suspicious TLDs
        if parsed.hostname:
            for tld in Config.SUSPICIOUS_TLDS:
                if parsed.hostname.endswith(tld):
                    self.findings.append(Finding(
                        str(file_path), line_num, url,
                        'Suspicious TLD',
                        Severity.MEDIUM,
                        'Often used for malicious purposes',
                        'Verify domain legitimacy before use'
                    ))
                    break

            # Check for URL shorteners
            if parsed.hostname in Config.URL_SHORTENERS:
                self.findings.append(Finding(
                    str(file_path), line_num, url,
                    'Shortened URL',
                    Severity.LOW,
                    'Cannot verify destination',
                    'Expand URL and use full destination'
                ))

    def check_dangerous_patterns(self, content: str, file_path: Path) -> None:
        """Check for dangerous code execution patterns"""
        if not self.check_code_patterns:
            return

        lines = content.split('\n')

        for pattern_name, pattern in DANGEROUS_PATTERNS.items():
            for match in pattern.finditer(content):
                line_num = content[:match.start()].count('\n') + 1
                self.findings.append(Finding(
                    str(file_path), line_num, match.group(0),
                    'Remote code execution pattern',
                    Severity.CRITICAL,
                    f'Executes arbitrary code from remote source ({pattern_name})',
                    'Download, verify checksum, review code, then execute'
                ))

        for pattern_name, pattern in OBFUSCATION_PATTERNS.items():
            for match in pattern.finditer(content):
                line_num = content[:match.start()].count('\n') + 1
                self.findings.append(Finding(
                    str(file_path), line_num, match.group(0)[:50] + '...',
                    'Obfuscated URL',
                    Severity.HIGH,
                    f'URL obfuscation detected ({pattern_name})',
                    'Review obfuscated content for malicious intent'
                ))

    def scan_file(self, file_path: Path) -> None:
        """Scan a single file"""
        if self.should_exclude(file_path) or not self.is_text_file(file_path):
            return

        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

            self.files_scanned += 1

            # Check for dangerous patterns first
            self.check_dangerous_patterns(content, file_path)

            # Find all URLs
            lines = content.split('\n')
            for line_num, line in enumerate(lines, 1):
                for match in URL_PATTERN.finditer(line):
                    url = match.group(0)
                    self.urls_checked += 1
                    self.check_url_safety(url, file_path, line_num)

        except Exception as e:
            print(f"Warning: Could not scan {file_path}: {e}", file=sys.stderr)

    def scan(self) -> None:
        """Scan path for URLs"""
        if self.path.is_file():
            self.scan_file(self.path)
        elif self.path.is_dir():
            for file_path in self.path.rglob('*'):
                if file_path.is_file():
                    self.scan_file(file_path)

    def report(self) -> int:
        """Generate report and return exit code"""
        print("URL Safety Scan Results")
        print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
        print(f"Path: {self.path}")
        print(f"Files Scanned: {self.files_scanned}")
        print(f"URLs Checked: {self.urls_checked}")
        print()

        if not self.findings:
            print("✅ SUCCESS: All URLs safe")
            print("No unsafe URLs or malicious patterns detected")
            return 0

        # Group by severity
        critical = [f for f in self.findings if f.severity == Severity.CRITICAL]
        high = [f for f in self.findings if f.severity == Severity.HIGH]
        medium = [f for f in self.findings if f.severity == Severity.MEDIUM]
        low = [f for f in self.findings if f.severity == Severity.LOW]

        print(f"⚠️  UNSAFE URLS DETECTED: {len(self.findings)}")
        print()

        if critical:
            print(f"CRITICAL Issues ({len(critical)}):")
            for finding in critical:
                print(f"  ❌ {finding.file}:{finding.line}")
                print(f"     Pattern: {finding.url}")
                print(f"     Risk: {finding.risk}")
                print(f"     Remediation: {finding.remediation}")
                print()

        if high:
            print(f"HIGH Issues ({len(high)}):")
            for finding in high:
                print(f"  ⚠️  {finding.file}:{finding.line}")
                print(f"     URL: {finding.url}")
                print(f"     Issue: {finding.issue}")
                print(f"     Remediation: {finding.remediation}")
                print()

        if medium:
            print(f"MEDIUM Issues ({len(medium)}):")
            for finding in medium:
                print(f"  💡 {finding.file}:{finding.line}")
                print(f"     Issue: {finding.issue}")
                print()

        print("Summary:")
        print(f"  Critical: {len(critical)}")
        print(f"  High: {len(high)}")
        print(f"  Medium: {len(medium)}")
        print(f"  Low: {len(low)}")
        print()
        print("Action Required: YES" if (critical or high) else "Review Recommended")

        return 1

# ============================================================================
# Main
# ============================================================================

def main():
    """Command-line entry point.

    Arguments: ``<path> [https_only] [allow_localhost] [check_code_patterns]``.
    Each optional flag is true only when given as the word "true" (any case);
    when omitted the defaults are false, true and true respectively.

    Exits with status 2 on a usage error or a missing path, otherwise with
    the status returned by ``URLValidator.report()``.
    """
    args = sys.argv[1:]
    if not args:
        print("Usage: url-validator.py <path> [https_only] [allow_localhost] [check_code_patterns]")
        sys.exit(2)

    def flag(position: int, default: bool) -> bool:
        """Read the optional boolean at ``position`` in ``args``."""
        if position < len(args):
            return args[position].lower() == 'true'
        return default

    path = args[0]
    https_only = flag(1, False)
    allow_localhost = flag(2, True)
    check_code_patterns = flag(3, True)

    if not os.path.exists(path):
        print(f"ERROR: Path does not exist: {path}", file=sys.stderr)
        sys.exit(2)

    validator = URLValidator(path, https_only, allow_localhost, check_code_patterns)
    validator.scan()
    sys.exit(validator.report())


if __name__ == '__main__':
    main()