gh-danielscholl-agent-skill…/skills/web/scripts/fetch.py

#!/usr/bin/env python3
# /// script
# dependencies = [
#     "httpx",
#     "beautifulsoup4",
#     "markdownify",
#     "click",
# ]
# ///

"""
Web Fetch Script

Fetch and parse web page content, converting HTML to clean markdown.
Supports timeout configuration and size limits.

Usage:
    uv run fetch.py --url "https://example.com"
    uv run fetch.py --url "https://example.com" --json
    uv run fetch.py --url "https://example.com" --timeout 60
"""

import json
import os
import sys
from urllib.parse import urlparse

import click
import httpx
from bs4 import BeautifulSoup  # type: ignore
from markdownify import markdownify as md  # type: ignore

# Fallback settings, applied when neither a command-line option nor the
# matching WEB_FETCH_* environment variable supplies a value.
DEFAULT_TIMEOUT = 30  # request timeout, in seconds
DEFAULT_MAX_SIZE = 1024 * 1024  # largest accepted response body, in bytes (1 MiB)


def is_valid_url(url: str) -> bool:
    """Check that a URL is an absolute http(s) address.

    Args:
        url: URL to validate

    Returns:
        True when the URL parses with an ``http`` or ``https`` scheme and a
        non-empty network location; False otherwise, including when the
        URL cannot be parsed at all.
    """
    try:
        parts = urlparse(url)
        # Reject anything that is not plain web traffic (ftp:, file:, ...).
        if parts.scheme not in ("http", "https"):
            return False
        # A scheme alone ("https://") is not enough; a host is required.
        if parts.netloc:
            return True
        return False
    except Exception:
        # urlparse raises on some malformed input (e.g. a broken IPv6 host).
        return False


def html_to_markdown(html: str) -> str:
    """Turn an HTML document into compact markdown.

    ``script``, ``style``, ``nav``, ``footer`` and ``header`` elements are
    removed before conversion. The converted text is then normalised line
    by line: each line is stripped of surrounding whitespace, blank lines
    are discarded, and the remaining lines are joined with one blank line
    between them.

    Args:
        html: HTML content to convert

    Returns:
        Markdown representation of the content
    """
    soup = BeautifulSoup(html, "html.parser")

    # Drop elements that carry code, styling or page chrome rather than content.
    for tag in soup.find_all(["script", "style", "nav", "footer", "header"]):
        tag.decompose()

    # ATX headings are the "# Title" form; <img> tags are not converted.
    converted = md(str(soup), heading_style="ATX", strip=["img"])

    # Trim every line, then keep only the non-empty ones.
    trimmed = (raw_line.strip() for raw_line in converted.split("\n"))
    return "\n\n".join(text for text in trimmed if text)


def fetch_web_page(url: str, timeout: int, max_size: int) -> dict:
    """Fetch a web page and convert its HTML to markdown.

    The response body is streamed, and the download is abandoned as soon as
    more than ``max_size`` bytes have been received, so an oversized
    response is never buffered in full.

    Args:
        url: URL to fetch content from (must be http or https)
        timeout: Request timeout in seconds
        max_size: Maximum response size in bytes

    Returns:
        Dict with ``success`` and ``message`` keys, plus ``result`` (the
        markdown text) on success or ``error`` (a short error code) on
        failure. Request and parsing errors are reported through the dict
        rather than raised.
    """
    # Validate URL
    if not is_valid_url(url):
        return {
            "success": False,
            "error": "invalid_url",
            "message": f"URL must start with http:// or https://. Got: {url}",
        }

    try:
        with httpx.Client(timeout=timeout, follow_redirects=True) as client:
            # Stream instead of client.get(): get() would read the whole body
            # into memory before the size limit could be checked.
            with client.stream("GET", url) as response:
                response.raise_for_status()

                chunks = []
                received = 0
                # iter_bytes() yields the body after content decoding
                # (gzip etc.), so the limit applies to the decoded size.
                for chunk in response.iter_bytes():
                    received += len(chunk)
                    if received > max_size:
                        # Stop reading; leaving the context closes the connection.
                        return {
                            "success": False,
                            "error": "content_too_large",
                            "message": (
                                f"Response size (at least {received} bytes) "
                                f"exceeds maximum ({max_size} bytes)"
                            ),
                        }
                    chunks.append(chunk)

                # response.text is unavailable for a manually consumed stream,
                # so capture the charset here and decode the bytes ourselves.
                encoding = response.encoding or "utf-8"

        # Undecodable bytes are replaced rather than raising.
        html = b"".join(chunks).decode(encoding, errors="replace")

        # Parse HTML to markdown
        markdown_content = html_to_markdown(html)

        return {
            "success": True,
            "result": markdown_content,
            "message": f"Successfully fetched {len(markdown_content)} characters from {url}",
        }

    except httpx.TimeoutException:
        return {
            "success": False,
            "error": "timeout",
            "message": f"Request timed out after {timeout} seconds",
        }

    except httpx.HTTPStatusError as e:
        # Raised by raise_for_status() for 4xx/5xx responses.
        return {
            "success": False,
            "error": "http_error",
            "message": f"HTTP {e.response.status_code}: {e.response.reason_phrase}",
        }

    except httpx.RequestError as e:
        return {
            "success": False,
            "error": "request_error",
            "message": f"Request failed: {e}",
        }

    except Exception as e:
        # Last-resort boundary so callers always receive a result dict.
        return {
            "success": False,
            "error": "unknown_error",
            "message": f"Unexpected error: {e}",
        }


def format_markdown_output(url: str, markdown: str) -> str:
    """Wrap markdown in a banner for terminal display.

    The output starts with a blank line, then a 60-character ``=`` rule, a
    title line naming the source URL, and another rule. The markdown
    follows, set off by blank lines, and a final rule closes the block.

    Args:
        url: Source URL
        markdown: Markdown content

    Returns:
        Formatted string for display
    """
    rule = "=" * 60
    sections = (
        "\n" + rule,
        f"📄 Content from {url}",
        rule,
        "",
        markdown,
        "",
        rule,
    )
    return "\n".join(sections)


def _int_from_env(name: str, default: int) -> int:
    """Read an integer setting from the environment.

    Args:
        name: Environment variable to read
        default: Value returned when the variable is unset or blank

    Returns:
        The parsed integer, or ``default``

    Raises:
        ValueError: If the variable holds a non-integer value. The message
            names the variable so the misconfiguration is easy to locate.
    """
    raw = os.getenv(name, "").strip()
    if not raw:
        # Treat an empty assignment (e.g. "WEB_FETCH_TIMEOUT=") as unset.
        return default
    try:
        return int(raw)
    except ValueError:
        raise ValueError(
            f"Environment variable {name} must be an integer, got {raw!r}"
        ) from None


@click.command()
@click.option("--url", required=True, help="URL to fetch content from")
@click.option(
    "--timeout",
    default=None,
    type=int,
    help=f"Request timeout in seconds (default: {DEFAULT_TIMEOUT})",
)
@click.option(
    "--max-size",
    default=None,
    type=int,
    help=f"Maximum response size in bytes (default: {DEFAULT_MAX_SIZE})",
)
@click.option(
    "--json",
    "output_json",
    is_flag=True,
    help="Output as JSON instead of human-readable format",
)
def main(url: str, timeout: int | None, max_size: int | None, output_json: bool) -> None:
    """
    Fetch and parse web page content.

    Retrieves HTML content from the specified URL and converts it to clean
    markdown format. Handles timeouts, HTTP errors, and invalid URLs gracefully.

    Environment variables:
        WEB_FETCH_TIMEOUT - Default timeout in seconds
        WEB_FETCH_MAX_SIZE - Default max response size in bytes

    Examples:
        uv run fetch.py --url "https://example.com"
        uv run fetch.py --url "https://example.com" --json
        uv run fetch.py --url "https://example.com" --timeout 60
    """
    # NOTE: the docstring above is rendered by click as the --help text.
    #
    # Precedence for each setting: command-line option, then environment
    # variable, then the module default. Exit status is 0 on success, 1 on
    # any failure.
    try:
        if timeout is None:
            timeout = _int_from_env("WEB_FETCH_TIMEOUT", DEFAULT_TIMEOUT)

        if max_size is None:
            max_size = _int_from_env("WEB_FETCH_MAX_SIZE", DEFAULT_MAX_SIZE)

        # Fetch and parse
        result = fetch_web_page(url, timeout, max_size)

        # Output results
        if output_json:
            # JSON mode prints the result dict to stdout, success or failure.
            click.echo(json.dumps(result, indent=2))
        elif result["success"]:
            click.echo(format_markdown_output(url, result["result"]))
        else:
            # Human-readable errors go to stderr.
            click.echo(f"❌ Error ({result['error']}): {result['message']}", err=True)

        # SystemExit is not an Exception subclass, so the handler below
        # does not intercept it.
        sys.exit(0 if result["success"] else 1)

    except Exception as e:
        # Top-level boundary: report failures of the script itself (e.g. a
        # bad environment value) in the same format as fetch errors.
        if output_json:
            error_data = {"success": False, "error": "script_error", "message": str(e)}
            click.echo(json.dumps(error_data, indent=2))
        else:
            click.echo(f"❌ Script Error: {e}", err=True)
        sys.exit(1)


# Run the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()