Initial commit

Zhongwei Li committed 2025-11-29 18:17:27 +08:00
commit 9b5e9af436
6 changed files with 583 additions and 0 deletions

skills/web/scripts/fetch.py (executable file, 245 lines)

@@ -0,0 +1,245 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "httpx",
#     "beautifulsoup4",
#     "markdownify",
#     "click",
# ]
# ///
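# The block above is PEP 723 inline script metadata: `uv run` reads it and
# resolves the listed dependencies in an ephemeral environment before running
# the script (requires-python covers the `int | None` hints used below).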
"""
Web Fetch Script
Fetch and parse web page content, converting HTML to clean markdown.
Supports timeout configuration and size limits.
Usage:
uv run fetch.py --url "https://example.com"
uv run fetch.py --url "https://example.com" --json
uv run fetch.py --url "https://example.com" --timeout 60
"""
import json
import os
import sys
from urllib.parse import urlparse
import click
import httpx
from bs4 import BeautifulSoup # type: ignore
from markdownify import markdownify as md  # type: ignore

# Configuration defaults
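# Both defaults can be overridden via the WEB_FETCH_TIMEOUT / WEB_FETCH_MAX_SIZE
# environment variables or the corresponding CLI flags (see main()).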
DEFAULT_TIMEOUT = 30 # seconds
DEFAULT_MAX_SIZE = 1048576  # 1 MiB


def is_valid_url(url: str) -> bool:
"""Validate URL format.
Args:
url: URL to validate
Returns:
True if URL is valid (http/https), False otherwise
"""
try:
parsed = urlparse(url)
return parsed.scheme in ("http", "https") and bool(parsed.netloc)
except Exception:
        return False


def html_to_markdown(html: str) -> str:
"""Convert HTML content to clean markdown.
Removes scripts, styles, and other non-content elements before conversion.
Args:
html: HTML content to convert
Returns:
Markdown representation of the content
"""
# Parse HTML
soup = BeautifulSoup(html, "html.parser")
    # Remove script/style elements and common page chrome (nav, footer, header)
for element in soup(["script", "style", "nav", "footer", "header"]):
element.decompose()
# Convert to markdown
markdown = md(str(soup), heading_style="ATX", strip=["img"])
# Clean up excessive whitespace
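    # (note: every non-empty line becomes its own paragraph below, so
    # blank-line structure from the source document is flattened)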
lines = [line.strip() for line in markdown.split("\n")]
clean_lines = [line for line in lines if line] # Remove empty lines
return "\n\n".join(clean_lines)
def fetch_web_page(url: str, timeout: int, max_size: int) -> dict:
"""Fetch and parse web page content.
Args:
url: URL to fetch content from
timeout: Request timeout in seconds
max_size: Maximum response size in bytes
Returns:
Dict with success, result/error, and message
"""
# Validate URL
if not is_valid_url(url):
return {
"success": False,
"error": "invalid_url",
"message": f"URL must start with http:// or https://. Got: {url}",
}
try:
# Fetch content with timeout
with httpx.Client(timeout=timeout, follow_redirects=True) as client:
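            # follow_redirects=True: 301/302 chains are chased to the final page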
response = client.get(url)
response.raise_for_status()
# Check content size
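            # (response.content has already buffered the full body at this
            # point, so max_size caps what gets processed, not what is downloaded)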
content_length = len(response.content)
if content_length > max_size:
return {
"success": False,
"error": "content_too_large",
"message": f"Response size ({content_length} bytes) exceeds maximum ({max_size} bytes)",
}
# Parse HTML to markdown
markdown_content = html_to_markdown(response.text)
return {
"success": True,
"result": markdown_content,
"message": f"Successfully fetched {len(markdown_content)} characters from {url}",
}
except httpx.TimeoutException:
return {
"success": False,
"error": "timeout",
"message": f"Request timed out after {timeout} seconds",
}
except httpx.HTTPStatusError as e:
return {
"success": False,
"error": "http_error",
"message": f"HTTP {e.response.status_code}: {e.response.reason_phrase}",
}
except httpx.RequestError as e:
return {
"success": False,
"error": "request_error",
"message": f"Request failed: {str(e)}",
}
except Exception as e:
return {
"success": False,
"error": "unknown_error",
"message": f"Unexpected error: {str(e)}",
        }


def format_markdown_output(url: str, markdown: str) -> str:
"""Format markdown for human-readable output.
Args:
url: Source URL
markdown: Markdown content
Returns:
Formatted string for display
"""
lines = []
lines.append("\n" + "=" * 60)
lines.append(f"📄 Content from {url}")
lines.append("=" * 60)
lines.append("")
lines.append(markdown)
lines.append("")
lines.append("=" * 60)
return "\n".join(lines)
@click.command()
@click.option("--url", required=True, help="URL to fetch content from")
@click.option(
"--timeout",
default=None,
type=int,
help=f"Request timeout in seconds (default: {DEFAULT_TIMEOUT})",
)
@click.option(
"--max-size",
default=None,
type=int,
help=f"Maximum response size in bytes (default: {DEFAULT_MAX_SIZE})",
)
@click.option(
"--json",
"output_json",
is_flag=True,
help="Output as JSON instead of human-readable format",
)
def main(url: str, timeout: int | None, max_size: int | None, output_json: bool) -> None:
"""
Fetch and parse web page content.
Retrieves HTML content from the specified URL and converts it to clean
markdown format. Handles timeouts, HTTP errors, and invalid URLs gracefully.
Environment variables:
WEB_FETCH_TIMEOUT - Default timeout in seconds
WEB_FETCH_MAX_SIZE - Default max response size in bytes
Examples:
uv run fetch.py --url "https://example.com"
uv run fetch.py --url "https://example.com" --json
uv run fetch.py --url "https://example.com" --timeout 60
"""
try:
# Get timeout from env or use default
if timeout is None:
timeout = int(os.getenv("WEB_FETCH_TIMEOUT", str(DEFAULT_TIMEOUT)))
# Get max_size from env or use default
if max_size is None:
max_size = int(os.getenv("WEB_FETCH_MAX_SIZE", str(DEFAULT_MAX_SIZE)))
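        # Precedence: explicit CLI flag > environment variable > built-in default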
# Fetch and parse
result = fetch_web_page(url, timeout, max_size)
# Output results
if output_json:
click.echo(json.dumps(result, indent=2))
else:
if result["success"]:
formatted = format_markdown_output(url, result["result"])
click.echo(formatted)
else:
click.echo(f"❌ Error ({result['error']}): {result['message']}", err=True)
sys.exit(0 if result["success"] else 1)
except Exception as e:
if output_json:
error_data = {"success": False, "error": "script_error", "message": str(e)}
click.echo(json.dumps(error_data, indent=2))
else:
click.echo(f"❌ Script Error: {e}", err=True)
        sys.exit(1)


if __name__ == "__main__":
main()
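A note on the output contract: with --json, the script prints the dict returned by fetch_web_page verbatim. The transcript below is an illustrative sketch (the result text and character count depend on the page fetched), not captured output:

uv run fetch.py --url "https://example.com" --json
{
  "success": true,
  "result": "# Example Domain\n\n...",
  "message": "Successfully fetched 123 characters from https://example.com"
}

On failure, "result" is replaced by "error" and "message", and the process exits with status 1 (status 0 on success).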