246 lines
6.7 KiB
Python
Executable File
246 lines
6.7 KiB
Python
Executable File
#!/usr/bin/env python3
# /// script
# dependencies = [
#   "httpx",
#   "beautifulsoup4",
#   "markdownify",
#   "click",
# ]
# ///
|
|
|
|
"""
Web Fetch Script

Fetch and parse web page content, converting HTML to clean markdown.
Supports timeout configuration and size limits.

Usage:
    uv run fetch.py --url "https://example.com"
    uv run fetch.py --url "https://example.com" --json
    uv run fetch.py --url "https://example.com" --timeout 60
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
from urllib.parse import urlparse
|
|
|
|
import click
|
|
import httpx
|
|
from bs4 import BeautifulSoup # type: ignore
|
|
from markdownify import markdownify as md # type: ignore
|
|
|
|
# Fallback settings, applied when neither a CLI flag nor an environment
# variable supplies a value.
DEFAULT_TIMEOUT = 30  # request timeout, in seconds
DEFAULT_MAX_SIZE = 1024 * 1024  # response size cap, in bytes (1 MiB)
|
|
|
|
|
|
def is_valid_url(url: str) -> bool:
    """Check that a URL is an absolute http(s) URL that names a host.

    Args:
        url: URL to validate

    Returns:
        True if the URL uses the http or https scheme and contains a host,
        False otherwise (including non-string or unparseable input)
    """
    # Anything that is not text cannot be a fetchable URL; answer False
    # instead of letting urlparse fail on it.
    if not isinstance(url, str):
        return False

    try:
        parsed = urlparse(url)
        host = parsed.hostname
    except ValueError:
        # urlparse rejects some malformed authorities, e.g. an unclosed
        # IPv6 bracket such as "http://[::1".
        return False

    # Test the hostname rather than the raw netloc: a netloc such as ":80"
    # or "user@" is non-empty yet contains no host to connect to.
    return parsed.scheme in ("http", "https") and bool(host)
|
|
|
|
|
|
def html_to_markdown(html: str) -> str:
    """Turn an HTML document into compact markdown.

    Script, style, nav, footer and header elements are dropped before the
    conversion, images are stripped, and the resulting non-blank lines are
    separated by exactly one blank line each.

    Args:
        html: HTML content to convert

    Returns:
        Markdown representation of the content
    """
    soup = BeautifulSoup(html, "html.parser")

    # Drop non-content elements (code, styling and page chrome) entirely.
    for tag in soup.find_all(["script", "style", "nav", "footer", "header"]):
        tag.decompose()

    converted = md(str(soup), heading_style="ATX", strip=["img"])

    # Trim every line, discard the ones left empty, and re-join the rest
    # with a blank line between them.
    trimmed = (raw_line.strip() for raw_line in converted.split("\n"))
    return "\n\n".join(filter(None, trimmed))
|
|
|
|
|
|
def fetch_web_page(url: str, timeout: int, max_size: int) -> dict:
    """Fetch a web page and convert its HTML to markdown.

    The response body is streamed and the download is abandoned as soon as
    more than ``max_size`` bytes have been received, so an oversized
    response is never held in memory in full.

    Args:
        url: URL to fetch content from
        timeout: Request timeout in seconds
        max_size: Maximum response size in bytes

    Returns:
        Dict with "success" and "message", plus "result" (the markdown) on
        success or "error" (a short error code) on failure. Error codes:
        invalid_url, content_too_large, timeout, http_error, request_error,
        unknown_error. For content_too_large the byte count in the message
        is the amount read before the download was aborted.
    """
    # Validate URL
    if not is_valid_url(url):
        return {
            "success": False,
            "error": "invalid_url",
            "message": f"URL must start with http:// or https://. Got: {url}",
        }

    try:
        with httpx.Client(timeout=timeout, follow_redirects=True) as client:
            # Stream instead of client.get(): get() buffers the whole body
            # before any size check can run, which defeats the limit.
            with client.stream("GET", url) as response:
                response.raise_for_status()

                chunks = []
                content_length = 0
                # iter_bytes() yields the decoded (e.g. decompressed) body,
                # the same bytes response.content would have held.
                for chunk in response.iter_bytes():
                    content_length += len(chunk)
                    if content_length > max_size:
                        # Leaving the with-blocks closes the connection, so
                        # the remainder of the body is never downloaded.
                        return {
                            "success": False,
                            "error": "content_too_large",
                            "message": f"Response size ({content_length} bytes) exceeds maximum ({max_size} bytes)",
                        }
                    chunks.append(chunk)

                # Resolve the charset while the response is still open.
                encoding = response.encoding or "utf-8"

        # Undecodable bytes are replaced rather than raising.
        html = b"".join(chunks).decode(encoding, errors="replace")

        # Parse HTML to markdown
        markdown_content = html_to_markdown(html)

        return {
            "success": True,
            "result": markdown_content,
            "message": f"Successfully fetched {len(markdown_content)} characters from {url}",
        }

    except httpx.TimeoutException:
        return {
            "success": False,
            "error": "timeout",
            "message": f"Request timed out after {timeout} seconds",
        }

    except httpx.HTTPStatusError as e:
        return {
            "success": False,
            "error": "http_error",
            "message": f"HTTP {e.response.status_code}: {e.response.reason_phrase}",
        }

    except httpx.RequestError as e:
        return {
            "success": False,
            "error": "request_error",
            "message": f"Request failed: {str(e)}",
        }

    except Exception as e:
        # Last-resort boundary: callers always get a result dict back,
        # never an exception.
        return {
            "success": False,
            "error": "unknown_error",
            "message": f"Unexpected error: {str(e)}",
        }
|
|
|
|
|
|
def format_markdown_output(url: str, markdown: str) -> str:
    """Wrap markdown in a banner for terminal display.

    The output starts with a blank line, then a 60-character rule, a title
    line naming the source URL, another rule, the markdown padded by one
    blank line on each side, and a closing rule.

    Args:
        url: Source URL
        markdown: Markdown content

    Returns:
        Formatted string for display
    """
    rule = "=" * 60
    sections = (
        "\n" + rule,
        f"📄 Content from {url}",
        rule,
        "",
        markdown,
        "",
        rule,
    )
    return "\n".join(sections)
|
|
|
|
|
|
@click.command()
@click.option("--url", required=True, help="URL to fetch content from")
@click.option(
    "--timeout",
    type=int,
    default=None,
    help=f"Request timeout in seconds (default: {DEFAULT_TIMEOUT})",
)
@click.option(
    "--max-size",
    type=int,
    default=None,
    help=f"Maximum response size in bytes (default: {DEFAULT_MAX_SIZE})",
)
@click.option(
    "--json",
    "output_json",
    is_flag=True,
    help="Output as JSON instead of human-readable format",
)
def main(url: str, timeout: int | None, max_size: int | None, output_json: bool) -> None:
    """
    Fetch and parse web page content.

    Retrieves HTML content from the specified URL and converts it to clean
    markdown format. Handles timeouts, HTTP errors, and invalid URLs gracefully.

    Environment variables:
        WEB_FETCH_TIMEOUT - Default timeout in seconds
        WEB_FETCH_MAX_SIZE - Default max response size in bytes

    Examples:
        uv run fetch.py --url "https://example.com"
        uv run fetch.py --url "https://example.com" --json
        uv run fetch.py --url "https://example.com" --timeout 60
    """
    try:
        # Precedence for both settings: CLI flag, then environment
        # variable, then the module default. A non-numeric environment
        # value makes int() raise, which the handler below reports.
        effective_timeout = (
            timeout
            if timeout is not None
            else int(os.environ.get("WEB_FETCH_TIMEOUT", str(DEFAULT_TIMEOUT)))
        )
        effective_max_size = (
            max_size
            if max_size is not None
            else int(os.environ.get("WEB_FETCH_MAX_SIZE", str(DEFAULT_MAX_SIZE)))
        )

        outcome = fetch_web_page(url, effective_timeout, effective_max_size)

        # JSON mode prints the raw result dict on stdout for success and
        # failure alike; human mode prints content on stdout, errors on stderr.
        if output_json:
            click.echo(json.dumps(outcome, indent=2))
        elif outcome["success"]:
            click.echo(format_markdown_output(url, outcome["result"]))
        else:
            click.echo(f"❌ Error ({outcome['error']}): {outcome['message']}", err=True)

        # sys.exit raises SystemExit, which is not an Exception subclass,
        # so the handler below does not intercept it.
        exit_code = 0 if outcome["success"] else 1
        sys.exit(exit_code)

    except Exception as exc:
        # Failures in the script itself (as opposed to fetch failures) are
        # reported in the same output mode the caller asked for.
        if output_json:
            failure = {"success": False, "error": "script_error", "message": str(exc)}
            click.echo(json.dumps(failure, indent=2))
        else:
            click.echo(f"❌ Script Error: {exc}", err=True)
        sys.exit(1)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|