Initial commit
This commit is contained in:
245
skills/web/scripts/fetch.py
Executable file
@@ -0,0 +1,245 @@
#!/usr/bin/env python3
# /// script
# dependencies = [
#     "httpx",
#     "beautifulsoup4",
#     "markdownify",
#     "click",
# ]
# ///

"""
Web Fetch Script

Fetch and parse web page content, converting HTML to clean markdown.
Supports timeout configuration and size limits.

Usage:
    uv run fetch.py --url "https://example.com"
    uv run fetch.py --url "https://example.com" --json
    uv run fetch.py --url "https://example.com" --timeout 60
"""

import json
import os
import sys
from urllib.parse import urlparse

import click
import httpx
from bs4 import BeautifulSoup  # type: ignore
from markdownify import markdownify as md  # type: ignore

# Configuration defaults
DEFAULT_TIMEOUT = 30  # seconds
DEFAULT_MAX_SIZE = 1048576  # 1MB


def is_valid_url(url: str) -> bool:
    """Validate URL format.

    Args:
        url: URL to validate

    Returns:
        True if URL is valid (http/https), False otherwise
    """
    try:
        parsed = urlparse(url)
        return parsed.scheme in ("http", "https") and bool(parsed.netloc)
    except Exception:
        return False
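
# Illustrative sanity check (not part of the original commit): only absolute
# http/https URLs with a host component pass validation.
#
#     >>> is_valid_url("https://example.com")
#     True
#     >>> is_valid_url("ftp://example.com")    # wrong scheme
#     False
#     >>> is_valid_url("example.com")          # no scheme or netloc
#     False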


def html_to_markdown(html: str) -> str:
    """Convert HTML content to clean markdown.

    Removes scripts, styles, and other non-content elements before conversion.

    Args:
        html: HTML content to convert

    Returns:
        Markdown representation of the content
    """
    # Parse HTML
    soup = BeautifulSoup(html, "html.parser")

    # Remove script, style, and boilerplate layout elements
    for element in soup(["script", "style", "nav", "footer", "header"]):
        element.decompose()

    # Convert to markdown
    markdown = md(str(soup), heading_style="ATX", strip=["img"])

    # Clean up excessive whitespace
    lines = [line.strip() for line in markdown.split("\n")]
    clean_lines = [line for line in lines if line]  # Remove empty lines

    return "\n\n".join(clean_lines)


def fetch_web_page(url: str, timeout: int, max_size: int) -> dict:
    """Fetch and parse web page content.

    Args:
        url: URL to fetch content from
        timeout: Request timeout in seconds
        max_size: Maximum response size in bytes

    Returns:
        Dict with success, result/error, and message
    """
    # Validate URL
    if not is_valid_url(url):
        return {
            "success": False,
            "error": "invalid_url",
            "message": f"URL must start with http:// or https://. Got: {url}",
        }

    try:
        # Fetch content with timeout
        with httpx.Client(timeout=timeout, follow_redirects=True) as client:
            response = client.get(url)
            response.raise_for_status()

            # Check content size
            content_length = len(response.content)
            if content_length > max_size:
                return {
                    "success": False,
                    "error": "content_too_large",
                    "message": f"Response size ({content_length} bytes) exceeds maximum ({max_size} bytes)",
                }

            # Parse HTML to markdown
            markdown_content = html_to_markdown(response.text)

            return {
                "success": True,
                "result": markdown_content,
                "message": f"Successfully fetched {len(markdown_content)} characters from {url}",
            }

    except httpx.TimeoutException:
        return {
            "success": False,
            "error": "timeout",
            "message": f"Request timed out after {timeout} seconds",
        }

    except httpx.HTTPStatusError as e:
        return {
            "success": False,
            "error": "http_error",
            "message": f"HTTP {e.response.status_code}: {e.response.reason_phrase}",
        }

    except httpx.RequestError as e:
        return {
            "success": False,
            "error": "request_error",
            "message": f"Request failed: {e}",
        }

    except Exception as e:
        return {
            "success": False,
            "error": "unknown_error",
            "message": f"Unexpected error: {e}",
        }
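
# Shape of the returned dict, for reference (values are illustrative):
#
#     {"success": True, "result": "# Example Domain\n\n...", "message": "Successfully fetched 128 characters from https://example.com"}
#     {"success": False, "error": "timeout", "message": "Request timed out after 30 seconds"}
#
# Note that the size limit is enforced after the body has been downloaded in
# full, so it bounds what gets parsed rather than what goes over the wire.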


def format_markdown_output(url: str, markdown: str) -> str:
    """Format markdown for human-readable output.

    Args:
        url: Source URL
        markdown: Markdown content

    Returns:
        Formatted string for display
    """
    lines = []
    lines.append("\n" + "=" * 60)
    lines.append(f"📄 Content from {url}")
    lines.append("=" * 60)
    lines.append("")
    lines.append(markdown)
    lines.append("")
    lines.append("=" * 60)
    return "\n".join(lines)


@click.command()
@click.option("--url", required=True, help="URL to fetch content from")
@click.option(
    "--timeout",
    default=None,
    type=int,
    help=f"Request timeout in seconds (default: {DEFAULT_TIMEOUT})",
)
@click.option(
    "--max-size",
    default=None,
    type=int,
    help=f"Maximum response size in bytes (default: {DEFAULT_MAX_SIZE})",
)
@click.option(
    "--json",
    "output_json",
    is_flag=True,
    help="Output as JSON instead of human-readable format",
)
def main(url: str, timeout: int | None, max_size: int | None, output_json: bool) -> None:
    """
    Fetch and parse web page content.

    Retrieves HTML content from the specified URL and converts it to clean
    markdown format. Handles timeouts, HTTP errors, and invalid URLs gracefully.

    Environment variables:
        WEB_FETCH_TIMEOUT - Default timeout in seconds
        WEB_FETCH_MAX_SIZE - Default max response size in bytes

    Examples:
        uv run fetch.py --url "https://example.com"
        uv run fetch.py --url "https://example.com" --json
        uv run fetch.py --url "https://example.com" --timeout 60
    """
    try:
        # Get timeout from env or use default
        if timeout is None:
            timeout = int(os.getenv("WEB_FETCH_TIMEOUT", str(DEFAULT_TIMEOUT)))

        # Get max_size from env or use default
        if max_size is None:
            max_size = int(os.getenv("WEB_FETCH_MAX_SIZE", str(DEFAULT_MAX_SIZE)))

        # Fetch and parse
        result = fetch_web_page(url, timeout, max_size)

        # Output results
        if output_json:
            click.echo(json.dumps(result, indent=2))
        else:
            if result["success"]:
                formatted = format_markdown_output(url, result["result"])
                click.echo(formatted)
            else:
                click.echo(f"❌ Error ({result['error']}): {result['message']}", err=True)

        sys.exit(0 if result["success"] else 1)

    except Exception as e:
        if output_json:
            error_data = {"success": False, "error": "script_error", "message": str(e)}
            click.echo(json.dumps(error_data, indent=2))
        else:
            click.echo(f"❌ Script Error: {e}", err=True)
        sys.exit(1)


if __name__ == "__main__":
    main()