Initial commit

Zhongwei Li committed 2025-11-29 18:17:27 +08:00
commit 9b5e9af436
6 changed files with 583 additions and 0 deletions

skills/web/scripts/fetch.py (executable file, 245 lines)

@@ -0,0 +1,245 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "httpx",
#     "beautifulsoup4",
#     "markdownify",
#     "click",
# ]
# ///
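# The block above is PEP 723 inline script metadata: `uv run` reads it and
# resolves the listed dependencies in an ephemeral environment before running
# the script (requires-python covers the `int | None` hints used below).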
"""
Web Fetch Script
Fetch and parse web page content, converting HTML to clean markdown.
Supports timeout configuration and size limits.
Usage:
uv run fetch.py --url "https://example.com"
uv run fetch.py --url "https://example.com" --json
uv run fetch.py --url "https://example.com" --timeout 60
"""
import json
import os
import sys
from urllib.parse import urlparse
import click
import httpx
from bs4 import BeautifulSoup # type: ignore
from markdownify import markdownify as md  # type: ignore

# Configuration defaults
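# Both defaults can be overridden via the WEB_FETCH_TIMEOUT / WEB_FETCH_MAX_SIZE
# environment variables or the corresponding CLI flags (see main()).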
DEFAULT_TIMEOUT = 30 # seconds
DEFAULT_MAX_SIZE = 1048576  # 1 MiB


def is_valid_url(url: str) -> bool:
"""Validate URL format.
Args:
url: URL to validate
Returns:
True if URL is valid (http/https), False otherwise
"""
try:
parsed = urlparse(url)
return parsed.scheme in ("http", "https") and bool(parsed.netloc)
except Exception:
        return False


def html_to_markdown(html: str) -> str:
"""Convert HTML content to clean markdown.
Removes scripts, styles, and other non-content elements before conversion.
Args:
html: HTML content to convert
Returns:
Markdown representation of the content
"""
# Parse HTML
soup = BeautifulSoup(html, "html.parser")
    # Remove script/style elements and common page chrome (nav, footer, header)
for element in soup(["script", "style", "nav", "footer", "header"]):
element.decompose()
# Convert to markdown
markdown = md(str(soup), heading_style="ATX", strip=["img"])
# Clean up excessive whitespace
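    # (note: every non-empty line becomes its own paragraph below, so
    # blank-line structure from the source document is flattened)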
lines = [line.strip() for line in markdown.split("\n")]
clean_lines = [line for line in lines if line] # Remove empty lines
return "\n\n".join(clean_lines)
def fetch_web_page(url: str, timeout: int, max_size: int) -> dict:
"""Fetch and parse web page content.
Args:
url: URL to fetch content from
timeout: Request timeout in seconds
max_size: Maximum response size in bytes
Returns:
Dict with success, result/error, and message
"""
# Validate URL
if not is_valid_url(url):
return {
"success": False,
"error": "invalid_url",
"message": f"URL must start with http:// or https://. Got: {url}",
}
try:
# Fetch content with timeout
with httpx.Client(timeout=timeout, follow_redirects=True) as client:
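            # follow_redirects=True: 301/302 chains are chased to the final page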
response = client.get(url)
response.raise_for_status()
# Check content size
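            # (response.content has already buffered the full body at this
            # point, so max_size caps what gets processed, not what is downloaded)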
content_length = len(response.content)
if content_length > max_size:
return {
"success": False,
"error": "content_too_large",
"message": f"Response size ({content_length} bytes) exceeds maximum ({max_size} bytes)",
}
# Parse HTML to markdown
markdown_content = html_to_markdown(response.text)
return {
"success": True,
"result": markdown_content,
"message": f"Successfully fetched {len(markdown_content)} characters from {url}",
}
except httpx.TimeoutException:
return {
"success": False,
"error": "timeout",
"message": f"Request timed out after {timeout} seconds",
}
except httpx.HTTPStatusError as e:
return {
"success": False,
"error": "http_error",
"message": f"HTTP {e.response.status_code}: {e.response.reason_phrase}",
}
except httpx.RequestError as e:
return {
"success": False,
"error": "request_error",
"message": f"Request failed: {str(e)}",
}
except Exception as e:
return {
"success": False,
"error": "unknown_error",
"message": f"Unexpected error: {str(e)}",
        }


def format_markdown_output(url: str, markdown: str) -> str:
"""Format markdown for human-readable output.
Args:
url: Source URL
markdown: Markdown content
Returns:
Formatted string for display
"""
lines = []
lines.append("\n" + "=" * 60)
lines.append(f"📄 Content from {url}")
lines.append("=" * 60)
lines.append("")
lines.append(markdown)
lines.append("")
lines.append("=" * 60)
return "\n".join(lines)
@click.command()
@click.option("--url", required=True, help="URL to fetch content from")
@click.option(
"--timeout",
default=None,
type=int,
help=f"Request timeout in seconds (default: {DEFAULT_TIMEOUT})",
)
@click.option(
"--max-size",
default=None,
type=int,
help=f"Maximum response size in bytes (default: {DEFAULT_MAX_SIZE})",
)
@click.option(
"--json",
"output_json",
is_flag=True,
help="Output as JSON instead of human-readable format",
)
def main(url: str, timeout: int | None, max_size: int | None, output_json: bool) -> None:
"""
Fetch and parse web page content.
Retrieves HTML content from the specified URL and converts it to clean
markdown format. Handles timeouts, HTTP errors, and invalid URLs gracefully.
Environment variables:
WEB_FETCH_TIMEOUT - Default timeout in seconds
WEB_FETCH_MAX_SIZE - Default max response size in bytes
Examples:
uv run fetch.py --url "https://example.com"
uv run fetch.py --url "https://example.com" --json
uv run fetch.py --url "https://example.com" --timeout 60
"""
try:
# Get timeout from env or use default
if timeout is None:
timeout = int(os.getenv("WEB_FETCH_TIMEOUT", str(DEFAULT_TIMEOUT)))
# Get max_size from env or use default
if max_size is None:
max_size = int(os.getenv("WEB_FETCH_MAX_SIZE", str(DEFAULT_MAX_SIZE)))
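        # Precedence: explicit CLI flag > environment variable > built-in default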
# Fetch and parse
result = fetch_web_page(url, timeout, max_size)
# Output results
if output_json:
click.echo(json.dumps(result, indent=2))
else:
if result["success"]:
formatted = format_markdown_output(url, result["result"])
click.echo(formatted)
else:
click.echo(f"❌ Error ({result['error']}): {result['message']}", err=True)
sys.exit(0 if result["success"] else 1)
except Exception as e:
if output_json:
error_data = {"success": False, "error": "script_error", "message": str(e)}
click.echo(json.dumps(error_data, indent=2))
else:
click.echo(f"❌ Script Error: {e}", err=True)
        sys.exit(1)


if __name__ == "__main__":
main()
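A note on the output contract: with --json, the script prints the dict returned by fetch_web_page verbatim. The transcript below is an illustrative sketch (the result text and character count depend on the page fetched), not captured output:

uv run fetch.py --url "https://example.com" --json
{
  "success": true,
  "result": "# Example Domain\n\n...",
  "message": "Successfully fetched 123 characters from https://example.com"
}

On failure, "result" is replaced by "error" and "message", and the process exits with status 1 (status 0 on success).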