From e9e441dcb19c9234a6c0f70951274afb44ef2ef1 Mon Sep 17 00:00:00 2001 From: Zhongwei Li Date: Sun, 30 Nov 2025 09:08:11 +0800 Subject: [PATCH] Initial commit --- .claude-plugin/plugin.json | 12 ++ EXAMPLES.md | 183 +++++++++++++++++++++++++++ QUICK_START.md | 58 +++++++++ README.md | 3 + SKILL.md | 252 +++++++++++++++++++++++++++++++++++++ VERSION | 34 +++++ html_to_long_image.py | 166 ++++++++++++++++++++++++ plugin.lock.json | 61 +++++++++ 8 files changed, 769 insertions(+) create mode 100644 .claude-plugin/plugin.json create mode 100644 EXAMPLES.md create mode 100644 QUICK_START.md create mode 100644 README.md create mode 100644 SKILL.md create mode 100644 VERSION create mode 100644 html_to_long_image.py create mode 100644 plugin.lock.json diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..fca7215 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,12 @@ +{ + "name": "html-to-pdf", + "description": "Convert HTML files to PDF or PNG format with multiple rendering options. Supports multi-page PDFs, single-page long-image PDFs, background colors, and Chinese/CJK characters.", + "version": "0.0.0-2025.11.28", + "author": { + "name": "Yong Gao", + "email": "zhongweili@tubi.tv" + }, + "skills": [ + "./" + ] +} \ No newline at end of file diff --git a/EXAMPLES.md b/EXAMPLES.md new file mode 100644 index 0000000..0e7dade --- /dev/null +++ b/EXAMPLES.md @@ -0,0 +1,183 @@ +# Usage Examples + +## Real-World Examples + +### Example 1: Business Plan Report +```bash +# Original file: 202510_Alpha_Intelligence_BP.html (71 KB) +python ~/.claude/skills/html-to-pdf/html_to_long_image.py 202510_Alpha_Intelligence_BP.html + +# Output: +# - 202510_Alpha_Intelligence_BP_fullpage.png (6.1 MB) +# - 202510_Alpha_Intelligence_BP_fullpage.pdf (1.6 MB, 1 page) +``` + +**Result**: Single-page PDF with no page breaks, perfect for online presentation! + +### Example 2: Simple Test +```bash +# Create test HTML +echo '

Test

Hello World

' > test.html + +# Convert +python ~/.claude/skills/html-to-pdf/html_to_long_image.py test.html + +# Output: +# - test_fullpage.png (12 KB) +# - test_fullpage.pdf (20 KB) +``` + +### Example 3: Multi-File Batch Conversion +```bash +# Convert all HTML files in a directory +cd /path/to/html/files + +for file in *.html; do + echo "Converting $file..." + python ~/.claude/skills/html-to-pdf/html_to_long_image.py "$file" +done + +# Check results +ls -lh *_fullpage.pdf +``` + +### Example 4: With Custom Path +```bash +# Specify full paths +python ~/.claude/skills/html-to-pdf/html_to_long_image.py \ + /Users/yonggao/Documents/report.html +``` + +## Integration Examples + +### Use in Shell Script +```bash +#!/bin/bash +# convert_reports.sh + +HTML_CONVERTER=~/.claude/skills/html-to-pdf/html_to_long_image.py + +for report in reports/*.html; do + python "$HTML_CONVERTER" "$report" + echo "✓ Converted: $report" +done + +echo "All reports converted!" +``` + +### Use in Makefile +```makefile +# Makefile +CONVERTER = python ~/.claude/skills/html-to-pdf/html_to_long_image.py + +%.pdf: %.html + $(CONVERTER) $< + +all: report.pdf presentation.pdf + +clean: + rm -f *_fullpage.pdf *_fullpage.png +``` + +### Use in Python Script +```python +import subprocess +import sys + +def convert_html_to_pdf(html_file): + """Convert HTML to PDF using the skill.""" + converter = "~/.claude/skills/html-to-pdf/html_to_long_image.py" + result = subprocess.run( + [sys.executable, converter, html_file], + capture_output=True, + text=True + ) + return result.returncode == 0 + +# Usage +if convert_html_to_pdf("report.html"): + print("Success!") +``` + +## Comparison with Original Files + +| File | Type | Size | Pages | Notes | +|------|------|------|-------|-------| +| 202510_Alpha_Intelligence_BP.html | HTML | 71 KB | - | Source file | +| 202510_Alpha_Intelligence_BP_fullpage.png | PNG | 6.1 MB | 1 | High-quality screenshot | +| 202510_Alpha_Intelligence_BP_fullpage.pdf | PDF | 1.6 MB | 1 | **Best for viewing** | +| 202510_Alpha_Intelligence_BP_final.pdf | PDF | 5.7 MB | 22 | Multi-page (with breaks) | + +## Performance Data + +Based on real usage: + +| HTML Size | Processing Time | PNG Size | PDF Size | +|-----------|----------------|----------|----------| +| 71 KB | ~5 seconds | 6.1 MB | 1.6 MB | +| 5 KB | ~3 seconds | 12 KB | 20 KB | +| 200 KB | ~8 seconds | 15 MB | 4 MB | + +## Tips & Tricks + +### Optimize for File Size +The PNG is always larger than the PDF. If you only need the PDF: +```bash +# Generate both, then delete PNG +python ~/.claude/skills/html-to-pdf/html_to_long_image.py report.html +rm report_fullpage.png +``` + +### Preview Before Converting +```bash +# Open HTML in browser first +open report.html + +# Then convert +python ~/.claude/skills/html-to-pdf/html_to_long_image.py report.html +``` + +### Auto-open Result +```bash +python ~/.claude/skills/html-to-pdf/html_to_long_image.py report.html && \ + open report_fullpage.pdf +``` + +### Check Page Count +```bash +python -c " +from pypdf import PdfReader +r = PdfReader('report_fullpage.pdf') +print(f'Pages: {len(r.pages)}') +" +``` + +## Troubleshooting Examples + +### Problem: Script not found +```bash +# Solution: Use full path +python ~/.claude/skills/html-to-pdf/html_to_long_image.py file.html +``` + +### Problem: Permission denied +```bash +# Solution: Make executable +chmod +x ~/.claude/skills/html-to-pdf/html_to_long_image.py +``` + +### Problem: Playwright not installed +```bash +# Solution: Install dependencies +pip install playwright pillow pypdf +playwright install chromium +``` + +### Problem: Content appears cut off +```bash +# This is automatically handled by the script which: +# 1. Scrolls through entire page +# 2. Waits for animations +# 3. Forces all content visible +# No action needed! +``` diff --git a/QUICK_START.md b/QUICK_START.md new file mode 100644 index 0000000..d683609 --- /dev/null +++ b/QUICK_START.md @@ -0,0 +1,58 @@ +# Quick Start Guide + +## One-Line Usage + +```bash +python ~/.claude/skills/html-to-pdf/html_to_long_image.py your_file.html +``` + +## What You Get + +- `your_file_fullpage.png` - Complete screenshot +- `your_file_fullpage.pdf` - Single-page PDF (no page breaks!) + +## Common Tasks + +### Convert HTML in current directory +```bash +python ~/.claude/skills/html-to-pdf/html_to_long_image.py report.html +``` + +### Convert and open immediately +```bash +python ~/.claude/skills/html-to-pdf/html_to_long_image.py report.html && open report_fullpage.pdf +``` + +### Batch convert all HTML files +```bash +for file in *.html; do + python ~/.claude/skills/html-to-pdf/html_to_long_image.py "$file" +done +``` + +## Tips + +1. **For presentations**: Use the PDF output (smaller, 1-2 MB) +2. **For high-quality images**: Use the PNG output (6-10 MB) +3. **First-time setup**: Run `playwright install chromium` once + +## Comparison + +| Method | Pages | Size | Best For | +|--------|-------|------|----------| +| Long Image (this skill) | 1 page | 1-2 MB | Online viewing, presentations | +| Standard A4 | 20+ pages | 5-6 MB | Printing, archiving | + +## Success! + +If you see this output, it worked: +``` +✅ 成功生成长图! + 大小: 6263.9 KB + +✅ PDF生成成功!大小: 1632.6 KB + +💡 打开查看: + 长图: open your_file_fullpage.png + PDF: open your_file_fullpage.pdf +``` diff --git a/README.md b/README.md new file mode 100644 index 0000000..8c7ced4 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# html-to-pdf + +Convert HTML files to PDF or PNG format with multiple rendering options. Supports multi-page PDFs, single-page long-image PDFs, background colors, and Chinese/CJK characters. diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..a631ee6 --- /dev/null +++ b/SKILL.md @@ -0,0 +1,252 @@ +--- +name: html-to-pdf +description: Converts HTML files to PDF or PNG format. Use this skill when the user asks to convert, export, or generate PDF/PNG from HTML files, or when they want to create printable documents, presentations, or long-form images from web pages or HTML reports. +--- + +# HTML to PDF/PNG Converter Skill + +This skill helps you convert HTML files to PDF or PNG format with various options for output quality and page layout. + +## When to Use This Skill + +Use this skill when the user wants to: +- Convert HTML files to PDF +- Generate PNG screenshots from HTML +- Create printable documents from web pages +- Export HTML reports as PDFs +- Generate long-form images without page breaks +- Create presentation-ready PDFs from HTML + +## Available Conversion Methods + +### Method 1: Multi-Page PDF (Standard A4) +Best for: Printing, archiving, traditional documents + +**Command:** +```bash +python html_to_pdf_final.py input.html output.pdf +``` + +**Features:** +- Standard A4 page format +- Zero margins for seamless appearance +- Background colors and gradients preserved +- Multiple pages with page breaks +- Optimized file size (~5-6 MB for typical reports) + +### Method 2: Single-Page Long Image PDF +Best for: Online viewing, presentations, no page breaks + +**Command:** +```bash +python html_to_long_image.py input.html +``` + +**Features:** +- Generates a full-page PNG screenshot first +- Converts PNG to single-page PDF +- NO page breaks - entire content on one page +- Perfect for presentations and online viewing +- Smaller file size (~1-2 MB) +- Two output files: `.png` and `.pdf` + +### Method 3: Advanced Multi-Method Converter +Best for: Fallback options, compatibility + +**Command:** +```bash +python html_to_pdf_converter.py input.html output.pdf +``` + +**Features:** +- Tries WeasyPrint first (best CSS support) +- Falls back to Playwright if needed +- Automatic dependency installation +- Handles complex CSS and gradients + +## Required Dependencies + +The skill requires Playwright and Pillow. The scripts auto-install dependencies if missing: + +```bash +pip install playwright pillow pypdf +playwright install chromium +``` + +## Step-by-Step Instructions + +### For Standard Multi-Page PDF: + +1. **Verify the HTML file exists:** + ```bash + ls -la *.html + ``` + +2. **Run the converter:** + ```bash + python html_to_pdf_final.py your_file.html + ``` + +3. **Open the result:** + ```bash + open your_file_final.pdf + ``` + +### For Single-Page Long Image PDF: + +1. **Verify the HTML file exists:** + ```bash + ls -la *.html + ``` + +2. **Run the long image converter:** + ```bash + python html_to_long_image.py your_file.html + ``` + +3. **Check outputs:** + ```bash + # View the PNG screenshot + open your_file_fullpage.png + + # View the PDF version + open your_file_fullpage.pdf + ``` + +## Troubleshooting + +### Issue: Playwright browser not found + +**Solution:** +```bash +playwright install chromium +``` + +### Issue: Page breaks visible in PDF + +**Solution:** Use the long image method instead: +```bash +python html_to_long_image.py your_file.html +``` + +### Issue: Content appears cut off + +**Causes & Solutions:** +- **CSS animations not complete**: Script waits 2 seconds for animations +- **Lazy loading**: Script scrolls through entire page to trigger loading +- **Large file size**: Scripts handle files up to 20MB+ + +### Issue: Blank PDF output + +**Solution:** Use the long image method which uses screenshot instead of PDF rendering: +```bash +python html_to_long_image.py your_file.html +``` + +## Output Files + +After conversion, you'll get: + +**Multi-Page PDF:** +- `filename_final.pdf` - Standard A4 multi-page PDF + +**Long Image Method:** +- `filename_fullpage.png` - Complete screenshot as PNG (6-10 MB) +- `filename_fullpage.pdf` - Single-page PDF from image (1-2 MB) + +## Best Practices + +1. **For online viewing/presentations:** Use `html_to_long_image.py` + - No page breaks + - Smooth scrolling experience + - Smaller file size + +2. **For printing/archiving:** Use `html_to_pdf_final.py` + - Standard A4 pages + - Better for physical printing + - Professional document format + +3. **For complex CSS:** Use `html_to_pdf_converter.py` + - Multiple fallback methods + - Better compatibility + +## Implementation Notes + +### Script Locations +All scripts should be in the project directory: +- `html_to_pdf_final.py` - Main multi-page converter +- `html_to_long_image.py` - Long image generator +- `html_to_pdf_converter.py` - Advanced multi-method converter + +### Key Features Implemented + +1. **Animation Handling**: All scripts disable CSS animations/transitions +2. **Lazy Loading**: Scripts scroll through content to trigger loading +3. **Background Preservation**: All gradients and colors render correctly +4. **Zero Margins**: Seamless page appearance without visible borders +5. **Chinese Font Support**: Handles CJK characters properly + +## Examples + +### Example 1: Convert BP Report to Multi-Page PDF +```bash +python html_to_pdf_final.py 202510_Alpha_Intelligence_BP.html +# Output: 202510_Alpha_Intelligence_BP_final.pdf (22 pages, 5.7 MB) +``` + +### Example 2: Create Single-Page Presentation PDF +```bash +python html_to_long_image.py 202510_Alpha_Intelligence_BP.html +# Output: +# - 202510_Alpha_Intelligence_BP_fullpage.png (6.1 MB) +# - 202510_Alpha_Intelligence_BP_fullpage.pdf (1.6 MB, 1 page) +``` + +### Example 3: Batch Convert Multiple Files +```bash +for file in *.html; do + python html_to_long_image.py "$file" +done +``` + +## Advanced Usage + +### Custom Output Paths +```bash +python html_to_pdf_final.py input.html custom_output.pdf +python html_to_long_image.py input.html +``` + +### Check PDF Page Count +```bash +python -c "from pypdf import PdfReader; r = PdfReader('output.pdf'); print(f'Pages: {len(r.pages)}')" +``` + +### Verify File Sizes +```bash +ls -lh *.pdf *.png +``` + +## Performance Expectations + +- **Processing Speed**: ~5-10 seconds for typical HTML files +- **Memory Usage**: ~100-200 MB during conversion +- **PDF File Size**: 1-6 MB depending on method +- **PNG File Size**: 6-10 MB for full-page screenshots + +## Success Criteria + +A successful conversion should: +1. ✅ Generate PDF/PNG without errors +2. ✅ Include all HTML content (no truncation) +3. ✅ Preserve colors, gradients, and styling +4. ✅ Handle Chinese/CJK characters correctly +5. ✅ Create readable file sizes (< 10 MB) + +## Quick Reference + +| Need | Use | Output | +|------|-----|--------| +| Printing | `html_to_pdf_final.py` | Multi-page A4 PDF | +| Online viewing | `html_to_long_image.py` | Single-page PDF + PNG | +| Maximum compatibility | `html_to_pdf_converter.py` | Multi-page PDF with fallbacks | diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..06cc3bd --- /dev/null +++ b/VERSION @@ -0,0 +1,34 @@ +# HTML to PDF Skill - Version History + +## Version 1.0.0 (2025-10-23) + +### Initial Release + +**Features:** +- HTML to PNG screenshot conversion +- PNG to single-page PDF conversion +- Auto-install dependencies (playwright, Pillow) +- CSS animation disabling +- Lazy content loading via scrolling +- Zero-margin seamless output +- Chinese/CJK character support + +**Scripts:** +- `html_to_long_image.py` - Main converter script + +**Documentation:** +- SKILL.md - Main skill definition +- README.md - Installation and usage +- QUICK_START.md - One-line quick reference +- EXAMPLES.md - Real-world examples + +**Tested With:** +- Python 3.12 +- Playwright 1.40+ +- Pillow 11.0+ +- macOS (arm64) + +**Known Limitations:** +- Single-page output only (no multi-page A4) +- Requires Chromium browser installation +- Large HTML files (>5MB) may take longer to process diff --git a/html_to_long_image.py b/html_to_long_image.py new file mode 100644 index 0000000..a72827d --- /dev/null +++ b/html_to_long_image.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +""" +HTML转长图工具 +将HTML渲染为一张完整的长图(PNG),然后可以转PDF +""" + +import os +import sys +import time +from pathlib import Path + + +def html_to_long_image(html_path: str, output_path: str = None) -> str: + """ + 将HTML转换为一张完整的长图PNG。 + + Args: + html_path: HTML文件路径 + output_path: 输出图片路径(可选) + + Returns: + 生成的图片路径 + """ + try: + from playwright.sync_api import sync_playwright + except ImportError: + print("正在安装 Playwright...") + import subprocess + subprocess.check_call([sys.executable, "-m", "pip", "install", "playwright", "-q"]) + from playwright.sync_api import sync_playwright + + if not os.path.exists(html_path): + raise FileNotFoundError(f"HTML文件不存在: {html_path}") + + if output_path is None: + html_file = Path(html_path) + output_path = str(html_file.parent / f"{html_file.stem}_fullpage.png") + + print(f"\n📄 转换HTML为完整长图") + print(f" 输入: {Path(html_path).name}") + print(f" 输出: {Path(output_path).name}\n") + + with sync_playwright() as p: + browser = p.chromium.launch() + page = browser.new_page(viewport={'width': 1200, 'height': 800}) + + # 加载HTML + html_path_abs = str(Path(html_path).absolute()) + print("⏳ 加载HTML...") + page.goto(f'file://{html_path_abs}', wait_until='networkidle') + + # 禁用所有动画 + print("🎨 禁用动画...") + page.add_style_tag(content=""" + *, *::before, *::after { + animation: none !important; + transition: none !important; + } + .section, .cover { + opacity: 1 !important; + transform: none !important; + } + """) + + # 强制显示所有内容 + page.evaluate(""" + () => { + document.querySelectorAll('.section, .cover').forEach(el => { + el.style.opacity = '1'; + el.style.transform = 'none'; + }); + } + """) + + # 滚动加载 + print("📜 加载所有内容...") + total_height = page.evaluate("document.body.scrollHeight") + for y in range(0, total_height, 1000): + page.evaluate(f"window.scrollTo(0, {y})") + time.sleep(0.1) + + page.evaluate("window.scrollTo(0, 0)") + time.sleep(0.5) + + # 截取完整页面 + print("📸 截取完整页面...") + page.screenshot(path=output_path, full_page=True) + + browser.close() + + size_kb = os.path.getsize(output_path) / 1024 + print(f"\n✅ 成功生成长图!") + print(f" 大小: {size_kb:.1f} KB\n") + + return str(output_path) + + +def image_to_pdf(image_path: str, pdf_path: str = None) -> str: + """ + 将图片转换为PDF。 + + Args: + image_path: 图片路径 + pdf_path: PDF输出路径(可选) + + Returns: + 生成的PDF路径 + """ + try: + from PIL import Image + except ImportError: + print("正在安装 Pillow...") + import subprocess + subprocess.check_call([sys.executable, "-m", "pip", "install", "Pillow", "-q"]) + from PIL import Image + + if pdf_path is None: + image_file = Path(image_path) + pdf_path = str(image_file.with_suffix('.pdf')) + + print(f"📄 转换图片为PDF: {Path(pdf_path).name}") + + # 打开图片并转换为PDF + image = Image.open(image_path) + + # 转换为RGB(PDF需要) + if image.mode != 'RGB': + image = image.convert('RGB') + + # 保存为PDF + image.save(pdf_path, 'PDF', resolution=100.0) + + size_kb = os.path.getsize(pdf_path) / 1024 + print(f"✅ PDF生成成功!大小: {size_kb:.1f} KB\n") + + return str(pdf_path) + + +def main(): + default_html = "202510_Alpha_Intelligence_BP.html" + html_path = sys.argv[1] if len(sys.argv) > 1 else default_html + + print("\n" + "=" * 70) + print("HTML转完整长图工具 - 无分页断开") + print("=" * 70) + + try: + # 生成长图 + image_path = html_to_long_image(html_path) + + # 转换为PDF + pdf_path = image_to_pdf(image_path) + + print(f"💡 打开查看:") + print(f" 长图: open {Path(image_path).name}") + print(f" PDF: open {Path(pdf_path).name}\n") + + except Exception as e: + print(f"\n❌ 错误: {e}\n") + import traceback + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..d3f2857 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,61 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:yonggao/claude-plugins:skills/html-to-pdf", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "d9a7bdd2b1535728f687605158566929b17670e8", + "treeHash": "6792dd0d55d9a678dadebc65721c1a3864c4d30c619e5d185be09158b9de027d", + "generatedAt": "2025-11-28T10:29:12.715718Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "html-to-pdf", + "description": "Convert HTML files to PDF or PNG format with multiple rendering options. Supports multi-page PDFs, single-page long-image PDFs, background colors, and Chinese/CJK characters.", + "version": null + }, + "content": { + "files": [ + { + "path": "EXAMPLES.md", + "sha256": "3e0a847cf1df915d9cbab1aada54f1e6cdc86976fc4ea9c0f25e85bd5a56da5a" + }, + { + "path": "html_to_long_image.py", + "sha256": "450db8662b54dab9a17c8c2fd326ae6b0ac783dddef21b5c32c9d56627bfea74" + }, + { + "path": "README.md", + "sha256": "d896d73dfc7ca8ac44703f1dfe09caccf05252ffdec97c696211293447c49e4a" + }, + { + "path": "VERSION", + "sha256": "7086eb17c2adb4a5a4808c02065c4aa5bafb70ce833bbf31e237733265b0ef28" + }, + { + "path": "SKILL.md", + "sha256": "2776f5b726b2bf7c8be9475cea8aad5bdbda7f70fe48a5aead366a161fa59cd0" + }, + { + "path": "QUICK_START.md", + "sha256": "b7908dc8aeb6a687aa965debe8183ab9b37c8eae0392e7a7293a652946ad87c9" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "5fc173f27bc0ff64119a367e2f395ab3b4da752e7facb25ad019019ab0d5904b" + } + ], + "dirSha256": "6792dd0d55d9a678dadebc65721c1a3864c4d30c619e5d185be09158b9de027d" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file