Initial commit
This commit is contained in:
610
skills/pdf-processing-pro/FORMS.md
Normal file
610
skills/pdf-processing-pro/FORMS.md
Normal file
@@ -0,0 +1,610 @@
|
||||
# PDF Form Processing Guide
|
||||
|
||||
Complete guide for processing PDF forms in production environments.
|
||||
|
||||
## Table of contents
|
||||
|
||||
- Form analysis and field detection
|
||||
- Form filling workflows
|
||||
- Field types and handling
- Validation strategies
|
||||
- Multi-page forms
|
||||
- Flattening and finalization
|
||||
- Error handling patterns
|
||||
- Production examples
|
||||
|
||||
## Form analysis
|
||||
|
||||
### Analyze form structure
|
||||
|
||||
Use `analyze_form.py` to extract complete form information:
|
||||
|
||||
```bash
|
||||
python scripts/analyze_form.py application.pdf --output schema.json
|
||||
```
|
||||
|
||||
Output format:
|
||||
|
||||
```json
|
||||
{
|
||||
"full_name": {
|
||||
"type": "text",
|
||||
"required": true,
|
||||
"max_length": 100,
|
||||
"x": 120.5,
|
||||
"y": 450.2,
|
||||
"width": 300,
|
||||
"height": 20
|
||||
},
|
||||
"date_of_birth": {
|
||||
"type": "text",
|
||||
"required": true,
|
||||
"format": "MM/DD/YYYY",
|
||||
"x": 120.5,
|
||||
"y": 400.8,
|
||||
"width": 150,
|
||||
"height": 20
|
||||
},
|
||||
"email_newsletter": {
|
||||
"type": "checkbox",
|
||||
"required": false,
|
||||
"x": 120.5,
|
||||
"y": 350.4,
|
||||
"width": 15,
|
||||
"height": 15
|
||||
},
|
||||
"preferred_contact": {
|
||||
"type": "radio",
|
||||
"required": true,
|
||||
"options": ["email", "phone", "mail"],
|
||||
"x": 120.5,
|
||||
"y": 300.0,
|
||||
"width": 200,
|
||||
"height": 60
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Programmatic analysis
|
||||
|
||||
```python
|
||||
from pypdf import PdfReader
|
||||
|
||||
reader = PdfReader("form.pdf")
|
||||
fields = reader.get_fields()
|
||||
|
||||
for field_name, field_info in fields.items():
|
||||
print(f"Field: {field_name}")
|
||||
print(f" Type: {field_info.get('/FT')}")
|
||||
print(f" Value: {field_info.get('/V')}")
|
||||
print(f" Flags: {field_info.get('/Ff', 0)}")
|
||||
print()
|
||||
```
|
||||
|
||||
## Form filling workflows
|
||||
|
||||
### Basic workflow
|
||||
|
||||
```bash
|
||||
# 1. Analyze form
|
||||
python scripts/analyze_form.py template.pdf --output schema.json
|
||||
|
||||
# 2. Prepare data
|
||||
cat > data.json << EOF
|
||||
{
|
||||
"full_name": "John Doe",
|
||||
"date_of_birth": "01/15/1990",
|
||||
"email": "john@example.com",
|
||||
"email_newsletter": true,
|
||||
"preferred_contact": "email"
|
||||
}
|
||||
EOF
|
||||
|
||||
# 3. Validate data
|
||||
python scripts/validate_form.py data.json schema.json
|
||||
|
||||
# 4. Fill form
|
||||
python scripts/fill_form.py template.pdf data.json filled.pdf
|
||||
|
||||
# 5. Flatten (optional - makes fields non-editable)
|
||||
python scripts/flatten_form.py filled.pdf final.pdf
|
||||
```
|
||||
|
||||
### Programmatic filling
|
||||
|
||||
```python
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
|
||||
reader = PdfReader("template.pdf")
|
||||
writer = PdfWriter()
|
||||
|
||||
# Clone all pages
|
||||
for page in reader.pages:
|
||||
writer.add_page(page)
|
||||
|
||||
# Fill form fields
|
||||
writer.update_page_form_field_values(
|
||||
writer.pages[0],
|
||||
{
|
||||
"full_name": "John Doe",
|
||||
"date_of_birth": "01/15/1990",
|
||||
"email": "john@example.com",
|
||||
"email_newsletter": "/Yes", # Checkbox value
|
||||
"preferred_contact": "/email" # Radio value
|
||||
}
|
||||
)
|
||||
|
||||
# Save filled form
|
||||
with open("filled.pdf", "wb") as output:
|
||||
writer.write(output)
|
||||
```
|
||||
|
||||
## Field types and handling
|
||||
|
||||
### Text fields
|
||||
|
||||
```python
|
||||
# Simple text
|
||||
field_values["customer_name"] = "Jane Smith"
|
||||
|
||||
# Formatted text (dates)
|
||||
field_values["date"] = "12/25/2024"
|
||||
|
||||
# Numbers
|
||||
field_values["amount"] = "1234.56"
|
||||
|
||||
# Multi-line text
|
||||
field_values["comments"] = "Line 1\nLine 2\nLine 3"
|
||||
```
|
||||
|
||||
### Checkboxes
|
||||
|
||||
Checkboxes typically use `/Yes` for checked, `/Off` for unchecked:
|
||||
|
||||
```python
|
||||
# Check checkbox
|
||||
field_values["agree_to_terms"] = "/Yes"
|
||||
|
||||
# Uncheck checkbox
|
||||
field_values["newsletter_opt_out"] = "/Off"
|
||||
```
|
||||
|
||||
**Note**: Some PDFs use different values. Check with `analyze_form.py`:
|
||||
|
||||
```json
{
  "some_checkbox": {
    "type": "checkbox",
    "on_value": "/On",
    "off_value": "/Off"
  }
}
```

The `on_value` entry (`/On` in this example) is the exact value to pass when checking the box.
|
||||
|
||||
### Radio buttons
|
||||
|
||||
Radio buttons are mutually exclusive options:
|
||||
|
||||
```python
|
||||
# Select one option from radio group
|
||||
field_values["preferred_contact"] = "/email"
|
||||
|
||||
# Other options in same group
|
||||
# field_values["preferred_contact"] = "/phone"
|
||||
# field_values["preferred_contact"] = "/mail"
|
||||
```
|
||||
|
||||
### Dropdown/List boxes
|
||||
|
||||
```python
|
||||
# Single selection
|
||||
field_values["country"] = "United States"
|
||||
|
||||
# List of available options in schema
|
||||
"country": {
|
||||
"type": "dropdown",
|
||||
"options": ["United States", "Canada", "Mexico", ...]
|
||||
}
|
||||
```
|
||||
|
||||
## Validation strategies
|
||||
|
||||
### Schema-based validation
|
||||
|
||||
```python
|
||||
import json
|
||||
from jsonschema import validate, ValidationError
|
||||
|
||||
# Load schema from analyze_form.py output
|
||||
with open("schema.json") as f:
|
||||
schema = json.load(f)
|
||||
|
||||
# Load form data
|
||||
with open("data.json") as f:
|
||||
data = json.load(f)
|
||||
|
||||
# Validate all fields
|
||||
errors = []
|
||||
|
||||
for field_name, field_schema in schema.items():
|
||||
value = data.get(field_name)
|
||||
|
||||
# Check required fields
|
||||
if field_schema.get("required") and not value:
|
||||
errors.append(f"Missing required field: {field_name}")
|
||||
|
||||
# Check field type
|
||||
if value and field_schema.get("type") == "text":
|
||||
if not isinstance(value, str):
|
||||
errors.append(f"Field {field_name} must be string")
|
||||
|
||||
# Check max length
|
||||
max_length = field_schema.get("max_length")
|
||||
if value and max_length and len(str(value)) > max_length:
|
||||
errors.append(f"Field {field_name} exceeds max length {max_length}")
|
||||
|
||||
# Check format (dates, emails, etc)
|
||||
format_type = field_schema.get("format")
|
||||
if value and format_type:
|
||||
if not validate_format(value, format_type):
|
||||
errors.append(f"Field {field_name} has invalid format")
|
||||
|
||||
if errors:
|
||||
print("Validation errors:")
|
||||
for error in errors:
|
||||
print(f" - {error}")
|
||||
exit(1)
|
||||
|
||||
print("Validation passed")
|
||||
```
|
||||
|
||||
### Format validation
|
||||
|
||||
```python
|
||||
import re
|
||||
from datetime import datetime
|
||||
|
||||
def validate_format(value, format_type):
|
||||
"""Validate field format."""
|
||||
|
||||
if format_type == "email":
|
||||
pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
|
||||
return re.match(pattern, value) is not None
|
||||
|
||||
elif format_type == "phone":
|
||||
# US phone: (555) 123-4567 or 555-123-4567
|
||||
pattern = r'^\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}$'
|
||||
return re.match(pattern, value) is not None
|
||||
|
||||
elif format_type == "MM/DD/YYYY":
|
||||
try:
|
||||
datetime.strptime(value, "%m/%d/%Y")
|
||||
return True
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
elif format_type == "SSN":
|
||||
# XXX-XX-XXXX
|
||||
pattern = r'^\d{3}-\d{2}-\d{4}$'
|
||||
return re.match(pattern, value) is not None
|
||||
|
||||
elif format_type == "ZIP":
|
||||
# XXXXX or XXXXX-XXXX
|
||||
pattern = r'^\d{5}(-\d{4})?$'
|
||||
return re.match(pattern, value) is not None
|
||||
|
||||
return True # Unknown format, skip validation
|
||||
```
|
||||
|
||||
## Multi-page forms
|
||||
|
||||
### Handling multi-page forms
|
||||
|
||||
```python
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
|
||||
reader = PdfReader("multi_page_form.pdf")
|
||||
writer = PdfWriter()
|
||||
|
||||
# Clone all pages
|
||||
for page in reader.pages:
|
||||
writer.add_page(page)
|
||||
|
||||
# Fill fields on page 1
|
||||
writer.update_page_form_field_values(
|
||||
writer.pages[0],
|
||||
{
|
||||
"name_page1": "John Doe",
|
||||
"email_page1": "john@example.com"
|
||||
}
|
||||
)
|
||||
|
||||
# Fill fields on page 2
|
||||
writer.update_page_form_field_values(
|
||||
writer.pages[1],
|
||||
{
|
||||
"address_page2": "123 Main St",
|
||||
"city_page2": "Springfield"
|
||||
}
|
||||
)
|
||||
|
||||
# Fill fields on page 3
|
||||
writer.update_page_form_field_values(
|
||||
writer.pages[2],
|
||||
{
|
||||
"signature_page3": "John Doe",
|
||||
"date_page3": "12/25/2024"
|
||||
}
|
||||
)
|
||||
|
||||
with open("filled_multi_page.pdf", "wb") as output:
|
||||
writer.write(output)
|
||||
```
|
||||
|
||||
### Identifying page-specific fields
|
||||
|
||||
```python
|
||||
# Analyze which fields are on which pages
|
||||
for page_num, page in enumerate(reader.pages, 1):
|
||||
fields = page.get("/Annots", [])
|
||||
|
||||
if fields:
|
||||
print(f"\nPage {page_num} fields:")
|
||||
for field_ref in fields:
|
||||
field = field_ref.get_object()
|
||||
field_name = field.get("/T", "Unknown")
|
||||
print(f" - {field_name}")
|
||||
```
|
||||
|
||||
## Flattening forms
|
||||
|
||||
### Why flatten
|
||||
|
||||
Flattening makes form fields non-editable, embedding values permanently:
|
||||
|
||||
- **Security**: Prevent modifications
|
||||
- **Distribution**: Share read-only forms
|
||||
- **Printing**: Ensure correct appearance
|
||||
- **Archival**: Long-term storage
|
||||
|
||||
### Flatten with pypdf
|
||||
|
||||
```python
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
|
||||
reader = PdfReader("filled.pdf")
|
||||
writer = PdfWriter()
|
||||
|
||||
# Add all pages
|
||||
for page in reader.pages:
|
||||
writer.add_page(page)
|
||||
|
||||
# Flatten all form fields
# NOTE(review): flatten_fields() is not available in every pypdf release —
# confirm the installed pypdf version provides it before relying on this call.
writer.flatten_fields()
|
||||
|
||||
# Save flattened PDF
|
||||
with open("flattened.pdf", "wb") as output:
|
||||
writer.write(output)
|
||||
```
|
||||
|
||||
### Using included script
|
||||
|
||||
```bash
|
||||
python scripts/flatten_form.py filled.pdf flattened.pdf
|
||||
```
|
||||
|
||||
## Error handling patterns
|
||||
|
||||
### Robust form filling
|
||||
|
||||
```python
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
from pypdf.errors import PdfReadError
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def fill_form_safe(template_path, data, output_path):
|
||||
"""Fill form with comprehensive error handling."""
|
||||
|
||||
try:
|
||||
# Validate inputs
|
||||
template = Path(template_path)
|
||||
if not template.exists():
|
||||
raise FileNotFoundError(f"Template not found: {template_path}")
|
||||
|
||||
# Read template
|
||||
logger.info(f"Reading template: {template_path}")
|
||||
reader = PdfReader(template_path)
|
||||
|
||||
if not reader.pages:
|
||||
raise ValueError("PDF has no pages")
|
||||
|
||||
# Check if form has fields
|
||||
fields = reader.get_fields()
|
||||
if not fields:
|
||||
logger.warning("PDF has no form fields")
|
||||
return False
|
||||
|
||||
# Create writer
|
||||
writer = PdfWriter()
|
||||
for page in reader.pages:
|
||||
writer.add_page(page)
|
||||
|
||||
# Validate data against schema
|
||||
missing_required = []
|
||||
invalid_fields = []
|
||||
|
||||
for field_name, field_info in fields.items():
|
||||
# Check required fields
|
||||
is_required = field_info.get("/Ff", 0) & 2 == 2
|
||||
if is_required and field_name not in data:
|
||||
missing_required.append(field_name)
|
||||
|
||||
# Check invalid field names in data
|
||||
if field_name in data:
|
||||
value = data[field_name]
|
||||
# Add type validation here if needed
|
||||
|
||||
if missing_required:
|
||||
raise ValueError(f"Missing required fields: {missing_required}")
|
||||
|
||||
# Fill fields
|
||||
logger.info("Filling form fields")
|
||||
writer.update_page_form_field_values(
|
||||
writer.pages[0],
|
||||
data
|
||||
)
|
||||
|
||||
# Write output
|
||||
logger.info(f"Writing output: {output_path}")
|
||||
with open(output_path, "wb") as output:
|
||||
writer.write(output)
|
||||
|
||||
logger.info("Form filled successfully")
|
||||
return True
|
||||
|
||||
except PdfReadError as e:
|
||||
logger.error(f"PDF read error: {e}")
|
||||
return False
|
||||
|
||||
except FileNotFoundError as e:
|
||||
logger.error(f"File error: {e}")
|
||||
return False
|
||||
|
||||
except ValueError as e:
|
||||
logger.error(f"Validation error: {e}")
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error: {e}")
|
||||
return False
|
||||
|
||||
# Usage
|
||||
success = fill_form_safe(
|
||||
"template.pdf",
|
||||
{"name": "John", "email": "john@example.com"},
|
||||
"filled.pdf"
|
||||
)
|
||||
|
||||
if not success:
|
||||
exit(1)
|
||||
```
|
||||
|
||||
## Production examples
|
||||
|
||||
### Example 1: Batch form processing
|
||||
|
||||
```python
|
||||
import json
from pathlib import Path

from fill_form_safe import fill_form_safe
|
||||
|
||||
# Process multiple submissions
|
||||
submissions_dir = Path("submissions")
|
||||
template = "application_template.pdf"
|
||||
output_dir = Path("completed")
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
for submission_file in submissions_dir.glob("*.json"):
|
||||
print(f"Processing: {submission_file.name}")
|
||||
|
||||
# Load submission data
|
||||
with open(submission_file) as f:
|
||||
data = json.load(f)
|
||||
|
||||
# Fill form
|
||||
applicant_id = data.get("id", "unknown")
|
||||
output_file = output_dir / f"application_{applicant_id}.pdf"
|
||||
|
||||
success = fill_form_safe(template, data, output_file)
|
||||
|
||||
if success:
|
||||
print(f" ✓ Completed: {output_file}")
|
||||
else:
|
||||
print(f" ✗ Failed: {submission_file.name}")
|
||||
```
|
||||
|
||||
### Example 2: Form with conditional logic
|
||||
|
||||
```python
|
||||
def prepare_form_data(raw_data):
|
||||
"""Prepare form data with conditional logic."""
|
||||
|
||||
form_data = {}
|
||||
|
||||
# Basic fields
|
||||
form_data["full_name"] = raw_data["name"]
|
||||
form_data["email"] = raw_data["email"]
|
||||
|
||||
# Conditional fields
|
||||
if raw_data.get("is_student"):
|
||||
form_data["student_id"] = raw_data["student_id"]
|
||||
form_data["school_name"] = raw_data["school"]
|
||||
else:
|
||||
form_data["employer"] = raw_data.get("employer", "")
|
||||
|
||||
# Checkbox logic
|
||||
form_data["newsletter"] = "/Yes" if raw_data.get("opt_in") else "/Off"
|
||||
|
||||
# Calculated fields
|
||||
total = sum(raw_data.get("items", []))
|
||||
form_data["total_amount"] = f"${total:.2f}"
|
||||
|
||||
return form_data
|
||||
|
||||
# Usage
|
||||
raw_input = {
|
||||
"name": "Jane Smith",
|
||||
"email": "jane@example.com",
|
||||
"is_student": True,
|
||||
"student_id": "12345",
|
||||
"school": "State University",
|
||||
"opt_in": True,
|
||||
"items": [10.00, 25.50, 15.75]
|
||||
}
|
||||
|
||||
form_data = prepare_form_data(raw_input)
|
||||
fill_form_safe("template.pdf", form_data, "output.pdf")
|
||||
```
|
||||
|
||||
## Best practices
|
||||
|
||||
1. **Always analyze before filling**: Use `analyze_form.py` to understand structure
|
||||
2. **Validate early**: Check data before attempting to fill
|
||||
3. **Use logging**: Track operations for debugging
|
||||
4. **Handle errors gracefully**: Don't crash on invalid data
|
||||
5. **Test with samples**: Verify with small datasets first
|
||||
6. **Flatten when distributing**: Make read-only for recipients
|
||||
7. **Keep templates versioned**: Track form template changes
|
||||
8. **Document field mappings**: Maintain data-to-field documentation
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Fields not filling
|
||||
|
||||
1. Check field names match exactly (case-sensitive)
|
||||
2. Verify checkbox/radio values (`/Yes`, `/On`, etc.)
|
||||
3. Ensure PDF is not encrypted or protected
|
||||
4. Check if form uses XFA format (not supported by pypdf)
|
||||
|
||||
### Encoding issues
|
||||
|
||||
```python
|
||||
# Handle special characters
|
||||
field_values["name"] = "José García" # UTF-8 encoded
|
||||
```
|
||||
|
||||
### Large batch processing
|
||||
|
||||
```python
|
||||
# Process in chunks to avoid memory issues
|
||||
chunk_size = 100
|
||||
|
||||
for i in range(0, len(submissions), chunk_size):
|
||||
chunk = submissions[i:i + chunk_size]
|
||||
process_batch(chunk)
|
||||
```
|
||||
137
skills/pdf-processing-pro/OCR.md
Normal file
137
skills/pdf-processing-pro/OCR.md
Normal file
@@ -0,0 +1,137 @@
|
||||
# PDF OCR Processing Guide
|
||||
|
||||
Extract text from scanned PDFs and image-based documents.
|
||||
|
||||
## Quick start
|
||||
|
||||
```python
|
||||
import pytesseract
|
||||
from pdf2image import convert_from_path
|
||||
from PIL import Image
|
||||
|
||||
# Convert PDF to images
|
||||
images = convert_from_path("scanned.pdf")
|
||||
|
||||
# Extract text from each page
|
||||
for i, image in enumerate(images):
|
||||
text = pytesseract.image_to_string(image)
|
||||
print(f"Page {i+1}:\n{text}\n")
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
### Install Tesseract
|
||||
|
||||
**macOS:**
|
||||
```bash
|
||||
brew install tesseract
|
||||
```
|
||||
|
||||
**Ubuntu/Debian:**
|
||||
```bash
|
||||
sudo apt-get install tesseract-ocr
|
||||
```
|
||||
|
||||
**Windows:**
|
||||
Download from: https://github.com/UB-Mannheim/tesseract/wiki
|
||||
|
||||
### Install Python packages
|
||||
|
||||
```bash
|
||||
pip install pytesseract pdf2image pillow
|
||||
```
|
||||
|
||||
## Language support
|
||||
|
||||
```python
|
||||
# English (default)
|
||||
text = pytesseract.image_to_string(image, lang="eng")
|
||||
|
||||
# Spanish
|
||||
text = pytesseract.image_to_string(image, lang="spa")
|
||||
|
||||
# Multiple languages
|
||||
text = pytesseract.image_to_string(image, lang="eng+spa+fra")
|
||||
```
|
||||
|
||||
Install additional languages:
|
||||
```bash
|
||||
# macOS
|
||||
brew install tesseract-lang
|
||||
|
||||
# Ubuntu
|
||||
sudo apt-get install tesseract-ocr-spa tesseract-ocr-fra
|
||||
```
|
||||
|
||||
## Image preprocessing
|
||||
|
||||
```python
|
||||
from PIL import Image, ImageEnhance, ImageFilter
|
||||
|
||||
def preprocess_for_ocr(image):
|
||||
"""Optimize image for better OCR accuracy."""
|
||||
|
||||
# Convert to grayscale
|
||||
image = image.convert("L")
|
||||
|
||||
# Increase contrast
|
||||
enhancer = ImageEnhance.Contrast(image)
|
||||
image = enhancer.enhance(2.0)
|
||||
|
||||
# Denoise
|
||||
image = image.filter(ImageFilter.MedianFilter())
|
||||
|
||||
# Sharpen
|
||||
image = image.filter(ImageFilter.SHARPEN)
|
||||
|
||||
return image
|
||||
|
||||
# Usage
|
||||
image = Image.open("scanned_page.png")
|
||||
processed = preprocess_for_ocr(image)
|
||||
text = pytesseract.image_to_string(processed)
|
||||
```
|
||||
|
||||
## Best practices
|
||||
|
||||
1. **Preprocess images** for better accuracy
|
||||
2. **Use appropriate language** models
|
||||
3. **Batch process** large documents
|
||||
4. **Cache results** to avoid re-processing
|
||||
5. **Validate output** - OCR is not 100% accurate
|
||||
6. **Consider confidence scores** for quality checks
|
||||
|
||||
## Production example
|
||||
|
||||
```python
|
||||
import pytesseract
|
||||
from pdf2image import convert_from_path
|
||||
from PIL import Image
|
||||
|
||||
def ocr_pdf(pdf_path, output_path):
|
||||
"""OCR PDF and save to text file."""
|
||||
|
||||
# Convert to images
|
||||
images = convert_from_path(pdf_path, dpi=300)
|
||||
|
||||
full_text = []
|
||||
|
||||
for i, image in enumerate(images, 1):
|
||||
print(f"Processing page {i}/{len(images)}")
|
||||
|
||||
# Preprocess
|
||||
processed = preprocess_for_ocr(image)
|
||||
|
||||
# OCR
|
||||
text = pytesseract.image_to_string(processed, lang="eng")
|
||||
full_text.append(f"--- Page {i} ---\n{text}\n")
|
||||
|
||||
# Save
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
f.write("\n".join(full_text))
|
||||
|
||||
print(f"Saved to {output_path}")
|
||||
|
||||
# Usage
|
||||
ocr_pdf("scanned_document.pdf", "extracted_text.txt")
|
||||
```
|
||||
296
skills/pdf-processing-pro/SKILL.md
Normal file
296
skills/pdf-processing-pro/SKILL.md
Normal file
@@ -0,0 +1,296 @@
|
||||
---
|
||||
name: PDF Processing Pro
|
||||
description: Production-ready PDF processing with forms, tables, OCR, validation, and batch operations. Use when working with complex PDF workflows in production environments, processing large volumes of PDFs, or requiring robust error handling and validation.
|
||||
---
|
||||
|
||||
# PDF Processing Pro
|
||||
|
||||
Production-ready PDF processing toolkit with pre-built scripts, comprehensive error handling, and support for complex workflows.
|
||||
|
||||
## Quick start
|
||||
|
||||
### Extract text from PDF
|
||||
|
||||
```python
|
||||
import pdfplumber
|
||||
|
||||
with pdfplumber.open("document.pdf") as pdf:
|
||||
text = pdf.pages[0].extract_text()
|
||||
print(text)
|
||||
```
|
||||
|
||||
### Analyze PDF form (using included script)
|
||||
|
||||
```bash
|
||||
python scripts/analyze_form.py input.pdf --output fields.json
|
||||
# Returns: JSON with all form fields, types, and positions
|
||||
```
|
||||
|
||||
### Fill PDF form with validation
|
||||
|
||||
```bash
|
||||
python scripts/fill_form.py input.pdf data.json output.pdf
|
||||
# Validates all fields before filling, includes error reporting
|
||||
```
|
||||
|
||||
### Extract tables from PDF
|
||||
|
||||
```bash
|
||||
python scripts/extract_tables.py report.pdf --output tables.csv
|
||||
# Extracts all tables with automatic column detection
|
||||
```
|
||||
|
||||
## Features
|
||||
|
||||
### ✅ Production-ready scripts
|
||||
|
||||
All scripts include:
|
||||
- **Error handling**: Graceful failures with detailed error messages
|
||||
- **Validation**: Input validation and type checking
|
||||
- **Logging**: Configurable logging with timestamps
|
||||
- **Type hints**: Full type annotations for IDE support
|
||||
- **CLI interface**: `--help` flag for all scripts
|
||||
- **Exit codes**: Proper exit codes for automation
|
||||
|
||||
### ✅ Comprehensive workflows
|
||||
|
||||
- **PDF Forms**: Complete form processing pipeline
|
||||
- **Table Extraction**: Advanced table detection and extraction
|
||||
- **OCR Processing**: Scanned PDF text extraction
|
||||
- **Batch Operations**: Process multiple PDFs efficiently
|
||||
- **Validation**: Pre- and post-processing validation
|
||||
|
||||
## Advanced topics
|
||||
|
||||
### PDF Form Processing
|
||||
|
||||
For complete form workflows including:
|
||||
- Field analysis and detection
|
||||
- Dynamic form filling
|
||||
- Validation rules
|
||||
- Multi-page forms
|
||||
- Checkbox and radio button handling
|
||||
|
||||
See [FORMS.md](FORMS.md)
|
||||
|
||||
### Table Extraction
|
||||
|
||||
For complex table extraction:
|
||||
- Multi-page tables
|
||||
- Merged cells
|
||||
- Nested tables
|
||||
- Custom table detection
|
||||
- Export to CSV/Excel
|
||||
|
||||
See [TABLES.md](TABLES.md)
|
||||
|
||||
### OCR Processing
|
||||
|
||||
For scanned PDFs and image-based documents:
|
||||
- Tesseract integration
|
||||
- Language support
|
||||
- Image preprocessing
|
||||
- Confidence scoring
|
||||
- Batch OCR
|
||||
|
||||
See [OCR.md](OCR.md)
|
||||
|
||||
## Included scripts
|
||||
|
||||
### Form processing
|
||||
|
||||
**analyze_form.py** - Extract form field information
|
||||
```bash
|
||||
python scripts/analyze_form.py input.pdf [--output fields.json] [--verbose]
|
||||
```
|
||||
|
||||
**fill_form.py** - Fill PDF forms with data
|
||||
```bash
|
||||
python scripts/fill_form.py input.pdf data.json output.pdf [--validate]
|
||||
```
|
||||
|
||||
**validate_form.py** - Validate form data before filling
|
||||
```bash
|
||||
python scripts/validate_form.py data.json schema.json
|
||||
```
|
||||
|
||||
### Table extraction
|
||||
|
||||
**extract_tables.py** - Extract tables to CSV/Excel
|
||||
```bash
|
||||
python scripts/extract_tables.py input.pdf [--output tables.csv] [--format csv|excel]
|
||||
```
|
||||
|
||||
### Text extraction
|
||||
|
||||
**extract_text.py** - Extract text with formatting preservation
|
||||
```bash
|
||||
python scripts/extract_text.py input.pdf [--output text.txt] [--preserve-formatting]
|
||||
```
|
||||
|
||||
### Utilities
|
||||
|
||||
**merge_pdfs.py** - Merge multiple PDFs
|
||||
```bash
|
||||
python scripts/merge_pdfs.py file1.pdf file2.pdf file3.pdf --output merged.pdf
|
||||
```
|
||||
|
||||
**split_pdf.py** - Split PDF into individual pages
|
||||
```bash
|
||||
python scripts/split_pdf.py input.pdf --output-dir pages/
|
||||
```
|
||||
|
||||
**validate_pdf.py** - Validate PDF integrity
|
||||
```bash
|
||||
python scripts/validate_pdf.py input.pdf
|
||||
```
|
||||
|
||||
## Common workflows
|
||||
|
||||
### Workflow 1: Process form submissions
|
||||
|
||||
```bash
|
||||
# 1. Analyze form structure
|
||||
python scripts/analyze_form.py template.pdf --output schema.json
|
||||
|
||||
# 2. Validate submission data
|
||||
python scripts/validate_form.py submission.json schema.json
|
||||
|
||||
# 3. Fill form
|
||||
python scripts/fill_form.py template.pdf submission.json completed.pdf
|
||||
|
||||
# 4. Validate output
|
||||
python scripts/validate_pdf.py completed.pdf
|
||||
```
|
||||
|
||||
### Workflow 2: Extract data from reports
|
||||
|
||||
```bash
|
||||
# 1. Extract tables
|
||||
python scripts/extract_tables.py monthly_report.pdf --output data.csv
|
||||
|
||||
# 2. Extract text for analysis
|
||||
python scripts/extract_text.py monthly_report.pdf --output report.txt
|
||||
```
|
||||
|
||||
### Workflow 3: Batch processing
|
||||
|
||||
```python
|
||||
import glob
|
||||
from pathlib import Path
|
||||
import subprocess
|
||||
|
||||
# Process all PDFs in directory
|
||||
for pdf_file in glob.glob("invoices/*.pdf"):
|
||||
output_file = Path("processed") / Path(pdf_file).name
|
||||
|
||||
result = subprocess.run([
|
||||
"python", "scripts/extract_text.py",
|
||||
pdf_file,
|
||||
"--output", str(output_file)
|
||||
], capture_output=True)
|
||||
|
||||
if result.returncode == 0:
|
||||
print(f"✓ Processed: {pdf_file}")
|
||||
else:
|
||||
print(f"✗ Failed: {pdf_file} - {result.stderr}")
|
||||
```
|
||||
|
||||
## Error handling
|
||||
|
||||
All scripts follow consistent error patterns:
|
||||
|
||||
```python
|
||||
# Exit codes
|
||||
# 0 - Success
|
||||
# 1 - File not found
|
||||
# 2 - Invalid input
|
||||
# 3 - Processing error
|
||||
# 4 - Validation error
|
||||
|
||||
# Example usage in automation
|
||||
result = subprocess.run(["python", "scripts/fill_form.py", ...])
|
||||
|
||||
if result.returncode == 0:
|
||||
print("Success")
|
||||
elif result.returncode == 4:
|
||||
print("Validation failed - check input data")
|
||||
else:
|
||||
print(f"Error occurred: {result.returncode}")
|
||||
```
|
||||
|
||||
## Dependencies
|
||||
|
||||
All scripts require:
|
||||
|
||||
```bash
|
||||
pip install pdfplumber pypdf pillow pytesseract pandas
|
||||
```
|
||||
|
||||
Optional for OCR:
|
||||
```bash
|
||||
# Install tesseract-ocr system package
|
||||
# macOS: brew install tesseract
|
||||
# Ubuntu: apt-get install tesseract-ocr
|
||||
# Windows: Download from GitHub releases
|
||||
```
|
||||
|
||||
## Performance tips
|
||||
|
||||
- **Use batch processing** for multiple PDFs
|
||||
- **Enable multiprocessing** with `--parallel` flag (where supported)
|
||||
- **Cache extracted data** to avoid re-processing
|
||||
- **Validate inputs early** to fail fast
|
||||
- **Use streaming** for large PDFs (>50MB)
|
||||
|
||||
## Best practices
|
||||
|
||||
1. **Always validate inputs** before processing
|
||||
2. **Use try-except** in custom scripts
|
||||
3. **Log all operations** for debugging
|
||||
4. **Test with sample PDFs** before production
|
||||
5. **Set timeouts** for long-running operations
|
||||
6. **Check exit codes** in automation
|
||||
7. **Backup originals** before modification
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common issues
|
||||
|
||||
**"Module not found" errors**:
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
**Tesseract not found**:
|
||||
```bash
|
||||
# Install tesseract system package (see Dependencies)
|
||||
```
|
||||
|
||||
**Memory errors with large PDFs**:
|
||||
```python
|
||||
# Process page by page instead of loading entire PDF
|
||||
with pdfplumber.open("large.pdf") as pdf:
|
||||
for page in pdf.pages:
|
||||
text = page.extract_text()
|
||||
# Process page immediately
|
||||
```
|
||||
|
||||
**Permission errors**:
|
||||
```bash
|
||||
chmod +x scripts/*.py
|
||||
```
|
||||
|
||||
## Getting help
|
||||
|
||||
All scripts support `--help`:
|
||||
|
||||
```bash
|
||||
python scripts/analyze_form.py --help
|
||||
python scripts/extract_tables.py --help
|
||||
```
|
||||
|
||||
For detailed documentation on specific topics, see:
|
||||
- [FORMS.md](FORMS.md) - Complete form processing guide
|
||||
- [TABLES.md](TABLES.md) - Advanced table extraction
|
||||
- [OCR.md](OCR.md) - Scanned PDF processing
|
||||
626
skills/pdf-processing-pro/TABLES.md
Normal file
626
skills/pdf-processing-pro/TABLES.md
Normal file
@@ -0,0 +1,626 @@
|
||||
# PDF Table Extraction Guide
|
||||
|
||||
Advanced table extraction strategies for production environments.
|
||||
|
||||
## Table of contents
|
||||
|
||||
- Basic table extraction
|
||||
- Multi-page tables
|
||||
- Complex table structures
|
||||
- Export formats
|
||||
- Table detection algorithms
|
||||
- Custom extraction rules
|
||||
- Performance optimization
|
||||
- Production examples
|
||||
|
||||
## Basic table extraction
|
||||
|
||||
### Using pdfplumber (recommended)
|
||||
|
||||
```python
|
||||
import pdfplumber
|
||||
|
||||
with pdfplumber.open("report.pdf") as pdf:
|
||||
page = pdf.pages[0]
|
||||
tables = page.extract_tables()
|
||||
|
||||
for i, table in enumerate(tables):
|
||||
print(f"\nTable {i + 1}:")
|
||||
for row in table:
|
||||
print(row)
|
||||
```
|
||||
|
||||
### Using included script
|
||||
|
||||
```bash
|
||||
python scripts/extract_tables.py report.pdf --output tables.csv
|
||||
```
|
||||
|
||||
Output:
|
||||
```csv
|
||||
Name,Age,City
|
||||
John Doe,30,New York
|
||||
Jane Smith,25,Los Angeles
|
||||
Bob Johnson,35,Chicago
|
||||
```
|
||||
|
||||
## Table extraction strategies
|
||||
|
||||
### Strategy 1: Automatic detection
|
||||
|
||||
Let pdfplumber auto-detect tables:
|
||||
|
||||
```python
|
||||
import pdfplumber
|
||||
|
||||
with pdfplumber.open("document.pdf") as pdf:
|
||||
for page_num, page in enumerate(pdf.pages, 1):
|
||||
tables = page.extract_tables()
|
||||
|
||||
if tables:
|
||||
print(f"Found {len(tables)} table(s) on page {page_num}")
|
||||
|
||||
for table_num, table in enumerate(tables, 1):
|
||||
print(f"\nTable {table_num}:")
|
||||
# First row is usually headers
|
||||
headers = table[0]
|
||||
print(f"Columns: {headers}")
|
||||
|
||||
# Data rows
|
||||
for row in table[1:]:
|
||||
print(row)
|
||||
```
|
||||
|
||||
### Strategy 2: Custom table settings
|
||||
|
||||
Fine-tune detection with custom settings:
|
||||
|
||||
```python
|
||||
import pdfplumber
|
||||
|
||||
table_settings = {
|
||||
"vertical_strategy": "lines", # or "text", "lines_strict"
|
||||
"horizontal_strategy": "lines",
|
||||
"explicit_vertical_lines": [],
|
||||
"explicit_horizontal_lines": [],
|
||||
"snap_tolerance": 3,
|
||||
"join_tolerance": 3,
|
||||
"edge_min_length": 3,
|
||||
"min_words_vertical": 3,
|
||||
"min_words_horizontal": 1,
|
||||
"keep_blank_chars": False,
|
||||
"text_tolerance": 3,
|
||||
"text_x_tolerance": 3,
|
||||
"text_y_tolerance": 3,
|
||||
"intersection_tolerance": 3
|
||||
}
|
||||
|
||||
with pdfplumber.open("document.pdf") as pdf:
|
||||
page = pdf.pages[0]
|
||||
tables = page.extract_tables(table_settings=table_settings)
|
||||
```
|
||||
|
||||
### Strategy 3: Explicit boundaries
|
||||
|
||||
Define table boundaries manually:
|
||||
|
||||
```python
|
||||
import pdfplumber
|
||||
|
||||
with pdfplumber.open("document.pdf") as pdf:
|
||||
page = pdf.pages[0]
|
||||
|
||||
# Define bounding box (x0, top, x1, bottom)
|
||||
bbox = (50, 100, 550, 700)
|
||||
|
||||
# Extract table within bounding box
|
||||
cropped = page.within_bbox(bbox)
|
||||
tables = cropped.extract_tables()
|
||||
```
|
||||
|
||||
## Multi-page tables
|
||||
|
||||
### Detect and merge multi-page tables
|
||||
|
||||
```python
|
||||
import pdfplumber
|
||||
|
||||
def extract_multipage_table(pdf_path, start_page=0, end_page=None):
|
||||
"""Extract table that spans multiple pages."""
|
||||
|
||||
all_rows = []
|
||||
headers = None
|
||||
|
||||
with pdfplumber.open(pdf_path) as pdf:
|
||||
pages = pdf.pages[start_page:end_page]
|
||||
|
||||
for page_num, page in enumerate(pages):
|
||||
tables = page.extract_tables()
|
||||
|
||||
if not tables:
|
||||
continue
|
||||
|
||||
# Assume first table on page
|
||||
table = tables[0]
|
||||
|
||||
if page_num == 0:
|
||||
# First page: capture headers and data
|
||||
headers = table[0]
|
||||
all_rows.extend(table[1:])
|
||||
else:
|
||||
# Subsequent pages: skip headers if they repeat
|
||||
if table[0] == headers:
|
||||
all_rows.extend(table[1:])
|
||||
else:
|
||||
all_rows.extend(table)
|
||||
|
||||
return [headers] + all_rows if headers else all_rows
|
||||
|
||||
# Usage
|
||||
table = extract_multipage_table("report.pdf", start_page=2, end_page=5)
|
||||
|
||||
print(f"Extracted {len(table) - 1} rows")
|
||||
print(f"Columns: {table[0]}")
|
||||
```
|
||||
|
||||
## Complex table structures
|
||||
|
||||
### Handling merged cells
|
||||
|
||||
```python
|
||||
import pdfplumber
|
||||
|
||||
def handle_merged_cells(table):
|
||||
"""Process table with merged cells."""
|
||||
|
||||
processed = []
|
||||
|
||||
for row in table:
|
||||
new_row = []
|
||||
last_value = None
|
||||
|
||||
for cell in row:
|
||||
if cell is None or cell == "":
|
||||
# Merged cell - use value from left
|
||||
new_row.append(last_value)
|
||||
else:
|
||||
new_row.append(cell)
|
||||
last_value = cell
|
||||
|
||||
processed.append(new_row)
|
||||
|
||||
return processed
|
||||
|
||||
# Usage
|
||||
with pdfplumber.open("document.pdf") as pdf:
|
||||
table = pdf.pages[0].extract_tables()[0]
|
||||
clean_table = handle_merged_cells(table)
|
||||
```
|
||||
|
||||
### Nested tables
|
||||
|
||||
```python
|
||||
def extract_nested_tables(page, bbox):
|
||||
"""Extract nested tables from a region."""
|
||||
|
||||
cropped = page.within_bbox(bbox)
|
||||
|
||||
# Try to detect sub-regions with tables
|
||||
tables = cropped.extract_tables()
|
||||
|
||||
result = []
|
||||
for table in tables:
|
||||
# Process each nested table
|
||||
if table:
|
||||
result.append({
|
||||
"type": "nested",
|
||||
"data": table
|
||||
})
|
||||
|
||||
return result
|
||||
```
|
||||
|
||||
### Tables with varying column counts
|
||||
|
||||
```python
|
||||
def normalize_table_columns(table):
|
||||
"""Normalize table with inconsistent column counts."""
|
||||
|
||||
if not table:
|
||||
return table
|
||||
|
||||
# Find max column count
|
||||
max_cols = max(len(row) for row in table)
|
||||
|
||||
# Pad short rows
|
||||
normalized = []
|
||||
for row in table:
|
||||
if len(row) < max_cols:
|
||||
# Pad with empty strings
|
||||
row = row + [""] * (max_cols - len(row))
|
||||
normalized.append(row)
|
||||
|
||||
return normalized
|
||||
```
|
||||
|
||||
## Export formats
|
||||
|
||||
### Export to CSV
|
||||
|
||||
```python
|
||||
import csv
|
||||
|
||||
def export_to_csv(table, output_path):
|
||||
"""Export table to CSV."""
|
||||
|
||||
with open(output_path, "w", newline="", encoding="utf-8") as f:
|
||||
writer = csv.writer(f)
|
||||
writer.writerows(table)
|
||||
|
||||
# Usage
|
||||
table = extract_table("report.pdf")
|
||||
export_to_csv(table, "output.csv")
|
||||
```
|
||||
|
||||
### Export to Excel
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
|
||||
def export_to_excel(tables, output_path):
|
||||
"""Export multiple tables to Excel with sheets."""
|
||||
|
||||
with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
|
||||
for i, table in enumerate(tables):
|
||||
if not table:
|
||||
continue
|
||||
|
||||
# Convert to DataFrame
|
||||
headers = table[0]
|
||||
data = table[1:]
|
||||
df = pd.DataFrame(data, columns=headers)
|
||||
|
||||
# Write to sheet
|
||||
sheet_name = f"Table_{i + 1}"
|
||||
df.to_excel(writer, sheet_name=sheet_name, index=False)
|
||||
|
||||
# Auto-adjust column widths
|
||||
worksheet = writer.sheets[sheet_name]
|
||||
for column in worksheet.columns:
|
||||
max_length = 0
|
||||
column_letter = column[0].column_letter
|
||||
for cell in column:
|
||||
if len(str(cell.value)) > max_length:
|
||||
max_length = len(str(cell.value))
|
||||
worksheet.column_dimensions[column_letter].width = max_length + 2
|
||||
|
||||
# Usage
|
||||
tables = extract_all_tables("report.pdf")
|
||||
export_to_excel(tables, "output.xlsx")
|
||||
```
|
||||
|
||||
### Export to JSON
|
||||
|
||||
```python
|
||||
import json
|
||||
|
||||
def export_to_json(table, output_path):
|
||||
"""Export table to JSON."""
|
||||
|
||||
if not table:
|
||||
return
|
||||
|
||||
headers = table[0]
|
||||
data = table[1:]
|
||||
|
||||
# Convert to list of dictionaries
|
||||
records = []
|
||||
for row in data:
|
||||
record = {}
|
||||
for i, header in enumerate(headers):
|
||||
value = row[i] if i < len(row) else None
|
||||
record[header] = value
|
||||
records.append(record)
|
||||
|
||||
# Save to JSON
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
json.dump(records, f, indent=2)
|
||||
|
||||
# Usage
|
||||
table = extract_table("report.pdf")
|
||||
export_to_json(table, "output.json")
|
||||
```
|
||||
|
||||
## Table detection algorithms
|
||||
|
||||
### Visual debugging
|
||||
|
||||
```python
|
||||
import pdfplumber
|
||||
|
||||
def visualize_table_detection(pdf_path, page_num=0, output_path="debug.png"):
|
||||
"""Visualize detected table structure."""
|
||||
|
||||
with pdfplumber.open(pdf_path) as pdf:
|
||||
page = pdf.pages[page_num]
|
||||
|
||||
# Draw detected table lines
|
||||
im = page.to_image(resolution=150)
|
||||
im = im.debug_tablefinder()
|
||||
im.save(output_path)
|
||||
|
||||
print(f"Saved debug image to {output_path}")
|
||||
|
||||
# Usage
|
||||
visualize_table_detection("document.pdf", page_num=0)
|
||||
```
|
||||
|
||||
### Algorithm: Line-based detection
|
||||
|
||||
Best for tables with visible borders:
|
||||
|
||||
```python
|
||||
table_settings = {
|
||||
"vertical_strategy": "lines",
|
||||
"horizontal_strategy": "lines"
|
||||
}
|
||||
|
||||
tables = page.extract_tables(table_settings=table_settings)
|
||||
```
|
||||
|
||||
### Algorithm: Text-based detection
|
||||
|
||||
Best for tables without borders:
|
||||
|
||||
```python
|
||||
table_settings = {
|
||||
"vertical_strategy": "text",
|
||||
"horizontal_strategy": "text"
|
||||
}
|
||||
|
||||
tables = page.extract_tables(table_settings=table_settings)
|
||||
```
|
||||
|
||||
### Algorithm: Explicit lines
|
||||
|
||||
For complex layouts, define lines manually:
|
||||
|
||||
```python
|
||||
# Define vertical lines at x-coordinates
|
||||
vertical_lines = [50, 150, 250, 350, 450, 550]
|
||||
|
||||
# Define horizontal lines at y-coordinates
|
||||
horizontal_lines = [100, 130, 160, 190, 220, 250]
|
||||
|
||||
table_settings = {
|
||||
"explicit_vertical_lines": vertical_lines,
|
||||
"explicit_horizontal_lines": horizontal_lines
|
||||
}
|
||||
|
||||
tables = page.extract_tables(table_settings=table_settings)
|
||||
```
|
||||
|
||||
## Custom extraction rules
|
||||
|
||||
### Rule-based extraction
|
||||
|
||||
```python
|
||||
def extract_with_rules(page, rules):
|
||||
"""Extract table using custom rules."""
|
||||
|
||||
# Rule: "Headers are bold"
|
||||
if rules.get("bold_headers"):
|
||||
chars = page.chars
|
||||
bold_chars = [c for c in chars if "Bold" in c.get("fontname", "")]
|
||||
# Use bold chars to identify header row
|
||||
pass
|
||||
|
||||
# Rule: "First column is always left-aligned"
|
||||
if rules.get("left_align_first_col"):
|
||||
# Adjust extraction to respect alignment
|
||||
pass
|
||||
|
||||
# Rule: "Currency values in last column"
|
||||
if rules.get("currency_last_col"):
|
||||
# Parse currency format
|
||||
pass
|
||||
|
||||
# Extract with adjusted settings
|
||||
return page.extract_tables()
|
||||
```
|
||||
|
||||
### Post-processing rules
|
||||
|
||||
```python
|
||||
def apply_post_processing(table, rules):
|
||||
"""Apply post-processing rules to extracted table."""
|
||||
|
||||
processed = []
|
||||
|
||||
for row in table:
|
||||
new_row = []
|
||||
|
||||
for i, cell in enumerate(row):
|
||||
value = cell
|
||||
|
||||
# Rule: Strip whitespace
|
||||
if rules.get("strip_whitespace"):
|
||||
value = value.strip() if value else value
|
||||
|
||||
# Rule: Convert currency to float
|
||||
if rules.get("parse_currency") and i == len(row) - 1:
|
||||
if value and "$" in value:
|
||||
value = float(value.replace("$", "").replace(",", ""))
|
||||
|
||||
# Rule: Parse dates
|
||||
if rules.get("parse_dates") and i == 0:
|
||||
# Convert to datetime
|
||||
pass
|
||||
|
||||
new_row.append(value)
|
||||
|
||||
processed.append(new_row)
|
||||
|
||||
return processed
|
||||
```
|
||||
|
||||
## Performance optimization
|
||||
|
||||
### Process large PDFs efficiently
|
||||
|
||||
```python
|
||||
def extract_tables_optimized(pdf_path):
|
||||
"""Extract tables with memory optimization."""
|
||||
|
||||
import gc
|
||||
|
||||
results = []
|
||||
|
||||
with pdfplumber.open(pdf_path) as pdf:
|
||||
for page_num, page in enumerate(pdf.pages):
|
||||
print(f"Processing page {page_num + 1}/{len(pdf.pages)}")
|
||||
|
||||
# Extract tables from current page
|
||||
tables = page.extract_tables()
|
||||
results.extend(tables)
|
||||
|
||||
# Force garbage collection
|
||||
gc.collect()
|
||||
|
||||
return results
|
||||
```
|
||||
|
||||
### Parallel processing
|
||||
|
||||
```python
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
import pdfplumber
|
||||
|
||||
def extract_page_tables(args):
|
||||
"""Extract tables from a single page."""
|
||||
pdf_path, page_num = args
|
||||
|
||||
with pdfplumber.open(pdf_path) as pdf:
|
||||
page = pdf.pages[page_num]
|
||||
return page.extract_tables()
|
||||
|
||||
def extract_tables_parallel(pdf_path, max_workers=4):
|
||||
"""Extract tables using multiple processes."""
|
||||
|
||||
with pdfplumber.open(pdf_path) as pdf:
|
||||
page_count = len(pdf.pages)
|
||||
|
||||
# Create tasks
|
||||
tasks = [(pdf_path, i) for i in range(page_count)]
|
||||
|
||||
# Process in parallel
|
||||
with ProcessPoolExecutor(max_workers=max_workers) as executor:
|
||||
results = list(executor.map(extract_page_tables, tasks))
|
||||
|
||||
# Flatten results
|
||||
all_tables = []
|
||||
for page_tables in results:
|
||||
all_tables.extend(page_tables)
|
||||
|
||||
return all_tables
|
||||
```
|
||||
|
||||
## Production examples
|
||||
|
||||
### Example 1: Financial report extraction
|
||||
|
||||
```python
|
||||
import pdfplumber
|
||||
import pandas as pd
|
||||
from decimal import Decimal
|
||||
|
||||
def extract_financial_tables(pdf_path):
|
||||
"""Extract financial data with proper number formatting."""
|
||||
|
||||
tables = []
|
||||
|
||||
with pdfplumber.open(pdf_path) as pdf:
|
||||
for page in pdf.pages:
|
||||
page_tables = page.extract_tables()
|
||||
|
||||
for table in page_tables:
|
||||
# Convert to DataFrame
|
||||
df = pd.DataFrame(table[1:], columns=table[0])
|
||||
|
||||
# Parse currency columns
|
||||
for col in df.columns:
|
||||
if df[col].str.contains("$", regex=False, na=False).any():
|
||||
df[col] = df[col].str.replace(r"[$,()]", "", regex=True)
|
||||
df[col] = pd.to_numeric(df[col], errors="coerce")
|
||||
|
||||
tables.append(df)
|
||||
|
||||
return tables
|
||||
```
|
||||
|
||||
### Example 2: Batch table extraction
|
||||
|
||||
```python
|
||||
import glob
|
||||
from pathlib import Path
|
||||
|
||||
def batch_extract_tables(input_dir, output_dir):
|
||||
"""Extract tables from all PDFs in directory."""
|
||||
|
||||
input_path = Path(input_dir)
|
||||
output_path = Path(output_dir)
|
||||
output_path.mkdir(exist_ok=True)
|
||||
|
||||
for pdf_file in input_path.glob("*.pdf"):
|
||||
print(f"Processing: {pdf_file.name}")
|
||||
|
||||
try:
|
||||
# Extract tables
|
||||
tables = extract_all_tables(str(pdf_file))
|
||||
|
||||
# Export to Excel
|
||||
output_file = output_path / f"{pdf_file.stem}_tables.xlsx"
|
||||
export_to_excel(tables, str(output_file))
|
||||
|
||||
print(f" ✓ Extracted {len(tables)} table(s)")
|
||||
|
||||
except Exception as e:
|
||||
print(f" ✗ Error: {e}")
|
||||
|
||||
# Usage
|
||||
batch_extract_tables("invoices/", "extracted/")
|
||||
```
|
||||
|
||||
## Best practices
|
||||
|
||||
1. **Visualize first**: Use debug mode to understand table structure
|
||||
2. **Test settings**: Try different strategies for best results
|
||||
3. **Handle errors**: PDFs vary widely in quality
|
||||
4. **Validate output**: Check extracted data makes sense
|
||||
5. **Post-process**: Clean and normalize extracted data
|
||||
6. **Use pandas**: Leverage DataFrame operations for analysis
|
||||
7. **Cache results**: Avoid re-processing large files
|
||||
8. **Monitor performance**: Profile for bottlenecks
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Tables not detected
|
||||
|
||||
1. Try different detection strategies
|
||||
2. Use visual debugging to see structure
|
||||
3. Define explicit lines manually
|
||||
4. Check if table is actually an image
|
||||
|
||||
### Incorrect cell values
|
||||
|
||||
1. Adjust snap/join tolerance
|
||||
2. Check text extraction quality
|
||||
3. Use post-processing to clean data
|
||||
4. Verify PDF is not scanned image
|
||||
|
||||
### Performance issues
|
||||
|
||||
1. Process pages individually
|
||||
2. Use parallel processing
|
||||
3. Reduce image resolution
|
||||
4. Extract only needed pages
|
||||
307
skills/pdf-processing-pro/scripts/analyze_form.py
Normal file
307
skills/pdf-processing-pro/scripts/analyze_form.py
Normal file
@@ -0,0 +1,307 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Analyze PDF form fields and structure.
|
||||
|
||||
Usage:
|
||||
python analyze_form.py input.pdf [--output fields.json] [--verbose]
|
||||
|
||||
Returns:
|
||||
JSON with all form fields, types, positions, and metadata
|
||||
|
||||
Exit codes:
|
||||
0 - Success
|
||||
1 - File not found
|
||||
2 - Invalid PDF
|
||||
3 - Processing error
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import logging
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Any
|
||||
|
||||
try:
|
||||
from pypdf import PdfReader
|
||||
except ImportError:
|
||||
print("Error: pypdf not installed. Run: pip install pypdf", file=sys.stderr)
|
||||
sys.exit(3)
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FormField:
    """Wraps a single PDF AcroForm field dictionary and exposes typed accessors.

    The wrapped dictionary is the raw per-field mapping produced by pypdf's
    ``PdfReader.get_fields()`` (keys like ``/FT``, ``/V``, ``/Ff``).
    """

    def __init__(self, name: str, field_dict: Dict[str, Any]):
        self.name = name            # fully-qualified field name
        self.raw_data = field_dict  # raw pypdf field dictionary

    @property
    def field_type(self) -> str:
        """Map the PDF /FT entry to a friendly type name."""
        ft = self.raw_data.get('/FT', '')
        type_map = {
            '/Tx': 'text',
            '/Btn': 'button',  # checkbox or radio
            '/Ch': 'choice',   # dropdown or list
            '/Sig': 'signature'
        }
        return type_map.get(ft, 'unknown')

    @property
    def value(self) -> Optional[str]:
        """Current field value (/V) as a string, or None if unset.

        Bug fix: the previous truthiness test (``str(val) if val else None``)
        dropped legitimate falsy values such as ``""`` or ``0``; an explicit
        ``is None`` check preserves them.
        """
        val = self.raw_data.get('/V')
        return None if val is None else str(val)

    @property
    def default_value(self) -> Optional[str]:
        """Default field value (/DV) as a string, or None if unset."""
        dv = self.raw_data.get('/DV')
        return None if dv is None else str(dv)

    @property
    def is_required(self) -> bool:
        """True if the Required flag (bit 2 of /Ff, mask value 2) is set."""
        flags = self.raw_data.get('/Ff', 0)
        return bool(flags & 2)

    @property
    def is_readonly(self) -> bool:
        """True if the ReadOnly flag (bit 1 of /Ff, mask value 1) is set."""
        flags = self.raw_data.get('/Ff', 0)
        return bool(flags & 1)

    @property
    def options(self) -> List[str]:
        """Options (/Opt) for choice fields; empty list for other types."""
        if self.field_type != 'choice':
            return []

        opts = self.raw_data.get('/Opt', [])
        if isinstance(opts, list):
            return [str(opt) for opt in opts]
        return []

    @property
    def max_length(self) -> Optional[int]:
        """Maximum length (/MaxLen) for text fields, if declared."""
        if self.field_type == 'text':
            return self.raw_data.get('/MaxLen')
        return None

    @property
    def rect(self) -> Optional[List[float]]:
        """Get field position and size [x0, y0, x1, y1], if present."""
        return self.raw_data.get('/Rect')

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the field to a plain JSON-friendly dictionary.

        Optional keys (value, default_value, options, max_length, position)
        are included only when the underlying data is present.
        """
        result = {
            'name': self.name,
            'type': self.field_type,
            'required': self.is_required,
            'readonly': self.is_readonly
        }

        if self.value is not None:
            result['value'] = self.value

        if self.default_value is not None:
            result['default_value'] = self.default_value

        if self.options:
            result['options'] = self.options

        if self.max_length is not None:
            result['max_length'] = self.max_length

        if self.rect:
            # /Rect is [x0, y0, x1, y1] in PDF user-space coordinates.
            result['position'] = {
                'x0': float(self.rect[0]),
                'y0': float(self.rect[1]),
                'x1': float(self.rect[2]),
                'y1': float(self.rect[3]),
                'width': float(self.rect[2] - self.rect[0]),
                'height': float(self.rect[3] - self.rect[1])
            }

        return result
|
||||
|
||||
|
||||
class PDFFormAnalyzer:
    """Extracts AcroForm field information from a PDF file.

    Validates the input path eagerly at construction time so callers get
    an immediate FileNotFoundError/ValueError for bad paths.
    """

    def __init__(self, pdf_path: str):
        self.pdf_path = Path(pdf_path)
        self.reader: Optional[PdfReader] = None
        self._validate_file()

    def _validate_file(self) -> None:
        """Reject missing paths, non-files, and non-.pdf extensions."""
        if not self.pdf_path.exists():
            logger.error(f"PDF not found: {self.pdf_path}")
            raise FileNotFoundError(f"PDF not found: {self.pdf_path}")

        if not self.pdf_path.is_file():
            logger.error(f"Not a file: {self.pdf_path}")
            raise ValueError(f"Not a file: {self.pdf_path}")

        if self.pdf_path.suffix.lower() != '.pdf':
            logger.error(f"Not a PDF file: {self.pdf_path}")
            raise ValueError(f"Not a PDF file: {self.pdf_path}")

    def analyze(self) -> Dict[str, Dict[str, Any]]:
        """Extract every form field from the PDF.

        Returns:
            Dictionary mapping field names to field information; empty if
            the PDF has no pages or no form fields.
        """
        try:
            self.reader = PdfReader(str(self.pdf_path))

            if not self.reader.pages:
                logger.warning("PDF has no pages")
                return {}

            logger.info(f"Analyzing PDF with {len(self.reader.pages)} pages")

            raw_fields = self.reader.get_fields()
            if not raw_fields:
                logger.warning("PDF has no form fields")
                return {}

            logger.info(f"Found {len(raw_fields)} form fields")

            # Process each field individually; a single malformed field is
            # logged and skipped rather than aborting the whole analysis.
            parsed: Dict[str, Dict[str, Any]] = {}
            for field_name, field_dict in raw_fields.items():
                try:
                    parsed[field_name] = FormField(field_name, field_dict).to_dict()
                except Exception as e:
                    logger.warning(f"Error processing field {field_name}: {e}")
            return parsed

        except Exception as e:
            logger.error(f"Error analyzing PDF: {e}")
            raise

    def get_summary(self) -> Dict[str, Any]:
        """Return aggregate statistics over the analyzed fields."""
        fields = self.analyze()

        type_counts: Dict[str, int] = {}
        required: List[str] = []
        readonly: List[str] = []
        with_values: List[str] = []

        for field_name, field_data in fields.items():
            # Tally per-type counts.
            ftype = field_data['type']
            type_counts[ftype] = type_counts.get(ftype, 0) + 1

            if field_data.get('required'):
                required.append(field_name)
            if field_data.get('readonly'):
                readonly.append(field_name)
            if field_data.get('value'):
                with_values.append(field_name)

        return {
            'total_fields': len(fields),
            'field_types': type_counts,
            'required_fields': required,
            'readonly_fields': readonly,
            'fields_with_values': with_values
        }
|
||||
|
||||
|
||||
def _parse_args():
    """Build the CLI argument parser and parse sys.argv."""
    parser = argparse.ArgumentParser(
        description='Analyze PDF form fields',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
  %(prog)s form.pdf
  %(prog)s form.pdf --output fields.json
  %(prog)s form.pdf --output fields.json --verbose
  %(prog)s form.pdf --summary

Exit codes:
  0 - Success
  1 - File not found
  2 - Invalid PDF
  3 - Processing error
'''
    )
    parser.add_argument('input', help='Input PDF file')
    parser.add_argument('--output', '-o', help='Output JSON file (default: stdout)')
    parser.add_argument('--summary', '-s', action='store_true', help='Show summary only')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
    return parser.parse_args()


def main():
    """Command-line entry point.

    Returns a process exit code: 0 success, 1 file not found,
    2 invalid input, 3 any other processing error.
    """
    args = _parse_args()

    # Verbose mode lowers this module's logger to DEBUG; otherwise only
    # warnings and errors are emitted.
    logger.setLevel(logging.DEBUG if args.verbose else logging.WARNING)

    try:
        analyzer = PDFFormAnalyzer(args.input)
        result = analyzer.get_summary() if args.summary else analyzer.analyze()

        json_output = json.dumps(result, indent=2)
        if args.output:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(json_output)
            logger.info(f"Saved to {args.output}")
        else:
            print(json_output)

        return 0

    except FileNotFoundError:
        logger.error(f"File not found: {args.input}")
        return 1

    except ValueError as e:
        logger.error(f"Invalid input: {e}")
        return 2

    except Exception as e:
        logger.error(f"Error: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        return 3
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Propagate main()'s integer return value as the process exit code
    # so shell callers can detect failures (see exit codes in --help).
    sys.exit(main())
|
||||
Reference in New Issue
Block a user