Initial commit

This commit is contained in:
Zhongwei Li
2025-11-29 18:32:37 +08:00
commit 7822766a14
70 changed files with 27471 additions and 0 deletions

View File

@@ -0,0 +1,610 @@
# PDF Form Processing Guide
Complete guide for processing PDF forms in production environments.
## Table of contents
- Form analysis and field detection
- Form filling workflows
- Validation strategies
- Field types and handling
- Multi-page forms
- Flattening and finalization
- Error handling patterns
- Production examples
## Form analysis
### Analyze form structure
Use `analyze_form.py` to extract complete form information:
```bash
python scripts/analyze_form.py application.pdf --output schema.json
```
Output format:
```json
{
"full_name": {
"type": "text",
"required": true,
"max_length": 100,
"x": 120.5,
"y": 450.2,
"width": 300,
"height": 20
},
"date_of_birth": {
"type": "text",
"required": true,
"format": "MM/DD/YYYY",
"x": 120.5,
"y": 400.8,
"width": 150,
"height": 20
},
"email_newsletter": {
"type": "checkbox",
"required": false,
"x": 120.5,
"y": 350.4,
"width": 15,
"height": 15
},
"preferred_contact": {
"type": "radio",
"required": true,
"options": ["email", "phone", "mail"],
"x": 120.5,
"y": 300.0,
"width": 200,
"height": 60
}
}
```
### Programmatic analysis
```python
from pypdf import PdfReader
reader = PdfReader("form.pdf")
fields = reader.get_fields()
for field_name, field_info in fields.items():
print(f"Field: {field_name}")
print(f" Type: {field_info.get('/FT')}")
print(f" Value: {field_info.get('/V')}")
print(f" Flags: {field_info.get('/Ff', 0)}")
print()
```
## Form filling workflows
### Basic workflow
```bash
# 1. Analyze form
python scripts/analyze_form.py template.pdf --output schema.json
# 2. Prepare data
cat > data.json << EOF
{
"full_name": "John Doe",
"date_of_birth": "01/15/1990",
"email": "john@example.com",
"email_newsletter": true,
"preferred_contact": "email"
}
EOF
# 3. Validate data
python scripts/validate_form.py data.json schema.json
# 4. Fill form
python scripts/fill_form.py template.pdf data.json filled.pdf
# 5. Flatten (optional - makes fields non-editable)
python scripts/flatten_form.py filled.pdf final.pdf
```
### Programmatic filling
```python
from pypdf import PdfReader, PdfWriter
reader = PdfReader("template.pdf")
writer = PdfWriter()
# Clone all pages
for page in reader.pages:
writer.add_page(page)
# Fill form fields
writer.update_page_form_field_values(
writer.pages[0],
{
"full_name": "John Doe",
"date_of_birth": "01/15/1990",
"email": "john@example.com",
"email_newsletter": "/Yes", # Checkbox value
"preferred_contact": "/email" # Radio value
}
)
# Save filled form
with open("filled.pdf", "wb") as output:
writer.write(output)
```
## Field types and handling
### Text fields
```python
# Simple text
field_values["customer_name"] = "Jane Smith"
# Formatted text (dates)
field_values["date"] = "12/25/2024"
# Numbers
field_values["amount"] = "1234.56"
# Multi-line text
field_values["comments"] = "Line 1\nLine 2\nLine 3"
```
### Checkboxes
Checkboxes typically use `/Yes` for checked, `/Off` for unchecked:
```python
# Check checkbox
field_values["agree_to_terms"] = "/Yes"
# Uncheck checkbox
field_values["newsletter_opt_out"] = "/Off"
```
**Note**: Some PDFs use different values — consult the `on_value` reported by `analyze_form.py` (JSON does not allow comments, so the value to use is carried in the `on_value` field itself):
```json
{
  "some_checkbox": {
    "type": "checkbox",
    "on_value": "/On",
    "off_value": "/Off"
  }
}
```
### Radio buttons
Radio buttons are mutually exclusive options:
```python
# Select one option from radio group
field_values["preferred_contact"] = "/email"
# Other options in same group
# field_values["preferred_contact"] = "/phone"
# field_values["preferred_contact"] = "/mail"
```
### Dropdown/List boxes
```python
# Single selection
field_values["country"] = "United States"
# List of available options in schema
"country": {
"type": "dropdown",
"options": ["United States", "Canada", "Mexico", ...]
}
```
## Validation strategies
### Schema-based validation
```python
import json
from jsonschema import validate, ValidationError
# Load schema from analyze_form.py output
with open("schema.json") as f:
schema = json.load(f)
# Load form data
with open("data.json") as f:
data = json.load(f)
# Validate all fields
errors = []
for field_name, field_schema in schema.items():
value = data.get(field_name)
# Check required fields
if field_schema.get("required") and not value:
errors.append(f"Missing required field: {field_name}")
# Check field type
if value and field_schema.get("type") == "text":
if not isinstance(value, str):
errors.append(f"Field {field_name} must be string")
# Check max length
max_length = field_schema.get("max_length")
if value and max_length and len(str(value)) > max_length:
errors.append(f"Field {field_name} exceeds max length {max_length}")
# Check format (dates, emails, etc)
format_type = field_schema.get("format")
if value and format_type:
if not validate_format(value, format_type):
errors.append(f"Field {field_name} has invalid format")
if errors:
print("Validation errors:")
for error in errors:
print(f" - {error}")
exit(1)
print("Validation passed")
```
### Format validation
```python
import re
from datetime import datetime
def validate_format(value, format_type):
    """Check *value* against a named format; unknown formats always pass."""
    # Anchored patterns for the regex-checkable formats.
    patterns = {
        "email": r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$',
        # US phone: (555) 123-4567 or 555-123-4567
        "phone": r'^\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}$',
        # XXX-XX-XXXX
        "SSN": r'^\d{3}-\d{2}-\d{4}$',
        # XXXXX or XXXXX-XXXX
        "ZIP": r'^\d{5}(-\d{4})?$',
    }
    if format_type == "MM/DD/YYYY":
        # Dates are validated by actually parsing them.
        try:
            datetime.strptime(value, "%m/%d/%Y")
        except ValueError:
            return False
        return True
    pattern = patterns.get(format_type)
    if pattern is None:
        return True  # Unknown format, skip validation
    return re.match(pattern, value) is not None
```
## Multi-page forms
### Handling multi-page forms
```python
from pypdf import PdfReader, PdfWriter
reader = PdfReader("multi_page_form.pdf")
writer = PdfWriter()
# Clone all pages
for page in reader.pages:
writer.add_page(page)
# Fill fields on page 1
writer.update_page_form_field_values(
writer.pages[0],
{
"name_page1": "John Doe",
"email_page1": "john@example.com"
}
)
# Fill fields on page 2
writer.update_page_form_field_values(
writer.pages[1],
{
"address_page2": "123 Main St",
"city_page2": "Springfield"
}
)
# Fill fields on page 3
writer.update_page_form_field_values(
writer.pages[2],
{
"signature_page3": "John Doe",
"date_page3": "12/25/2024"
}
)
with open("filled_multi_page.pdf", "wb") as output:
writer.write(output)
```
### Identifying page-specific fields
```python
# Analyze which fields are on which pages
for page_num, page in enumerate(reader.pages, 1):
fields = page.get("/Annots", [])
if fields:
print(f"\nPage {page_num} fields:")
for field_ref in fields:
field = field_ref.get_object()
field_name = field.get("/T", "Unknown")
print(f" - {field_name}")
```
## Flattening forms
### Why flatten
Flattening makes form fields non-editable, embedding values permanently:
- **Security**: Prevent modifications
- **Distribution**: Share read-only forms
- **Printing**: Ensure correct appearance
- **Archival**: Long-term storage
### Flatten with pypdf
```python
from pypdf import PdfReader, PdfWriter
reader = PdfReader("filled.pdf")
writer = PdfWriter()
# Add all pages
for page in reader.pages:
    writer.add_page(page)
# Flatten all form fields
# NOTE(review): confirm that your installed pypdf version exposes
# PdfWriter.flatten_fields() — flattening support has varied across pypdf
# releases, and older versions require removing field annotations manually.
writer.flatten_fields()
# Save flattened PDF
with open("flattened.pdf", "wb") as output:
    writer.write(output)
```
### Using included script
```bash
python scripts/flatten_form.py filled.pdf flattened.pdf
```
## Error handling patterns
### Robust form filling
```python
import logging
from pathlib import Path
from pypdf import PdfReader, PdfWriter
from pypdf.errors import PdfReadError
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def fill_form_safe(template_path, data, output_path):
    """Fill a PDF form template with *data*, returning True on success.

    Every failure mode is logged and converted to a False return instead of
    propagating, so batch callers can branch on the boolean.
    """
    try:
        # Validate inputs
        template = Path(template_path)
        if not template.exists():
            raise FileNotFoundError(f"Template not found: {template_path}")
        # Read template
        logger.info(f"Reading template: {template_path}")
        reader = PdfReader(template_path)
        if not reader.pages:
            raise ValueError("PDF has no pages")
        # Check if form has fields
        fields = reader.get_fields()
        if not fields:
            logger.warning("PDF has no form fields")
            return False
        # Create writer
        writer = PdfWriter()
        for page in reader.pages:
            writer.add_page(page)
        # Validate data against schema
        missing_required = []
        invalid_fields = []  # NOTE(review): never populated below — dead variable
        for field_name, field_info in fields.items():
            # Check required fields (bit value 2 of /Ff is the Required flag)
            is_required = field_info.get("/Ff", 0) & 2 == 2
            if is_required and field_name not in data:
                missing_required.append(field_name)
            # Check invalid field names in data
            if field_name in data:
                value = data[field_name]
                # Add type validation here if needed
        if missing_required:
            raise ValueError(f"Missing required fields: {missing_required}")
        # Fill fields
        # NOTE(review): only writer.pages[0] is filled here; fields living on
        # later pages are not updated — confirm against multi-page templates.
        logger.info("Filling form fields")
        writer.update_page_form_field_values(
            writer.pages[0],
            data
        )
        # Write output
        logger.info(f"Writing output: {output_path}")
        with open(output_path, "wb") as output:
            writer.write(output)
        logger.info("Form filled successfully")
        return True
    except PdfReadError as e:
        logger.error(f"PDF read error: {e}")
        return False
    except FileNotFoundError as e:
        logger.error(f"File error: {e}")
        return False
    except ValueError as e:
        logger.error(f"Validation error: {e}")
        return False
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        return False
# Usage
success = fill_form_safe(
"template.pdf",
{"name": "John", "email": "john@example.com"},
"filled.pdf"
)
if not success:
exit(1)
```
## Production examples
### Example 1: Batch form processing
```python
import json
import glob
from pathlib import Path
from fill_form_safe import fill_form_safe
# Process multiple submissions
submissions_dir = Path("submissions")
template = "application_template.pdf"
output_dir = Path("completed")
output_dir.mkdir(exist_ok=True)
for submission_file in submissions_dir.glob("*.json"):
print(f"Processing: {submission_file.name}")
# Load submission data
with open(submission_file) as f:
data = json.load(f)
# Fill form
applicant_id = data.get("id", "unknown")
output_file = output_dir / f"application_{applicant_id}.pdf"
success = fill_form_safe(template, data, output_file)
if success:
print(f" ✓ Completed: {output_file}")
else:
print(f" ✗ Failed: {submission_file.name}")
```
### Example 2: Form with conditional logic
```python
def prepare_form_data(raw_data):
    """Map a raw submission dict onto form field names, applying conditional logic."""
    prepared = {
        "full_name": raw_data["name"],
        "email": raw_data["email"],
    }
    # Students supply school details; everyone else an (optional) employer.
    if raw_data.get("is_student"):
        prepared["student_id"] = raw_data["student_id"]
        prepared["school_name"] = raw_data["school"]
    else:
        prepared["employer"] = raw_data.get("employer", "")
    # Checkboxes take PDF name values, not booleans.
    prepared["newsletter"] = "/Yes" if raw_data.get("opt_in") else "/Off"
    # Derived total, rendered as currency.
    prepared["total_amount"] = f"${sum(raw_data.get('items', [])):.2f}"
    return prepared
# Usage
raw_input = {
"name": "Jane Smith",
"email": "jane@example.com",
"is_student": True,
"student_id": "12345",
"school": "State University",
"opt_in": True,
"items": [10.00, 25.50, 15.75]
}
form_data = prepare_form_data(raw_input)
fill_form_safe("template.pdf", form_data, "output.pdf")
```
## Best practices
1. **Always analyze before filling**: Use `analyze_form.py` to understand structure
2. **Validate early**: Check data before attempting to fill
3. **Use logging**: Track operations for debugging
4. **Handle errors gracefully**: Don't crash on invalid data
5. **Test with samples**: Verify with small datasets first
6. **Flatten when distributing**: Make read-only for recipients
7. **Keep templates versioned**: Track form template changes
8. **Document field mappings**: Maintain data-to-field documentation
## Troubleshooting
### Fields not filling
1. Check field names match exactly (case-sensitive)
2. Verify checkbox/radio values (`/Yes`, `/On`, etc.)
3. Ensure PDF is not encrypted or protected
4. Check if form uses XFA format (not supported by pypdf)
### Encoding issues
```python
# Handle special characters
field_values["name"] = "José García" # UTF-8 encoded
```
### Large batch processing
```python
# Process in chunks to avoid memory issues
chunk_size = 100
for i in range(0, len(submissions), chunk_size):
chunk = submissions[i:i + chunk_size]
process_batch(chunk)
```

View File

@@ -0,0 +1,137 @@
# PDF OCR Processing Guide
Extract text from scanned PDFs and image-based documents.
## Quick start
```python
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
# Convert PDF to images
images = convert_from_path("scanned.pdf")
# Extract text from each page
for i, image in enumerate(images):
text = pytesseract.image_to_string(image)
print(f"Page {i+1}:\n{text}\n")
```
## Installation
### Install Tesseract
**macOS:**
```bash
brew install tesseract
```
**Ubuntu/Debian:**
```bash
sudo apt-get install tesseract-ocr
```
**Windows:**
Download from: https://github.com/UB-Mannheim/tesseract/wiki
### Install Python packages
```bash
pip install pytesseract pdf2image pillow
```
## Language support
```python
# English (default)
text = pytesseract.image_to_string(image, lang="eng")
# Spanish
text = pytesseract.image_to_string(image, lang="spa")
# Multiple languages
text = pytesseract.image_to_string(image, lang="eng+spa+fra")
```
Install additional languages:
```bash
# macOS
brew install tesseract-lang
# Ubuntu
sudo apt-get install tesseract-ocr-spa tesseract-ocr-fra
```
## Image preprocessing
```python
from PIL import Image, ImageEnhance, ImageFilter
def preprocess_for_ocr(image):
    """Return a cleaned-up grayscale copy of *image* tuned for OCR accuracy."""
    # Grayscale first: Tesseract works on luminance, not color.
    gray = image.convert("L")
    # Boost contrast so glyph edges stand out from the background.
    gray = ImageEnhance.Contrast(gray).enhance(2.0)
    # Median filter removes salt-and-pepper scanner noise.
    gray = gray.filter(ImageFilter.MedianFilter())
    # Final sharpen restores edge definition lost to denoising.
    return gray.filter(ImageFilter.SHARPEN)
# Usage
image = Image.open("scanned_page.png")
processed = preprocess_for_ocr(image)
text = pytesseract.image_to_string(processed)
```
## Best practices
1. **Preprocess images** for better accuracy
2. **Use appropriate language** models
3. **Batch process** large documents
4. **Cache results** to avoid re-processing
5. **Validate output** - OCR is not 100% accurate
6. **Consider confidence scores** for quality checks
## Production example
```python
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
def ocr_pdf(pdf_path, output_path):
    """OCR every page of *pdf_path* and write the combined text to *output_path*."""
    # Render at 300 DPI — a good accuracy/speed trade-off for Tesseract.
    pages = convert_from_path(pdf_path, dpi=300)
    chunks = []
    for page_no, page_image in enumerate(pages, 1):
        print(f"Processing page {page_no}/{len(pages)}")
        cleaned = preprocess_for_ocr(page_image)
        page_text = pytesseract.image_to_string(cleaned, lang="eng")
        chunks.append(f"--- Page {page_no} ---\n{page_text}\n")
    with open(output_path, "w", encoding="utf-8") as handle:
        handle.write("\n".join(chunks))
    print(f"Saved to {output_path}")
# Usage
ocr_pdf("scanned_document.pdf", "extracted_text.txt")
```

View File

@@ -0,0 +1,296 @@
---
name: PDF Processing Pro
description: Production-ready PDF processing with forms, tables, OCR, validation, and batch operations. Use when working with complex PDF workflows in production environments, processing large volumes of PDFs, or requiring robust error handling and validation.
---
# PDF Processing Pro
Production-ready PDF processing toolkit with pre-built scripts, comprehensive error handling, and support for complex workflows.
## Quick start
### Extract text from PDF
```python
import pdfplumber
with pdfplumber.open("document.pdf") as pdf:
text = pdf.pages[0].extract_text()
print(text)
```
### Analyze PDF form (using included script)
```bash
python scripts/analyze_form.py input.pdf --output fields.json
# Returns: JSON with all form fields, types, and positions
```
### Fill PDF form with validation
```bash
python scripts/fill_form.py input.pdf data.json output.pdf
# Validates all fields before filling, includes error reporting
```
### Extract tables from PDF
```bash
python scripts/extract_tables.py report.pdf --output tables.csv
# Extracts all tables with automatic column detection
```
## Features
### ✅ Production-ready scripts
All scripts include:
- **Error handling**: Graceful failures with detailed error messages
- **Validation**: Input validation and type checking
- **Logging**: Configurable logging with timestamps
- **Type hints**: Full type annotations for IDE support
- **CLI interface**: `--help` flag for all scripts
- **Exit codes**: Proper exit codes for automation
### ✅ Comprehensive workflows
- **PDF Forms**: Complete form processing pipeline
- **Table Extraction**: Advanced table detection and extraction
- **OCR Processing**: Scanned PDF text extraction
- **Batch Operations**: Process multiple PDFs efficiently
- **Validation**: Pre and post-processing validation
## Advanced topics
### PDF Form Processing
For complete form workflows including:
- Field analysis and detection
- Dynamic form filling
- Validation rules
- Multi-page forms
- Checkbox and radio button handling
See [FORMS.md](FORMS.md)
### Table Extraction
For complex table extraction:
- Multi-page tables
- Merged cells
- Nested tables
- Custom table detection
- Export to CSV/Excel
See [TABLES.md](TABLES.md)
### OCR Processing
For scanned PDFs and image-based documents:
- Tesseract integration
- Language support
- Image preprocessing
- Confidence scoring
- Batch OCR
See [OCR.md](OCR.md)
## Included scripts
### Form processing
**analyze_form.py** - Extract form field information
```bash
python scripts/analyze_form.py input.pdf [--output fields.json] [--verbose]
```
**fill_form.py** - Fill PDF forms with data
```bash
python scripts/fill_form.py input.pdf data.json output.pdf [--validate]
```
**validate_form.py** - Validate form data before filling
```bash
python scripts/validate_form.py data.json schema.json
```
### Table extraction
**extract_tables.py** - Extract tables to CSV/Excel
```bash
python scripts/extract_tables.py input.pdf [--output tables.csv] [--format csv|excel]
```
### Text extraction
**extract_text.py** - Extract text with formatting preservation
```bash
python scripts/extract_text.py input.pdf [--output text.txt] [--preserve-formatting]
```
### Utilities
**merge_pdfs.py** - Merge multiple PDFs
```bash
python scripts/merge_pdfs.py file1.pdf file2.pdf file3.pdf --output merged.pdf
```
**split_pdf.py** - Split PDF into individual pages
```bash
python scripts/split_pdf.py input.pdf --output-dir pages/
```
**validate_pdf.py** - Validate PDF integrity
```bash
python scripts/validate_pdf.py input.pdf
```
## Common workflows
### Workflow 1: Process form submissions
```bash
# 1. Analyze form structure
python scripts/analyze_form.py template.pdf --output schema.json
# 2. Validate submission data
python scripts/validate_form.py submission.json schema.json
# 3. Fill form
python scripts/fill_form.py template.pdf submission.json completed.pdf
# 4. Validate output
python scripts/validate_pdf.py completed.pdf
```
### Workflow 2: Extract data from reports
```bash
# 1. Extract tables
python scripts/extract_tables.py monthly_report.pdf --output data.csv
# 2. Extract text for analysis
python scripts/extract_text.py monthly_report.pdf --output report.txt
```
### Workflow 3: Batch processing
```python
import glob
from pathlib import Path
import subprocess
# Process all PDFs in directory
for pdf_file in glob.glob("invoices/*.pdf"):
output_file = Path("processed") / Path(pdf_file).name
result = subprocess.run([
"python", "scripts/extract_text.py",
pdf_file,
"--output", str(output_file)
], capture_output=True)
if result.returncode == 0:
print(f"✓ Processed: {pdf_file}")
else:
print(f"✗ Failed: {pdf_file} - {result.stderr}")
```
## Error handling
All scripts follow consistent error patterns:
```python
# Exit codes
# 0 - Success
# 1 - File not found
# 2 - Invalid input
# 3 - Processing error
# 4 - Validation error
# Example usage in automation
result = subprocess.run(["python", "scripts/fill_form.py", ...])
if result.returncode == 0:
print("Success")
elif result.returncode == 4:
print("Validation failed - check input data")
else:
print(f"Error occurred: {result.returncode}")
```
## Dependencies
All scripts require:
```bash
pip install pdfplumber pypdf pillow pytesseract pandas
```
Optional for OCR:
```bash
# Install tesseract-ocr system package
# macOS: brew install tesseract
# Ubuntu: apt-get install tesseract-ocr
# Windows: Download from GitHub releases
```
## Performance tips
- **Use batch processing** for multiple PDFs
- **Enable multiprocessing** with `--parallel` flag (where supported)
- **Cache extracted data** to avoid re-processing
- **Validate inputs early** to fail fast
- **Use streaming** for large PDFs (>50MB)
## Best practices
1. **Always validate inputs** before processing
2. **Use try-except** in custom scripts
3. **Log all operations** for debugging
4. **Test with sample PDFs** before production
5. **Set timeouts** for long-running operations
6. **Check exit codes** in automation
7. **Backup originals** before modification
## Troubleshooting
### Common issues
**"Module not found" errors**:
```bash
pip install -r requirements.txt
```
**Tesseract not found**:
```bash
# Install tesseract system package (see Dependencies)
```
**Memory errors with large PDFs**:
```python
# Process page by page instead of loading entire PDF
with pdfplumber.open("large.pdf") as pdf:
for page in pdf.pages:
text = page.extract_text()
# Process page immediately
```
**Permission errors**:
```bash
chmod +x scripts/*.py
```
## Getting help
All scripts support `--help`:
```bash
python scripts/analyze_form.py --help
python scripts/extract_tables.py --help
```
For detailed documentation on specific topics, see:
- [FORMS.md](FORMS.md) - Complete form processing guide
- [TABLES.md](TABLES.md) - Advanced table extraction
- [OCR.md](OCR.md) - Scanned PDF processing

View File

@@ -0,0 +1,626 @@
# PDF Table Extraction Guide
Advanced table extraction strategies for production environments.
## Table of contents
- Basic table extraction
- Multi-page tables
- Complex table structures
- Export formats
- Table detection algorithms
- Custom extraction rules
- Performance optimization
- Production examples
## Basic table extraction
### Using pdfplumber (recommended)
```python
import pdfplumber
with pdfplumber.open("report.pdf") as pdf:
page = pdf.pages[0]
tables = page.extract_tables()
for i, table in enumerate(tables):
print(f"\nTable {i + 1}:")
for row in table:
print(row)
```
### Using included script
```bash
python scripts/extract_tables.py report.pdf --output tables.csv
```
Output:
```csv
Name,Age,City
John Doe,30,New York
Jane Smith,25,Los Angeles
Bob Johnson,35,Chicago
```
## Table extraction strategies
### Strategy 1: Automatic detection
Let pdfplumber auto-detect tables:
```python
import pdfplumber
with pdfplumber.open("document.pdf") as pdf:
for page_num, page in enumerate(pdf.pages, 1):
tables = page.extract_tables()
if tables:
print(f"Found {len(tables)} table(s) on page {page_num}")
for table_num, table in enumerate(tables, 1):
print(f"\nTable {table_num}:")
# First row is usually headers
headers = table[0]
print(f"Columns: {headers}")
# Data rows
for row in table[1:]:
print(row)
```
### Strategy 2: Custom table settings
Fine-tune detection with custom settings:
```python
import pdfplumber
table_settings = {
"vertical_strategy": "lines", # or "text", "lines_strict"
"horizontal_strategy": "lines",
"explicit_vertical_lines": [],
"explicit_horizontal_lines": [],
"snap_tolerance": 3,
"join_tolerance": 3,
"edge_min_length": 3,
"min_words_vertical": 3,
"min_words_horizontal": 1,
"keep_blank_chars": False,
"text_tolerance": 3,
"text_x_tolerance": 3,
"text_y_tolerance": 3,
"intersection_tolerance": 3
}
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
tables = page.extract_tables(table_settings=table_settings)
```
### Strategy 3: Explicit boundaries
Define table boundaries manually:
```python
import pdfplumber
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Define bounding box (x0, top, x1, bottom)
bbox = (50, 100, 550, 700)
# Extract table within bounding box
cropped = page.within_bbox(bbox)
tables = cropped.extract_tables()
```
## Multi-page tables
### Detect and merge multi-page tables
```python
import pdfplumber
def extract_multipage_table(pdf_path, start_page=0, end_page=None):
    """Extract a table that spans multiple pages of *pdf_path*.

    The header row is captured from the first page in the range that
    actually contains a table (the original keyed this off page index 0,
    which silently lost the headers — and leaked a header row into the
    data — whenever the first page in range had no table). Repeated
    header rows on continuation pages are skipped.

    Returns [headers, row, row, ...], or just the rows when no table
    (and therefore no header) was found.
    """
    all_rows = []
    headers = None
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages[start_page:end_page]:
            tables = page.extract_tables()
            if not tables:
                continue
            # Assume the first table on the page is the one that continues.
            table = tables[0]
            if headers is None:
                # First table seen: its first row is the header row.
                headers = table[0]
                all_rows.extend(table[1:])
            elif table[0] == headers:
                # Continuation page repeats the header — skip it.
                all_rows.extend(table[1:])
            else:
                all_rows.extend(table)
    return [headers] + all_rows if headers else all_rows
# Usage
table = extract_multipage_table("report.pdf", start_page=2, end_page=5)
print(f"Extracted {len(table) - 1} rows")
print(f"Columns: {table[0]}")
```
## Complex table structures
### Handling merged cells
```python
import pdfplumber
def handle_merged_cells(table):
    """Fill empty/None cells with the nearest non-empty value to their left.

    Extraction renders a horizontally merged cell as one value followed by
    blanks; carrying the value rightward reconstructs the intended row.
    The carry resets at the start of each row.
    """
    filled = []
    for row in table:
        carried = None
        out_row = []
        for cell in row:
            if cell is None or cell == "":
                out_row.append(carried)
            else:
                carried = cell
                out_row.append(cell)
        filled.append(out_row)
    return filled
# Usage
with pdfplumber.open("document.pdf") as pdf:
table = pdf.pages[0].extract_tables()[0]
clean_table = handle_merged_cells(table)
```
### Nested tables
```python
def extract_nested_tables(page, bbox):
    """Extract tables found inside *bbox* on *page*, tagged as nested."""
    # Restrict detection to the requested region, then tag each hit.
    region = page.within_bbox(bbox)
    return [
        {"type": "nested", "data": candidate}
        for candidate in region.extract_tables()
        if candidate
    ]
```
### Tables with varying column counts
```python
def normalize_table_columns(table):
    """Pad short rows with "" so every row has the widest row's column count."""
    if not table:
        return table
    width = max(len(row) for row in table)
    # Rows already at full width pass through unchanged; short rows get a
    # fresh padded copy.
    return [
        row if len(row) >= width else row + [""] * (width - len(row))
        for row in table
    ]
```
## Export formats
### Export to CSV
```python
import csv
def export_to_csv(table, output_path):
    """Write *table* (an iterable of rows) to *output_path* as UTF-8 CSV."""
    # newline="" lets the csv module control line endings itself.
    with open(output_path, "w", newline="", encoding="utf-8") as handle:
        csv.writer(handle).writerows(table)
# Usage
table = extract_table("report.pdf")
export_to_csv(table, "output.csv")
```
### Export to Excel
```python
import pandas as pd
def export_to_excel(tables, output_path):
    """Write each non-empty table to its own sheet of an Excel workbook.

    The first row of each table is taken as the column headers, and every
    column is widened to fit its longest rendered cell.
    """
    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        for index, table in enumerate(tables):
            if not table:
                continue
            frame = pd.DataFrame(table[1:], columns=table[0])
            sheet = f"Table_{index + 1}"
            frame.to_excel(writer, sheet_name=sheet, index=False)
            # Auto-size: longest stringified cell in the column plus padding.
            worksheet = writer.sheets[sheet]
            for col_cells in worksheet.columns:
                widest = max(len(str(cell.value)) for cell in col_cells)
                letter = col_cells[0].column_letter
                worksheet.column_dimensions[letter].width = widest + 2
# Usage
tables = extract_all_tables("report.pdf")
export_to_excel(tables, "output.xlsx")
```
### Export to JSON
```python
import json
def export_to_json(table, output_path):
    """Convert *table* (header row + data rows) to a JSON array of objects.

    Rows shorter than the header are padded with null; an empty table
    writes nothing.
    """
    if not table:
        return
    headers, rows = table[0], table[1:]
    records = [
        {
            header: (row[index] if index < len(row) else None)
            for index, header in enumerate(headers)
        }
        for row in rows
    ]
    with open(output_path, "w", encoding="utf-8") as handle:
        json.dump(records, handle, indent=2)
# Usage
table = extract_table("report.pdf")
export_to_json(table, "output.json")
```
## Table detection algorithms
### Visual debugging
```python
import pdfplumber
def visualize_table_detection(pdf_path, page_num=0, output_path="debug.png"):
    """Render the table finder's detected structure for one page to an image."""
    with pdfplumber.open(pdf_path) as pdf:
        target = pdf.pages[page_num]
        # debug_tablefinder() overlays the detected lines/cells on the render.
        preview = target.to_image(resolution=150).debug_tablefinder()
        preview.save(output_path)
        print(f"Saved debug image to {output_path}")
# Usage
visualize_table_detection("document.pdf", page_num=0)
```
### Algorithm: Line-based detection
Best for tables with visible borders:
```python
table_settings = {
"vertical_strategy": "lines",
"horizontal_strategy": "lines"
}
tables = page.extract_tables(table_settings=table_settings)
```
### Algorithm: Text-based detection
Best for tables without borders:
```python
table_settings = {
"vertical_strategy": "text",
"horizontal_strategy": "text"
}
tables = page.extract_tables(table_settings=table_settings)
```
### Algorithm: Explicit lines
For complex layouts, define lines manually:
```python
# Define vertical lines at x-coordinates
vertical_lines = [50, 150, 250, 350, 450, 550]
# Define horizontal lines at y-coordinates
horizontal_lines = [100, 130, 160, 190, 220, 250]
table_settings = {
"explicit_vertical_lines": vertical_lines,
"explicit_horizontal_lines": horizontal_lines
}
tables = page.extract_tables(table_settings=table_settings)
```
## Custom extraction rules
### Rule-based extraction
```python
def extract_with_rules(page, rules):
    """Extract tables from *page*, honoring optional extraction *rules*.

    NOTE(review): every rule branch below is a placeholder (``pass``) —
    the rules document intent only; the function currently just delegates
    to page.extract_tables() with default settings.
    """
    # Rule: "Headers are bold"
    if rules.get("bold_headers"):
        chars = page.chars
        bold_chars = [c for c in chars if "Bold" in c.get("fontname", "")]
        # Use bold chars to identify header row
        pass
    # Rule: "First column is always left-aligned"
    if rules.get("left_align_first_col"):
        # Adjust extraction to respect alignment
        pass
    # Rule: "Currency values in last column"
    if rules.get("currency_last_col"):
        # Parse currency format
        pass
    # Extract with adjusted settings
    return page.extract_tables()
```
### Post-processing rules
```python
def apply_post_processing(table, rules):
    """Apply configurable clean-up rules to every cell of an extracted table.

    Supported rules: strip_whitespace (all cells), parse_currency (last
    column, "$..." strings become floats), parse_dates (first column —
    placeholder, currently a no-op).
    """
    # Hoist rule lookups out of the cell loop.
    strip_ws = rules.get("strip_whitespace")
    want_currency = rules.get("parse_currency")
    want_dates = rules.get("parse_dates")
    cleaned_rows = []
    for row in table:
        last_index = len(row) - 1
        cleaned = []
        for index, cell in enumerate(row):
            value = cell
            if strip_ws and value:
                value = value.strip()
            if want_currency and index == last_index and value and "$" in value:
                value = float(value.replace("$", "").replace(",", ""))
            if want_dates and index == 0:
                # Date conversion is a placeholder in the original workflow.
                pass
            cleaned.append(value)
        cleaned_rows.append(cleaned)
    return cleaned_rows
```
## Performance optimization
### Process large PDFs efficiently
```python
def extract_tables_optimized(pdf_path):
    """Extract all tables page-by-page, collecting garbage between pages.

    Keeps peak memory bounded on large PDFs by forcing a GC pass after
    each page is processed.
    """
    import gc
    collected = []
    with pdfplumber.open(pdf_path) as pdf:
        total = len(pdf.pages)
        for index, page in enumerate(pdf.pages):
            print(f"Processing page {index + 1}/{total}")
            collected.extend(page.extract_tables())
            # Release per-page intermediates before moving on.
            gc.collect()
    return collected
```
### Parallel processing
```python
from concurrent.futures import ProcessPoolExecutor
import pdfplumber
def extract_page_tables(args):
    """Worker: extract tables from one page of a PDF.

    Takes a single (pdf_path, page_num) tuple so it stays picklable for
    ProcessPoolExecutor.map. Each worker re-opens the PDF itself because
    pdfplumber objects cannot cross process boundaries.
    """
    pdf_path, page_index = args
    with pdfplumber.open(pdf_path) as pdf:
        return pdf.pages[page_index].extract_tables()
def extract_tables_parallel(pdf_path, max_workers=4):
    """Extract tables from all pages using a pool of worker processes.

    Each worker re-opens the PDF and processes a single page; the
    per-page results are flattened into one list of tables.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_total = len(pdf.pages)
    # One (path, page index) job per page.
    jobs = [(pdf_path, page_index) for page_index in range(page_total)]
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        per_page = list(pool.map(extract_page_tables, jobs))
    flattened = []
    for tables in per_page:
        flattened.extend(tables)
    return flattened
```
## Production examples
### Example 1: Financial report extraction
```python
import pdfplumber
import pandas as pd
from decimal import Decimal
def extract_financial_tables(pdf_path):
    """Extract financial data with proper number formatting.

    Every table on every page is converted to a DataFrame (first row is
    assumed to be the header). Columns containing a literal "$" are
    treated as currency: "$", "," and parentheses are stripped and the
    column is coerced to numeric (unparseable cells become NaN).

    Returns a list of DataFrames, one per detected table.
    """
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_tables = page.extract_tables()
            for table in page_tables:
                # Convert to DataFrame
                df = pd.DataFrame(table[1:], columns=table[0])
                # Parse currency columns
                for col in df.columns:
                    # regex=False is essential: in regex mode a bare "$"
                    # is the end-of-string anchor and matches EVERY
                    # non-null string, coercing all columns to numeric.
                    if df[col].str.contains("$", regex=False, na=False).any():
                        df[col] = df[col].str.replace(r"[$,()]", "", regex=True)
                        df[col] = pd.to_numeric(df[col], errors="coerce")
                tables.append(df)
    return tables
```
### Example 2: Batch table extraction
```python
import glob
from pathlib import Path
def batch_extract_tables(input_dir, output_dir):
    """Extract tables from every PDF in a directory tree's top level.

    Each ``*.pdf`` in ``input_dir`` is processed with extract_all_tables
    and its tables written to ``<stem>_tables.xlsx`` in ``output_dir``.
    Failures are reported per file and do not stop the batch.
    """
    source = Path(input_dir)
    target = Path(output_dir)
    target.mkdir(exist_ok=True)
    for pdf_file in source.glob("*.pdf"):
        print(f"Processing: {pdf_file.name}")
        try:
            # Extract, then export alongside the other results.
            tables = extract_all_tables(str(pdf_file))
            destination = target / f"{pdf_file.stem}_tables.xlsx"
            export_to_excel(tables, str(destination))
            print(f" ✓ Extracted {len(tables)} table(s)")
        except Exception as e:
            print(f" ✗ Error: {e}")
# Usage
batch_extract_tables("invoices/", "extracted/")
```
## Best practices
1. **Visualize first**: Use debug mode to understand table structure
2. **Test settings**: Try different strategies for best results
3. **Handle errors**: PDFs vary widely in quality
4. **Validate output**: Check extracted data makes sense
5. **Post-process**: Clean and normalize extracted data
6. **Use pandas**: Leverage DataFrame operations for analysis
7. **Cache results**: Avoid re-processing large files
8. **Monitor performance**: Profile for bottlenecks
## Troubleshooting
### Tables not detected
1. Try different detection strategies
2. Use visual debugging to see structure
3. Define explicit lines manually
4. Check if table is actually an image
### Incorrect cell values
1. Adjust snap/join tolerance
2. Check text extraction quality
3. Use post-processing to clean data
4. Verify PDF is not scanned image
### Performance issues
1. Process pages individually
2. Use parallel processing
3. Reduce image resolution
4. Extract only needed pages

View File

@@ -0,0 +1,307 @@
#!/usr/bin/env python3
"""
Analyze PDF form fields and structure.
Usage:
python analyze_form.py input.pdf [--output fields.json] [--verbose]
Returns:
JSON with all form fields, types, positions, and metadata
Exit codes:
0 - Success
1 - File not found
2 - Invalid PDF
3 - Processing error
"""
import sys
import json
import logging
import argparse
from pathlib import Path
from typing import Dict, List, Optional, Any
# pypdf is the script's only third-party dependency; when it is missing,
# fail fast with an actionable install hint and exit code 3 (processing
# error, matching the codes documented in the module docstring).
try:
    from pypdf import PdfReader
except ImportError:
    print("Error: pypdf not installed. Run: pip install pypdf", file=sys.stderr)
    sys.exit(3)
# Configure logging: timestamped messages for the whole module. The
# effective level is overridden in main() (DEBUG with --verbose,
# WARNING otherwise).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class FormField:
    """Wraps the raw dictionary pypdf returns for one AcroForm field.

    All accessors read lazily from the stored dictionary; nothing is
    computed at construction time.
    """

    def __init__(self, name: str, field_dict: Dict[str, Any]):
        self.name = name
        self.raw_data = field_dict

    @property
    def field_type(self) -> str:
        """Human-readable type derived from the /FT entry."""
        mapping = {
            '/Tx': 'text',
            '/Btn': 'button',  # checkbox or radio
            '/Ch': 'choice',   # dropdown or list
            '/Sig': 'signature',
        }
        return mapping.get(self.raw_data.get('/FT', ''), 'unknown')

    @property
    def value(self) -> Optional[str]:
        """Current value (/V) as a string, or None when unset/empty."""
        current = self.raw_data.get('/V')
        if not current:
            return None
        return str(current)

    @property
    def default_value(self) -> Optional[str]:
        """Default value (/DV) as a string, or None when unset/empty."""
        default = self.raw_data.get('/DV')
        if not default:
            return None
        return str(default)

    @property
    def is_required(self) -> bool:
        """True when the Required flag (bit value 2 of /Ff) is set."""
        return bool(self.raw_data.get('/Ff', 0) & 2)

    @property
    def is_readonly(self) -> bool:
        """True when the ReadOnly flag (bit value 1 of /Ff) is set."""
        return bool(self.raw_data.get('/Ff', 0) & 1)

    @property
    def options(self) -> List[str]:
        """Options (/Opt) for choice fields; empty for any other type."""
        if self.field_type != 'choice':
            return []
        raw_opts = self.raw_data.get('/Opt', [])
        if not isinstance(raw_opts, list):
            return []
        return [str(entry) for entry in raw_opts]

    @property
    def max_length(self) -> Optional[int]:
        """Maximum length (/MaxLen) for text fields; None otherwise."""
        if self.field_type != 'text':
            return None
        return self.raw_data.get('/MaxLen')

    @property
    def rect(self) -> Optional[List[float]]:
        """Bounding box (/Rect) as [x0, y0, x1, y1], when present."""
        return self.raw_data.get('/Rect')

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the field to a plain dict suitable for JSON output.

        Optional keys (value, default_value, options, max_length,
        position) are included only when the field actually has them.
        """
        info: Dict[str, Any] = {
            'name': self.name,
            'type': self.field_type,
            'required': self.is_required,
            'readonly': self.is_readonly,
        }
        if self.value is not None:
            info['value'] = self.value
        if self.default_value is not None:
            info['default_value'] = self.default_value
        if self.options:
            info['options'] = self.options
        if self.max_length is not None:
            info['max_length'] = self.max_length
        box = self.rect
        if box:
            x0, y0 = float(box[0]), float(box[1])
            x1, y1 = float(box[2]), float(box[3])
            info['position'] = {
                'x0': x0,
                'y0': y0,
                'x1': x1,
                'y1': y1,
                'width': x1 - x0,
                'height': y1 - y0,
            }
        return info
class PDFFormAnalyzer:
    """Extracts AcroForm field metadata from a PDF file."""

    def __init__(self, pdf_path: str):
        self.pdf_path = Path(pdf_path)
        self.reader: Optional[PdfReader] = None
        # Validate eagerly so callers fail before any parsing starts.
        self._validate_file()

    def _validate_file(self) -> None:
        """Ensure the path exists, is a regular file, and ends in .pdf.

        Raises:
            FileNotFoundError: when the path does not exist.
            ValueError: when it is not a file or lacks a .pdf suffix.
        """
        if not self.pdf_path.exists():
            logger.error(f"PDF not found: {self.pdf_path}")
            raise FileNotFoundError(f"PDF not found: {self.pdf_path}")
        if not self.pdf_path.is_file():
            logger.error(f"Not a file: {self.pdf_path}")
            raise ValueError(f"Not a file: {self.pdf_path}")
        if self.pdf_path.suffix.lower() != '.pdf':
            logger.error(f"Not a PDF file: {self.pdf_path}")
            raise ValueError(f"Not a PDF file: {self.pdf_path}")

    def analyze(self) -> Dict[str, Dict[str, Any]]:
        """
        Analyze the PDF and extract all form fields.

        Returns:
            Mapping of field name to serialized field info. Empty when
            the document has no pages or no form fields. Individual
            fields that fail to parse are skipped with a warning.
        """
        try:
            self.reader = PdfReader(str(self.pdf_path))
            if not self.reader.pages:
                logger.warning("PDF has no pages")
                return {}
            logger.info(f"Analyzing PDF with {len(self.reader.pages)} pages")
            raw_fields = self.reader.get_fields()
            if not raw_fields:
                logger.warning("PDF has no form fields")
                return {}
            logger.info(f"Found {len(raw_fields)} form fields")
            parsed: Dict[str, Dict[str, Any]] = {}
            for field_name, field_dict in raw_fields.items():
                try:
                    parsed[field_name] = FormField(field_name, field_dict).to_dict()
                except Exception as e:
                    logger.warning(f"Error processing field {field_name}: {e}")
            return parsed
        except Exception as e:
            logger.error(f"Error analyzing PDF: {e}")
            raise

    def get_summary(self) -> Dict[str, Any]:
        """Aggregate statistics over the analyzed fields.

        Note: re-runs analyze(), so the PDF is re-read on each call.
        """
        fields = self.analyze()
        summary: Dict[str, Any] = {
            'total_fields': len(fields),
            'field_types': {},
            'required_fields': [],
            'readonly_fields': [],
            'fields_with_values': [],
        }
        for field_name, field_data in fields.items():
            # Tally per-type counts.
            kind = field_data['type']
            summary['field_types'][kind] = summary['field_types'].get(kind, 0) + 1
            if field_data.get('required'):
                summary['required_fields'].append(field_name)
            if field_data.get('readonly'):
                summary['readonly_fields'].append(field_name)
            if field_data.get('value'):
                summary['fields_with_values'].append(field_name)
        return summary
def main():
    """CLI entry point: parse arguments, analyze the form, emit JSON.

    Returns the process exit code (0 success, 1 file not found,
    2 invalid input, 3 any other processing error).
    """
    parser = argparse.ArgumentParser(
        description='Analyze PDF form fields',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
  %(prog)s form.pdf
  %(prog)s form.pdf --output fields.json
  %(prog)s form.pdf --output fields.json --verbose
  %(prog)s form.pdf --summary
Exit codes:
  0 - Success
  1 - File not found
  2 - Invalid PDF
  3 - Processing error
'''
    )
    parser.add_argument('input', help='Input PDF file')
    parser.add_argument('--output', '-o', help='Output JSON file (default: stdout)')
    parser.add_argument('--summary', '-s', action='store_true', help='Show summary only')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
    args = parser.parse_args()

    # --verbose exposes debug logs; otherwise only warnings and errors.
    logger.setLevel(logging.DEBUG if args.verbose else logging.WARNING)

    try:
        analyzer = PDFFormAnalyzer(args.input)
        # Either a compact summary or the full per-field dump.
        result = analyzer.get_summary() if args.summary else analyzer.analyze()
        json_output = json.dumps(result, indent=2)
        if args.output:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(json_output)
            logger.info(f"Saved to {args.output}")
        else:
            print(json_output)
        return 0
    except FileNotFoundError:
        logger.error(f"File not found: {args.input}")
        return 1
    except ValueError as e:
        logger.error(f"Invalid input: {e}")
        return 2
    except Exception as e:
        logger.error(f"Error: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        return 3
# Run as a script: propagate main()'s return value as the exit code.
if __name__ == '__main__':
    sys.exit(main())