308 lines
8.5 KiB
Python
308 lines
8.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Analyze PDF form fields and structure.
|
|
|
|
Usage:
|
|
python analyze_form.py input.pdf [--output fields.json] [--verbose]
|
|
|
|
Returns:
|
|
JSON with all form fields, types, positions, and metadata
|
|
|
|
Exit codes:
|
|
0 - Success
|
|
1 - File not found
|
|
2 - Invalid PDF
|
|
3 - Processing error
|
|
"""
|
|
|
|
import sys
|
|
import json
|
|
import logging
|
|
import argparse
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Any
|
|
|
|
try:
|
|
from pypdf import PdfReader
|
|
except ImportError:
|
|
print("Error: pypdf not installed. Run: pip install pypdf", file=sys.stderr)
|
|
sys.exit(3)
|
|
|
|
# Logging setup: the root logger emits timestamped, level-tagged records
# at INFO and above; this module logs through its own named logger.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class FormField:
    """A single PDF form field, wrapping its raw field dictionary.

    The wrapper reads the ``/FT``, ``/V``, ``/DV``, ``/Ff``, ``/Opt``,
    ``/MaxLen`` and ``/Rect`` entries of ``field_dict`` and exposes them as
    plain Python values suitable for JSON output.
    """

    # Maps the raw ``/FT`` entry to the type label reported in the output.
    _TYPE_MAP = {
        '/Tx': 'text',
        '/Btn': 'button',  # checkbox or radio
        '/Ch': 'choice',  # dropdown or list
        '/Sig': 'signature',
    }

    # Masks applied to the ``/Ff`` field-flags integer.
    _FLAG_READONLY = 1  # bit 1
    _FLAG_REQUIRED = 2  # bit 2

    def __init__(self, name: str, field_dict: Dict[str, Any]):
        """Store the field name and its raw dictionary.

        Args:
            name: Field name used as the ``name`` entry of ``to_dict()``.
            field_dict: Mapping holding the raw ``/``-prefixed field entries.
        """
        self.name = name
        self.raw_data = field_dict

    @staticmethod
    def _as_text(raw: Any) -> Optional[str]:
        """Return ``str(raw)``, or ``None`` when the entry is absent or empty.

        Empty strings and empty containers count as "no value"; numeric
        zero and ``False`` are real values and are kept.
        """
        if raw is None:
            return None
        if not raw and not isinstance(raw, (bool, int, float)):
            return None
        return str(raw)

    @staticmethod
    def _option_text(opt: Any) -> str:
        """Return the text of one ``/Opt`` entry.

        An entry may be a plain string or an ``[export_value, display_text]``
        pair; for a pair the export value (first element) is reported
        instead of the list's repr.
        """
        if isinstance(opt, (list, tuple)) and opt:
            return str(opt[0])
        return str(opt)

    @property
    def field_type(self) -> str:
        """Type label for ``/FT``; ``'unknown'`` when missing or unmapped."""
        return self._TYPE_MAP.get(self.raw_data.get('/FT', ''), 'unknown')

    @property
    def value(self) -> Optional[str]:
        """Current value (``/V``) as a string, or ``None`` if absent/empty."""
        return self._as_text(self.raw_data.get('/V'))

    @property
    def default_value(self) -> Optional[str]:
        """Default value (``/DV``) as a string, or ``None`` if absent/empty."""
        return self._as_text(self.raw_data.get('/DV'))

    @property
    def is_required(self) -> bool:
        """Whether the required bit of ``/Ff`` is set."""
        return bool(self.raw_data.get('/Ff', 0) & self._FLAG_REQUIRED)

    @property
    def is_readonly(self) -> bool:
        """Whether the read-only bit of ``/Ff`` is set."""
        return bool(self.raw_data.get('/Ff', 0) & self._FLAG_READONLY)

    @property
    def options(self) -> List[str]:
        """Options of a choice field; empty for every other field type."""
        if self.field_type != 'choice':
            return []
        opts = self.raw_data.get('/Opt', [])
        if not isinstance(opts, list):
            return []
        return [self._option_text(opt) for opt in opts]

    @property
    def max_length(self) -> Optional[int]:
        """``/MaxLen`` of a text field, or ``None`` for other types."""
        if self.field_type == 'text':
            return self.raw_data.get('/MaxLen')
        return None

    @property
    def rect(self) -> Optional[List[float]]:
        """Raw ``/Rect`` entry ``[x0, y0, x1, y1]``, or ``None`` if absent."""
        return self.raw_data.get('/Rect')

    def _position(self) -> Optional[Dict[str, float]]:
        """Build the ``position`` entry from ``/Rect``.

        Returns ``None`` when the rectangle is missing, has fewer than four
        entries, or holds non-numeric entries, so that one malformed
        rectangle does not make the whole field unusable.
        """
        rect = self.rect
        if not rect:
            return None
        try:
            x0, y0, x1, y1 = rect[0], rect[1], rect[2], rect[3]
            return {
                'x0': float(x0),
                'y0': float(y0),
                'x1': float(x1),
                'y1': float(y1),
                'width': float(x1 - x0),
                'height': float(y1 - y0),
            }
        except (IndexError, KeyError, TypeError, ValueError):
            return None

    def to_dict(self) -> Dict[str, Any]:
        """Convert the field to a JSON-serialisable dictionary.

        ``name``, ``type``, ``required`` and ``readonly`` are always present.
        ``value``, ``default_value``, ``options``, ``max_length`` and
        ``position`` are added only when the field provides them.
        """
        result: Dict[str, Any] = {
            'name': self.name,
            'type': self.field_type,
            'required': self.is_required,
            'readonly': self.is_readonly,
        }

        # Each property is evaluated once; ``None`` means "leave the key out".
        optional_entries = (
            ('value', self.value),
            ('default_value', self.default_value),
            ('options', self.options or None),
            ('max_length', self.max_length),
            ('position', self._position()),
        )
        for key, entry in optional_entries:
            if entry is not None:
                result[key] = entry

        return result
|
|
|
|
|
|
class PDFFormAnalyzer:
    """Reads one PDF file and reports the form fields it contains."""

    def __init__(self, pdf_path: str):
        """Remember the path and validate it immediately.

        Args:
            pdf_path: Location of the PDF to analyze.

        Raises:
            FileNotFoundError: The path does not exist.
            ValueError: The path is not a regular file or lacks a
                ``.pdf`` suffix.
        """
        self.pdf_path = Path(pdf_path)
        self.reader: Optional[PdfReader] = None
        self._validate_file()

    def _validate_file(self) -> None:
        """Check that the path exists, is a file, and ends in ``.pdf``.

        Each failure is logged at error level before the exception is
        raised with the same message.
        """
        target = self.pdf_path

        if not target.exists():
            message = f"PDF not found: {target}"
            logger.error(message)
            raise FileNotFoundError(message)

        if not target.is_file():
            message = f"Not a file: {target}"
            logger.error(message)
            raise ValueError(message)

        # Suffix comparison is case-insensitive.
        if target.suffix.lower() != '.pdf':
            message = f"Not a PDF file: {target}"
            logger.error(message)
            raise ValueError(message)

    def analyze(self) -> Dict[str, Dict[str, Any]]:
        """
        Open the PDF and describe every form field in it.

        Returns:
            Dictionary keyed by field name whose values are the
            ``FormField.to_dict()`` descriptions. Empty when the PDF has
            no pages or no form fields.

        Raises:
            Exception: Any error raised while opening or reading the PDF
                is logged and re-raised unchanged.
        """
        try:
            self.reader = PdfReader(str(self.pdf_path))

            pages = self.reader.pages
            if not pages:
                logger.warning("PDF has no pages")
                return {}
            logger.info(f"Analyzing PDF with {len(pages)} pages")

            raw_fields = self.reader.get_fields()
            if not raw_fields:
                logger.warning("PDF has no form fields")
                return {}
            logger.info(f"Found {len(raw_fields)} form fields")

            extracted = {}
            for name, raw in raw_fields.items():
                # A field that cannot be converted is skipped with a
                # warning; the remaining fields are still reported.
                try:
                    extracted[name] = FormField(name, raw).to_dict()
                except Exception as exc:
                    logger.warning(f"Error processing field {name}: {exc}")
            return extracted

        except Exception as exc:
            logger.error(f"Error analyzing PDF: {exc}")
            raise

    def get_summary(self) -> Dict[str, Any]:
        """Run ``analyze()`` and condense the result into summary statistics.

        Returns:
            Dictionary with the total field count, a per-type count, and
            the names of required, read-only, and non-empty-valued fields.
        """
        fields = self.analyze()

        # Per-type tally, keyed in order of first appearance.
        type_counts: Dict[str, int] = {}
        for info in fields.values():
            kind = info['type']
            type_counts[kind] = type_counts.get(kind, 0) + 1

        def names_where(key: str) -> list:
            """Names of the fields whose ``key`` entry is truthy."""
            return [name for name, info in fields.items() if info.get(key)]

        return {
            'total_fields': len(fields),
            'field_types': type_counts,
            'required_fields': names_where('required'),
            'readonly_fields': names_where('readonly'),
            'fields_with_values': names_where('value'),
        }
|
|
|
|
|
|
def main():
    """Run the command-line interface and return the process exit code.

    Parses the arguments, analyzes the input PDF (full field listing, or
    summary with ``--summary``), and emits the result as indented JSON to
    ``--output`` or to stdout.

    Returns:
        0 on success, 1 when the input file does not exist, 2 when the
        input is rejected or cannot be parsed as a PDF, and 3 for any other
        failure, including failure to serialise or write the output.
    """
    # Imported here because only the CLI needs to tell PDF parse failures
    # apart from other errors when choosing the exit code.
    from pypdf.errors import PdfReadError

    parser = argparse.ArgumentParser(
        description='Analyze PDF form fields',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
  %(prog)s form.pdf
  %(prog)s form.pdf --output fields.json
  %(prog)s form.pdf --output fields.json --verbose
  %(prog)s form.pdf --summary

Exit codes:
  0 - Success
  1 - File not found
  2 - Invalid PDF
  3 - Processing error
        '''
    )

    parser.add_argument('input', help='Input PDF file')
    parser.add_argument('--output', '-o', help='Output JSON file (default: stdout)')
    parser.add_argument('--summary', '-s', action='store_true', help='Show summary only')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')

    args = parser.parse_args()

    # Set log level
    logger.setLevel(logging.DEBUG if args.verbose else logging.WARNING)

    def _processing_error(exc: Exception) -> int:
        """Log a generic failure (with traceback when verbose); return 3."""
        logger.error(f"Error: {exc}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        return 3

    # Stage 1: analysis. Only errors raised here concern the INPUT file, so
    # the "file not found" / "invalid input" exit codes are limited to it.
    try:
        analyzer = PDFFormAnalyzer(args.input)
        result = analyzer.get_summary() if args.summary else analyzer.analyze()
    except FileNotFoundError:
        logger.error(f"File not found: {args.input}")
        return 1
    except (ValueError, PdfReadError) as e:
        # PdfReadError is not a ValueError; without catching it explicitly
        # a corrupt PDF would exit with 3 instead of the documented 2.
        logger.error(f"Invalid input: {e}")
        return 2
    except Exception as e:
        return _processing_error(e)

    # Stage 2: output. Handled separately so that a missing output
    # directory is not reported as a missing input file.
    try:
        json_output = json.dumps(result, indent=2)

        if args.output:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(json_output)
            logger.info(f"Saved to {args.output}")
        else:
            print(json_output)
    except Exception as e:
        return _processing_error(e)

    return 0
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
|