Initial commit
This commit is contained in:
307
skills/pdf-processing-pro/scripts/analyze_form.py
Normal file
307
skills/pdf-processing-pro/scripts/analyze_form.py
Normal file
@@ -0,0 +1,307 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Analyze PDF form fields and structure.
|
||||
|
||||
Usage:
|
||||
python analyze_form.py input.pdf [--output fields.json] [--summary] [--verbose]
|
||||
|
||||
Returns:
|
||||
JSON with all form fields, types, positions, and metadata
|
||||
|
||||
Exit codes:
|
||||
0 - Success
|
||||
1 - File not found
|
||||
2 - Invalid PDF
|
||||
3 - Processing error
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import logging
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Any
|
||||
|
||||
try:
|
||||
from pypdf import PdfReader
|
||||
except ImportError:
|
||||
print("Error: pypdf not installed. Run: pip install pypdf", file=sys.stderr)
|
||||
sys.exit(3)
|
||||
|
||||
# Root logging setup: timestamped, level-tagged records, INFO and above.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Logger used by the classes and functions in this script.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FormField:
    """A single PDF form field backed by its raw field dictionary.

    The raw dictionary is keyed by PDF names (``/FT``, ``/V``, ``/DV``,
    ``/Ff``, ``/Opt``, ``/MaxLen``, ``/Rect``); the properties below turn
    those entries into plain Python values.
    """

    # Human-readable labels for the PDF ``/FT`` (field type) names.
    _TYPE_LABELS = {
        '/Tx': 'text',
        '/Btn': 'button',  # checkbox or radio
        '/Ch': 'choice',  # dropdown or list
        '/Sig': 'signature',
    }

    # Masks applied to the ``/Ff`` (field flags) integer.
    _FLAG_READONLY = 1  # flag bit position 1
    _FLAG_REQUIRED = 2  # flag bit position 2

    def __init__(self, name: str, field_dict: Dict[str, Any]):
        """Remember the field name and its raw dictionary.

        Args:
            name: Field name used as the key in the analysis output.
            field_dict: Mapping of PDF keys to values for this field.
        """
        self.name = name
        self.raw_data = field_dict

    @property
    def field_type(self) -> str:
        """Return 'text', 'button', 'choice', 'signature' or 'unknown'."""
        return self._TYPE_LABELS.get(self.raw_data.get('/FT', ''), 'unknown')

    @property
    def value(self) -> Optional[str]:
        """Return the current value (``/V``) as a string.

        A missing or falsy entry (for example an empty string) yields None.
        """
        val = self.raw_data.get('/V')
        return str(val) if val else None

    @property
    def default_value(self) -> Optional[str]:
        """Return the default value (``/DV``) as a string.

        A missing or falsy entry yields None.
        """
        dv = self.raw_data.get('/DV')
        return str(dv) if dv else None

    @property
    def is_required(self) -> bool:
        """Return True when the required flag is set in ``/Ff``."""
        flags = self.raw_data.get('/Ff', 0)
        return bool(flags & self._FLAG_REQUIRED)

    @property
    def is_readonly(self) -> bool:
        """Return True when the read-only flag is set in ``/Ff``."""
        flags = self.raw_data.get('/Ff', 0)
        return bool(flags & self._FLAG_READONLY)

    @staticmethod
    def _option_text(opt: Any) -> str:
        """Return the string form of one ``/Opt`` entry.

        An entry may be a plain string or a two-element
        ``[export_value, display_text]`` array. For the array form the
        export value is returned, since that is the form stored in ``/V``;
        stringifying the whole array would yield a Python list repr.
        """
        if isinstance(opt, (list, tuple)) and opt:
            return str(opt[0])
        return str(opt)

    @property
    def options(self) -> List[str]:
        """Return the selectable options of a choice field.

        Non-choice fields, and choice fields whose ``/Opt`` entry is not a
        list, yield an empty list.
        """
        if self.field_type != 'choice':
            return []

        opts = self.raw_data.get('/Opt', [])
        if not isinstance(opts, list):
            return []
        return [self._option_text(opt) for opt in opts]

    @property
    def max_length(self) -> Optional[int]:
        """Return ``/MaxLen`` for text fields, otherwise None."""
        if self.field_type == 'text':
            return self.raw_data.get('/MaxLen')
        return None

    @property
    def rect(self) -> Optional[List[float]]:
        """Return the raw ``/Rect`` entry ``[x0, y0, x1, y1]``, if present."""
        return self.raw_data.get('/Rect')

    def to_dict(self) -> Dict[str, Any]:
        """Convert the field to a JSON-friendly dictionary.

        ``name``, ``type``, ``required`` and ``readonly`` are always present.
        ``value``, ``default_value``, ``options``, ``max_length`` and
        ``position`` are added only when the field provides them.

        Returns:
            Dictionary describing this field.
        """
        result: Dict[str, Any] = {
            'name': self.name,
            'type': self.field_type,
            'required': self.is_required,
            'readonly': self.is_readonly,
        }

        # Read each property once; every access re-reads the raw dictionary.
        value = self.value
        if value is not None:
            result['value'] = value

        default_value = self.default_value
        if default_value is not None:
            result['default_value'] = default_value

        options = self.options
        if options:
            result['options'] = options

        max_length = self.max_length
        if max_length is not None:
            result['max_length'] = max_length

        rect = self.rect
        # A rectangle with fewer than four coordinates cannot be positioned;
        # omit 'position' instead of failing the whole field.
        if rect and len(rect) >= 4:
            x0, y0, x1, y1 = rect[0], rect[1], rect[2], rect[3]
            result['position'] = {
                'x0': float(x0),
                'y0': float(y0),
                'x1': float(x1),
                'y1': float(y1),
                'width': float(x1 - x0),
                'height': float(y1 - y0),
            }

        return result
|
||||
|
||||
|
||||
class PDFFormAnalyzer:
    """Reads a PDF file and reports the form fields it contains."""

    def __init__(self, pdf_path: str):
        """Record the path and validate it immediately.

        Args:
            pdf_path: Location of the PDF to analyze.

        Raises:
            FileNotFoundError: The path does not exist.
            ValueError: The path is not a regular file or lacks a
                ``.pdf`` suffix.
        """
        self.pdf_path = Path(pdf_path)
        self.reader: Optional[PdfReader] = None
        self._validate_file()

    def _validate_file(self) -> None:
        """Check that the path exists, is a file, and ends in ``.pdf``.

        Each failed check is logged as an error before it is raised.
        """
        path = self.pdf_path

        if not path.exists():
            message = f"PDF not found: {path}"
            logger.error(message)
            raise FileNotFoundError(message)

        if not path.is_file():
            message = f"Not a file: {path}"
            logger.error(message)
            raise ValueError(message)

        if path.suffix.lower() != '.pdf':
            message = f"Not a PDF file: {path}"
            logger.error(message)
            raise ValueError(message)

    def analyze(self) -> Dict[str, Dict[str, Any]]:
        """Open the PDF and describe every form field in it.

        Returns:
            Mapping of field name to the dictionary produced by
            ``FormField.to_dict``. Empty when the PDF has no pages or no
            form fields. Fields that fail to convert are logged and
            skipped.

        Raises:
            Exception: Any error outside per-field conversion is logged
                and re-raised unchanged.
        """
        try:
            self.reader = reader = PdfReader(str(self.pdf_path))

            pages = reader.pages
            if not pages:
                logger.warning("PDF has no pages")
                return {}

            logger.info(f"Analyzing PDF with {len(pages)} pages")

            raw_fields = reader.get_fields()
            if not raw_fields:
                logger.warning("PDF has no form fields")
                return {}

            logger.info(f"Found {len(raw_fields)} form fields")

            described: Dict[str, Dict[str, Any]] = {}
            for name, raw in raw_fields.items():
                # One malformed field must not abort the whole analysis.
                try:
                    described[name] = FormField(name, raw).to_dict()
                except Exception as exc:
                    logger.warning(f"Error processing field {name}: {exc}")

            return described

        except Exception as exc:
            logger.error(f"Error analyzing PDF: {exc}")
            raise

    def get_summary(self) -> Dict[str, Any]:
        """Run ``analyze`` and aggregate the result.

        Returns:
            Dictionary with the total field count, a per-type count, and
            the names of required fields, read-only fields, and fields
            that carry a value.
        """
        fields = self.analyze()

        type_counts: Dict[str, int] = {}
        required_names: List[str] = []
        readonly_names: List[str] = []
        valued_names: List[str] = []

        for name, info in fields.items():
            kind = info['type']
            type_counts[kind] = type_counts.get(kind, 0) + 1

            if info.get('required'):
                required_names.append(name)

            if info.get('readonly'):
                readonly_names.append(name)

            if info.get('value'):
                valued_names.append(name)

        return {
            'total_fields': len(fields),
            'field_types': type_counts,
            'required_fields': required_names,
            'readonly_fields': readonly_names,
            'fields_with_values': valued_names,
        }
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    Parses the arguments, analyzes the input PDF (full field listing, or
    summary with ``--summary``) and writes the result as JSON to stdout or
    to the ``--output`` file.

    Returns:
        Process exit code:
        0 - success;
        1 - a path does not exist (input PDF or output location);
        2 - invalid input (not a file, not a ``.pdf``, or a PDF that pypdf
            cannot parse);
        3 - any other processing error.
    """
    # Imported locally so this function brings its own dependency; the
    # file's top-level import block already exits if pypdf is missing.
    from pypdf.errors import PdfReadError

    parser = argparse.ArgumentParser(
        description='Analyze PDF form fields',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
  %(prog)s form.pdf
  %(prog)s form.pdf --output fields.json
  %(prog)s form.pdf --output fields.json --verbose
  %(prog)s form.pdf --summary

Exit codes:
  0 - Success
  1 - File not found
  2 - Invalid PDF
  3 - Processing error
        '''
    )

    parser.add_argument('input', help='Input PDF file')
    parser.add_argument('--output', '-o', help='Output JSON file (default: stdout)')
    parser.add_argument('--summary', '-s', action='store_true', help='Show summary only')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')

    args = parser.parse_args()

    # Set log level
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.WARNING)

    try:
        # Analyze form
        analyzer = PDFFormAnalyzer(args.input)

        if args.summary:
            result = analyzer.get_summary()
        else:
            result = analyzer.analyze()

        # Output
        json_output = json.dumps(result, indent=2)

        if args.output:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(json_output)
            logger.info(f"Saved to {args.output}")
        else:
            print(json_output)

        return 0

    except FileNotFoundError as e:
        # open() sets e.filename to the path that failed (the output
        # location); the analyzer raises with a message only, so fall back
        # to the input path in that case.
        missing = e.filename or args.input
        logger.error(f"File not found: {missing}")
        return 1

    except (ValueError, PdfReadError) as e:
        # PdfReadError means pypdf could not parse the file, which is the
        # documented "Invalid PDF" exit code rather than a generic failure.
        logger.error(f"Invalid input: {e}")
        return 2

    except Exception as e:
        logger.error(f"Error: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        return 3
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user