Initial commit
This commit is contained in:
@@ -0,0 +1,78 @@
|
||||
from dataclasses import dataclass
|
||||
import json
|
||||
import sys
|
||||
|
||||
|
||||
# Script to check that bounding boxes in a JSON file do not overlap or have other issues.
|
||||
# Works with any coordinate system since it only checks geometric relationships.
|
||||
|
||||
|
||||
@dataclass
|
||||
class RectAndField:
|
||||
rect: list[float]
|
||||
rect_type: str
|
||||
field: dict
|
||||
|
||||
|
||||
# Returns a list of messages that are printed to stdout for Claude to read.
|
||||
def get_bounding_box_messages(fields_json_stream) -> list[str]:
|
||||
messages = []
|
||||
fields = json.load(fields_json_stream)
|
||||
messages.append(f"Read {len(fields)} fields")
|
||||
|
||||
def rects_intersect(r1, r2):
|
||||
disjoint_horizontal = r1[0] >= r2[2] or r1[2] <= r2[0]
|
||||
disjoint_vertical = r1[1] >= r2[3] or r1[3] <= r2[1]
|
||||
return not (disjoint_horizontal or disjoint_vertical)
|
||||
|
||||
rects_and_fields = []
|
||||
for f in fields:
|
||||
# Skip empty label rects (used for fields without labels)
|
||||
label_rect = f.get('label_rect', [0, 0, 0, 0])
|
||||
if label_rect != [0, 0, 0, 0]:
|
||||
rects_and_fields.append(RectAndField(label_rect, "label", f))
|
||||
rects_and_fields.append(RectAndField(f['rect'], "entry", f))
|
||||
|
||||
has_error = False
|
||||
for i, ri in enumerate(rects_and_fields):
|
||||
# This is O(N^2); we can optimize if it becomes a problem.
|
||||
for j in range(i + 1, len(rects_and_fields)):
|
||||
rj = rects_and_fields[j]
|
||||
if ri.field['page'] == rj.field['page'] and rects_intersect(ri.rect, rj.rect):
|
||||
has_error = True
|
||||
if ri.field is rj.field:
|
||||
messages.append(f"FAILURE: intersection between label and entry bounding boxes for `{ri.field['field_id']}` ({ri.rect}, {rj.rect})")
|
||||
else:
|
||||
messages.append(f"FAILURE: intersection between {ri.rect_type} bounding box for `{ri.field['field_id']}` ({ri.rect}) and {rj.rect_type} bounding box for `{rj.field['field_id']}` ({rj.rect})")
|
||||
if len(messages) >= 20:
|
||||
messages.append("Aborting further checks; fix bounding boxes and try again")
|
||||
return messages
|
||||
if ri.rect_type == "entry":
|
||||
if "entry_text" in ri.field:
|
||||
font_size = ri.field["entry_text"].get("font_size", 14)
|
||||
entry_height = ri.rect[3] - ri.rect[1]
|
||||
if entry_height < font_size:
|
||||
has_error = True
|
||||
messages.append(f"FAILURE: entry bounding box height ({entry_height}) for `{ri.field['field_id']}` is too short for the text content (font size: {font_size}). Increase the box height or decrease the font size.")
|
||||
if len(messages) >= 20:
|
||||
messages.append("Aborting further checks; fix bounding boxes and try again")
|
||||
return messages
|
||||
|
||||
if not has_error:
|
||||
messages.append("SUCCESS: All bounding boxes are valid")
|
||||
return messages
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) != 2:
|
||||
print("Usage: check_bounding_boxes.py [fields.json or scan.json]")
|
||||
print()
|
||||
print("Examples:")
|
||||
print(" python check_bounding_boxes.py form.chatfield/form.scan.json")
|
||||
print(" python check_bounding_boxes.py form.chatfield/form.form.json")
|
||||
sys.exit(1)
|
||||
# Input file can be .scan.json (image coords) or .form.json (PDF coords)
|
||||
# The geometry checks work the same either way
|
||||
with open(sys.argv[1]) as f:
|
||||
messages = get_bounding_box_messages(f)
|
||||
for msg in messages:
|
||||
print(msg)
|
||||
@@ -0,0 +1,226 @@
|
||||
import unittest
|
||||
import json
|
||||
import io
|
||||
from check_bounding_boxes import get_bounding_box_messages
|
||||
|
||||
|
||||
# Currently this is not run automatically in CI; it's just for documentation and manual checking.
|
||||
class TestGetBoundingBoxMessages(unittest.TestCase):
|
||||
|
||||
def create_json_stream(self, data):
|
||||
"""Helper to create a JSON stream from data"""
|
||||
return io.StringIO(json.dumps(data))
|
||||
|
||||
def test_no_intersections(self):
|
||||
"""Test case with no bounding box intersections"""
|
||||
data = {
|
||||
"form_fields": [
|
||||
{
|
||||
"description": "Name",
|
||||
"page_number": 1,
|
||||
"label_bounding_box": [10, 10, 50, 30],
|
||||
"entry_bounding_box": [60, 10, 150, 30]
|
||||
},
|
||||
{
|
||||
"description": "Email",
|
||||
"page_number": 1,
|
||||
"label_bounding_box": [10, 40, 50, 60],
|
||||
"entry_bounding_box": [60, 40, 150, 60]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
stream = self.create_json_stream(data)
|
||||
messages = get_bounding_box_messages(stream)
|
||||
self.assertTrue(any("SUCCESS" in msg for msg in messages))
|
||||
self.assertFalse(any("FAILURE" in msg for msg in messages))
|
||||
|
||||
def test_label_entry_intersection_same_field(self):
|
||||
"""Test intersection between label and entry of the same field"""
|
||||
data = {
|
||||
"form_fields": [
|
||||
{
|
||||
"description": "Name",
|
||||
"page_number": 1,
|
||||
"label_bounding_box": [10, 10, 60, 30],
|
||||
"entry_bounding_box": [50, 10, 150, 30] # Overlaps with label
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
stream = self.create_json_stream(data)
|
||||
messages = get_bounding_box_messages(stream)
|
||||
self.assertTrue(any("FAILURE" in msg and "intersection" in msg for msg in messages))
|
||||
self.assertFalse(any("SUCCESS" in msg for msg in messages))
|
||||
|
||||
def test_intersection_between_different_fields(self):
|
||||
"""Test intersection between bounding boxes of different fields"""
|
||||
data = {
|
||||
"form_fields": [
|
||||
{
|
||||
"description": "Name",
|
||||
"page_number": 1,
|
||||
"label_bounding_box": [10, 10, 50, 30],
|
||||
"entry_bounding_box": [60, 10, 150, 30]
|
||||
},
|
||||
{
|
||||
"description": "Email",
|
||||
"page_number": 1,
|
||||
"label_bounding_box": [40, 20, 80, 40], # Overlaps with Name's boxes
|
||||
"entry_bounding_box": [160, 10, 250, 30]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
stream = self.create_json_stream(data)
|
||||
messages = get_bounding_box_messages(stream)
|
||||
self.assertTrue(any("FAILURE" in msg and "intersection" in msg for msg in messages))
|
||||
self.assertFalse(any("SUCCESS" in msg for msg in messages))
|
||||
|
||||
def test_different_pages_no_intersection(self):
|
||||
"""Test that boxes on different pages don't count as intersecting"""
|
||||
data = {
|
||||
"form_fields": [
|
||||
{
|
||||
"description": "Name",
|
||||
"page_number": 1,
|
||||
"label_bounding_box": [10, 10, 50, 30],
|
||||
"entry_bounding_box": [60, 10, 150, 30]
|
||||
},
|
||||
{
|
||||
"description": "Email",
|
||||
"page_number": 2,
|
||||
"label_bounding_box": [10, 10, 50, 30], # Same coordinates but different page
|
||||
"entry_bounding_box": [60, 10, 150, 30]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
stream = self.create_json_stream(data)
|
||||
messages = get_bounding_box_messages(stream)
|
||||
self.assertTrue(any("SUCCESS" in msg for msg in messages))
|
||||
self.assertFalse(any("FAILURE" in msg for msg in messages))
|
||||
|
||||
def test_entry_height_too_small(self):
|
||||
"""Test that entry box height is checked against font size"""
|
||||
data = {
|
||||
"form_fields": [
|
||||
{
|
||||
"description": "Name",
|
||||
"page_number": 1,
|
||||
"label_bounding_box": [10, 10, 50, 30],
|
||||
"entry_bounding_box": [60, 10, 150, 20], # Height is 10
|
||||
"entry_text": {
|
||||
"font_size": 14 # Font size larger than height
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
stream = self.create_json_stream(data)
|
||||
messages = get_bounding_box_messages(stream)
|
||||
self.assertTrue(any("FAILURE" in msg and "height" in msg for msg in messages))
|
||||
self.assertFalse(any("SUCCESS" in msg for msg in messages))
|
||||
|
||||
def test_entry_height_adequate(self):
|
||||
"""Test that adequate entry box height passes"""
|
||||
data = {
|
||||
"form_fields": [
|
||||
{
|
||||
"description": "Name",
|
||||
"page_number": 1,
|
||||
"label_bounding_box": [10, 10, 50, 30],
|
||||
"entry_bounding_box": [60, 10, 150, 30], # Height is 20
|
||||
"entry_text": {
|
||||
"font_size": 14 # Font size smaller than height
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
stream = self.create_json_stream(data)
|
||||
messages = get_bounding_box_messages(stream)
|
||||
self.assertTrue(any("SUCCESS" in msg for msg in messages))
|
||||
self.assertFalse(any("FAILURE" in msg for msg in messages))
|
||||
|
||||
def test_default_font_size(self):
|
||||
"""Test that default font size is used when not specified"""
|
||||
data = {
|
||||
"form_fields": [
|
||||
{
|
||||
"description": "Name",
|
||||
"page_number": 1,
|
||||
"label_bounding_box": [10, 10, 50, 30],
|
||||
"entry_bounding_box": [60, 10, 150, 20], # Height is 10
|
||||
"entry_text": {} # No font_size specified, should use default 14
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
stream = self.create_json_stream(data)
|
||||
messages = get_bounding_box_messages(stream)
|
||||
self.assertTrue(any("FAILURE" in msg and "height" in msg for msg in messages))
|
||||
self.assertFalse(any("SUCCESS" in msg for msg in messages))
|
||||
|
||||
def test_no_entry_text(self):
|
||||
"""Test that missing entry_text doesn't cause height check"""
|
||||
data = {
|
||||
"form_fields": [
|
||||
{
|
||||
"description": "Name",
|
||||
"page_number": 1,
|
||||
"label_bounding_box": [10, 10, 50, 30],
|
||||
"entry_bounding_box": [60, 10, 150, 20] # Small height but no entry_text
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
stream = self.create_json_stream(data)
|
||||
messages = get_bounding_box_messages(stream)
|
||||
self.assertTrue(any("SUCCESS" in msg for msg in messages))
|
||||
self.assertFalse(any("FAILURE" in msg for msg in messages))
|
||||
|
||||
def test_multiple_errors_limit(self):
|
||||
"""Test that error messages are limited to prevent excessive output"""
|
||||
fields = []
|
||||
# Create many overlapping fields
|
||||
for i in range(25):
|
||||
fields.append({
|
||||
"description": f"Field{i}",
|
||||
"page_number": 1,
|
||||
"label_bounding_box": [10, 10, 50, 30], # All overlap
|
||||
"entry_bounding_box": [20, 15, 60, 35] # All overlap
|
||||
})
|
||||
|
||||
data = {"form_fields": fields}
|
||||
|
||||
stream = self.create_json_stream(data)
|
||||
messages = get_bounding_box_messages(stream)
|
||||
# Should abort after ~20 messages
|
||||
self.assertTrue(any("Aborting" in msg for msg in messages))
|
||||
# Should have some FAILURE messages but not hundreds
|
||||
failure_count = sum(1 for msg in messages if "FAILURE" in msg)
|
||||
self.assertGreater(failure_count, 0)
|
||||
self.assertLess(len(messages), 30) # Should be limited
|
||||
|
||||
def test_edge_touching_boxes(self):
|
||||
"""Test that boxes touching at edges don't count as intersecting"""
|
||||
data = {
|
||||
"form_fields": [
|
||||
{
|
||||
"description": "Name",
|
||||
"page_number": 1,
|
||||
"label_bounding_box": [10, 10, 50, 30],
|
||||
"entry_bounding_box": [50, 10, 150, 30] # Touches at x=50
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
stream = self.create_json_stream(data)
|
||||
messages = get_bounding_box_messages(stream)
|
||||
self.assertTrue(any("SUCCESS" in msg for msg in messages))
|
||||
self.assertFalse(any("FAILURE" in msg for msg in messages))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -0,0 +1,12 @@
|
||||
import sys
|
||||
from pypdf import PdfReader
|
||||
|
||||
|
||||
# Script for Claude to run to determine whether a PDF has fillable form fields. See forms.md.
|
||||
|
||||
|
||||
reader = PdfReader(sys.argv[1])
|
||||
if (reader.get_fields()):
|
||||
print("This PDF has fillable form fields")
|
||||
else:
|
||||
print("This PDF does not have fillable form fields")
|
||||
179
skills/extracting-form-fields/scripts/convert_coordinates.py
Normal file
179
skills/extracting-form-fields/scripts/convert_coordinates.py
Normal file
@@ -0,0 +1,179 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Converts bounding box coordinates from image coordinates to PDF coordinates.
|
||||
|
||||
This script takes a .scan.json file (with image coordinates) and converts all
|
||||
bounding boxes to PDF coordinates, producing a .form.json file.
|
||||
|
||||
Image coordinates: Origin at top-left, Y increases downward
|
||||
PDF coordinates: Origin at bottom-left, Y increases upward
|
||||
|
||||
Usage:
|
||||
python convert_coordinates.py <scan.json> <pdf_file>
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from PIL import Image
|
||||
from pypdf import PdfReader
|
||||
|
||||
|
||||
def image_to_pdf_coords(image_bbox, image_width, image_height, pdf_width, pdf_height):
|
||||
"""
|
||||
Convert bounding box from image coordinates to PDF coordinates.
|
||||
|
||||
Args:
|
||||
image_bbox: [x1, y1, x2, y2] in image coordinates (top-left origin)
|
||||
image_width: Width of the image in pixels
|
||||
image_height: Height of the image in pixels
|
||||
pdf_width: Width of the PDF page in points
|
||||
pdf_height: Height of the PDF page in points
|
||||
|
||||
Returns:
|
||||
[x1, y1, x2, y2] in PDF coordinates (bottom-left origin)
|
||||
"""
|
||||
x_scale = pdf_width / image_width
|
||||
y_scale = pdf_height / image_height
|
||||
|
||||
# Convert X coordinates (simple scaling, same origin)
|
||||
pdf_x1 = image_bbox[0] * x_scale
|
||||
pdf_x2 = image_bbox[2] * x_scale
|
||||
|
||||
# Convert Y coordinates (flip vertical axis)
|
||||
# Image: y1 is top, y2 is bottom (y1 < y2 in image coords)
|
||||
# PDF: need to flip - what's at top of image is high Y in PDF
|
||||
pdf_y1 = (image_height - image_bbox[3]) * y_scale # Bottom in PDF (was bottom in image)
|
||||
pdf_y2 = (image_height - image_bbox[1]) * y_scale # Top in PDF (was top in image)
|
||||
|
||||
return [pdf_x1, pdf_y1, pdf_x2, pdf_y2]
|
||||
|
||||
|
||||
def get_image_dimensions(images_dir, page_number):
|
||||
"""Get dimensions of the PNG image for a specific page."""
|
||||
image_path = Path(images_dir) / f"page_{page_number}.png"
|
||||
if not image_path.exists():
|
||||
raise FileNotFoundError(f"Image not found: {image_path}")
|
||||
|
||||
with Image.open(image_path) as img:
|
||||
return img.width, img.height
|
||||
|
||||
|
||||
def convert_scan_to_form(scan_json_path, pdf_path, output_json_path):
|
||||
"""
|
||||
Convert .scan.json (image coords) to .form.json (PDF coords).
|
||||
|
||||
Args:
|
||||
scan_json_path: Path to input .scan.json file
|
||||
pdf_path: Path to the PDF file
|
||||
output_json_path: Path to output .form.json file
|
||||
"""
|
||||
# Load scan data
|
||||
with open(scan_json_path, 'r') as f:
|
||||
fields = json.load(f)
|
||||
|
||||
# Get PDF dimensions
|
||||
reader = PdfReader(pdf_path)
|
||||
|
||||
# Determine images directory (same directory as scan.json)
|
||||
scan_path = Path(scan_json_path)
|
||||
images_dir = scan_path.parent
|
||||
|
||||
if not images_dir.exists():
|
||||
raise FileNotFoundError(
|
||||
f"Images directory not found: {images_dir}\n"
|
||||
f"Expected to find page images in {images_dir}"
|
||||
)
|
||||
|
||||
# Convert each field
|
||||
converted_fields = []
|
||||
|
||||
for field in fields:
|
||||
page_num = field.get('page', 1)
|
||||
|
||||
# Get dimensions for this page
|
||||
page = reader.pages[page_num - 1] # Convert to 0-indexed
|
||||
pdf_width = float(page.mediabox.width)
|
||||
pdf_height = float(page.mediabox.height)
|
||||
image_width, image_height = get_image_dimensions(images_dir, page_num)
|
||||
|
||||
# Create converted field
|
||||
converted_field = field.copy()
|
||||
|
||||
# Convert main rect
|
||||
if 'rect' in field:
|
||||
converted_field['rect'] = image_to_pdf_coords(
|
||||
field['rect'],
|
||||
image_width, image_height,
|
||||
pdf_width, pdf_height
|
||||
)
|
||||
|
||||
# Convert label_rect if present
|
||||
if 'label_rect' in field:
|
||||
converted_field['label_rect'] = image_to_pdf_coords(
|
||||
field['label_rect'],
|
||||
image_width, image_height,
|
||||
pdf_width, pdf_height
|
||||
)
|
||||
|
||||
# Convert radio button options if present
|
||||
if 'radio_options' in field:
|
||||
converted_options = []
|
||||
for option in field['radio_options']:
|
||||
converted_option = option.copy()
|
||||
if 'rect' in option:
|
||||
converted_option['rect'] = image_to_pdf_coords(
|
||||
option['rect'],
|
||||
image_width, image_height,
|
||||
pdf_width, pdf_height
|
||||
)
|
||||
converted_options.append(converted_option)
|
||||
converted_field['radio_options'] = converted_options
|
||||
|
||||
converted_fields.append(converted_field)
|
||||
|
||||
# Write output
|
||||
with open(output_json_path, 'w') as f:
|
||||
json.dump(converted_fields, f, indent=2)
|
||||
|
||||
print(f"Converted {len(converted_fields)} fields from image to PDF coordinates")
|
||||
print(f"Input: {scan_json_path}")
|
||||
print(f"Output: {output_json_path}")
|
||||
|
||||
# Show an example conversion
|
||||
if converted_fields:
|
||||
print("\nExample conversion (first field):")
|
||||
orig = fields[0]
|
||||
conv = converted_fields[0]
|
||||
print(f" Field: {orig.get('field_id')}")
|
||||
print(f" Image rect: {orig.get('rect')}")
|
||||
print(f" PDF rect: {conv.get('rect')}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) != 3:
|
||||
print("Usage: convert_coordinates.py <scan.json> <pdf_file>")
|
||||
print()
|
||||
print("Example:")
|
||||
print(" python convert_coordinates.py my_form.chatfield/my_form.scan.json my_form.pdf")
|
||||
print()
|
||||
print("Output filename is automatically computed by replacing .scan.json with .form.json")
|
||||
sys.exit(1)
|
||||
|
||||
scan_json_path = sys.argv[1]
|
||||
pdf_path = sys.argv[2]
|
||||
|
||||
# Compute output filename by replacing .scan.json with .form.json
|
||||
scan_path = Path(scan_json_path)
|
||||
if not scan_path.name.endswith('.scan.json'):
|
||||
print(f"Error: Input file must end with .scan.json, got: {scan_path.name}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
output_json_path = str(scan_path.parent / scan_path.name.replace('.scan.json', '.form.json'))
|
||||
|
||||
try:
|
||||
convert_scan_to_form(scan_json_path, pdf_path, output_json_path)
|
||||
except Exception as e:
|
||||
print(f"Error: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
@@ -0,0 +1,35 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pdf2image import convert_from_path
|
||||
|
||||
|
||||
# Converts each page of a PDF to a PNG image.
|
||||
|
||||
|
||||
def convert(pdf_path, output_dir, max_dim=1000):
|
||||
images = convert_from_path(pdf_path, dpi=200)
|
||||
|
||||
for i, image in enumerate(images):
|
||||
# Scale image if needed to keep width/height under `max_dim`
|
||||
width, height = image.size
|
||||
if width > max_dim or height > max_dim:
|
||||
scale_factor = min(max_dim / width, max_dim / height)
|
||||
new_width = int(width * scale_factor)
|
||||
new_height = int(height * scale_factor)
|
||||
image = image.resize((new_width, new_height))
|
||||
|
||||
image_path = os.path.join(output_dir, f"page_{i+1}.png")
|
||||
image.save(image_path)
|
||||
print(f"Saved page {i+1} as {image_path} (size: {image.size})")
|
||||
|
||||
print(f"Converted {len(images)} pages to PNG images")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) != 3:
|
||||
print("Usage: convert_pdf_to_images.py [input pdf] [output directory]")
|
||||
sys.exit(1)
|
||||
pdf_path = sys.argv[1]
|
||||
output_directory = sys.argv[2]
|
||||
convert(pdf_path, output_directory)
|
||||
@@ -0,0 +1,59 @@
|
||||
import json
|
||||
import sys
|
||||
|
||||
from PIL import Image, ImageDraw
|
||||
|
||||
|
||||
# Creates "validation" images with rectangles for the bounding box information that
|
||||
# Claude creates when determining where to add text annotations in PDFs.
|
||||
# This version works with IMAGE coordinates (from .scan.json files).
|
||||
|
||||
|
||||
def create_validation_image(page_number, fields_json_path, input_path, output_path):
|
||||
"""
|
||||
Create a validation image with bounding boxes overlaid.
|
||||
|
||||
Args:
|
||||
page_number: Page number (1-indexed)
|
||||
fields_json_path: Path to .scan.json file (IMAGE coordinates)
|
||||
input_path: Path to input PNG image
|
||||
output_path: Path to output validation image
|
||||
"""
|
||||
# Input file should be in the .scan.json format with IMAGE coordinates
|
||||
with open(fields_json_path, 'r') as f:
|
||||
fields = json.load(f)
|
||||
|
||||
img = Image.open(input_path)
|
||||
draw = ImageDraw.Draw(img)
|
||||
num_boxes = 0
|
||||
|
||||
for field in fields:
|
||||
if field['page'] == page_number:
|
||||
# Coordinates are already in image space - use them directly!
|
||||
entry_box_img = field['rect']
|
||||
label_box_img = field.get('label_rect', [0, 0, 0, 0])
|
||||
|
||||
# Draw red rectangle over entry bounding box
|
||||
draw.rectangle(entry_box_img, outline='red', width=2)
|
||||
num_boxes += 1
|
||||
|
||||
if label_box_img != [0, 0, 0, 0]:
|
||||
draw.rectangle(label_box_img, outline='blue', width=2)
|
||||
num_boxes += 1
|
||||
|
||||
img.save(output_path)
|
||||
print(f"Created validation image at {output_path} with {num_boxes} bounding boxes")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) != 5:
|
||||
print("Usage: create_validation_image.py [page number] [scan.json file] [input image path] [output image path]")
|
||||
print()
|
||||
print("Example:")
|
||||
print(" python create_validation_image.py 1 form.chatfield/form.scan.json form.chatfield/page_1.png form.chatfield/page_1_validation.png")
|
||||
sys.exit(1)
|
||||
page_number = int(sys.argv[1])
|
||||
fields_json_path = sys.argv[2]
|
||||
input_image_path = sys.argv[3]
|
||||
output_image_path = sys.argv[4]
|
||||
create_validation_image(page_number, fields_json_path, input_image_path, output_image_path)
|
||||
158
skills/extracting-form-fields/scripts/extract_form_field_info.py
Normal file
158
skills/extracting-form-fields/scripts/extract_form_field_info.py
Normal file
@@ -0,0 +1,158 @@
|
||||
import json
|
||||
import sys
|
||||
|
||||
from pypdf import PdfReader
|
||||
|
||||
|
||||
# Extracts data for the fillable form fields in a PDF and outputs JSON that
|
||||
# Claude uses to fill the fields. See forms.md.
|
||||
|
||||
|
||||
# This matches the format used by PdfReader `get_fields` and `update_page_form_field_values` methods.
|
||||
def get_full_annotation_field_id(annotation):
|
||||
components = []
|
||||
while annotation:
|
||||
field_name = annotation.get('/T')
|
||||
if field_name:
|
||||
components.append(field_name)
|
||||
annotation = annotation.get('/Parent')
|
||||
return ".".join(reversed(components)) if components else None
|
||||
|
||||
|
||||
def make_field_dict(field, field_id):
|
||||
field_dict = {"field_id": field_id}
|
||||
ft = field.get('/FT')
|
||||
if ft == "/Tx":
|
||||
field_dict["type"] = "text"
|
||||
elif ft == "/Btn":
|
||||
field_dict["type"] = "checkbox" # radio groups handled separately
|
||||
states = field.get("/_States_", [])
|
||||
if len(states) == 2:
|
||||
# "/Off" seems to always be the unchecked value, as suggested by
|
||||
# https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf#page=448
|
||||
# It can be either first or second in the "/_States_" list.
|
||||
if "/Off" in states:
|
||||
field_dict["checked_value"] = states[0] if states[0] != "/Off" else states[1]
|
||||
field_dict["unchecked_value"] = "/Off"
|
||||
else:
|
||||
print(f"Unexpected state values for checkbox `${field_id}`. Its checked and unchecked values may not be correct; if you're trying to check it, visually verify the results.")
|
||||
field_dict["checked_value"] = states[0]
|
||||
field_dict["unchecked_value"] = states[1]
|
||||
elif ft == "/Ch":
|
||||
field_dict["type"] = "choice"
|
||||
states = field.get("/_States_", [])
|
||||
field_dict["choice_options"] = [{
|
||||
"value": state[0],
|
||||
"text": state[1],
|
||||
} for state in states]
|
||||
else:
|
||||
field_dict["type"] = f"unknown ({ft})"
|
||||
|
||||
# Extract tooltip (TU = tooltip/user-facing text)
|
||||
tooltip = field.get('/TU')
|
||||
if tooltip:
|
||||
field_dict["tooltip"] = tooltip
|
||||
|
||||
return field_dict
|
||||
|
||||
|
||||
# Returns a list of fillable PDF fields:
|
||||
# [
|
||||
# {
|
||||
# "field_id": "name",
|
||||
# "page": 1,
|
||||
# "type": ("text", "checkbox", "radio_group", or "choice")
|
||||
# // Per-type additional fields described in forms.md
|
||||
# },
|
||||
# ]
|
||||
def get_field_info(reader: PdfReader):
|
||||
fields = reader.get_fields()
|
||||
|
||||
field_info_by_id = {}
|
||||
possible_radio_names = set()
|
||||
|
||||
for field_id, field in fields.items():
|
||||
# Skip if this is a container field with children, except that it might be
|
||||
# a parent group for radio button options.
|
||||
if field.get("/Kids"):
|
||||
if field.get("/FT") == "/Btn":
|
||||
possible_radio_names.add(field_id)
|
||||
continue
|
||||
field_info_by_id[field_id] = make_field_dict(field, field_id)
|
||||
|
||||
# Bounding rects are stored in annotations in page objects.
|
||||
|
||||
# Radio button options have a separate annotation for each choice;
|
||||
# all choices have the same field name.
|
||||
# See https://westhealth.github.io/exploring-fillable-forms-with-pdfrw.html
|
||||
radio_fields_by_id = {}
|
||||
|
||||
for page_index, page in enumerate(reader.pages):
|
||||
annotations = page.get('/Annots', [])
|
||||
for ann in annotations:
|
||||
field_id = get_full_annotation_field_id(ann)
|
||||
if field_id in field_info_by_id:
|
||||
field_info_by_id[field_id]["page"] = page_index + 1
|
||||
field_info_by_id[field_id]["rect"] = ann.get('/Rect')
|
||||
elif field_id in possible_radio_names:
|
||||
try:
|
||||
# ann['/AP']['/N'] should have two items. One of them is '/Off',
|
||||
# the other is the active value.
|
||||
on_values = [v for v in ann["/AP"]["/N"] if v != "/Off"]
|
||||
except KeyError:
|
||||
continue
|
||||
if len(on_values) == 1:
|
||||
rect = ann.get("/Rect")
|
||||
if field_id not in radio_fields_by_id:
|
||||
radio_fields_by_id[field_id] = {
|
||||
"field_id": field_id,
|
||||
"type": "radio_group",
|
||||
"page": page_index + 1,
|
||||
"radio_options": [],
|
||||
}
|
||||
# Note: at least on macOS 15.7, Preview.app doesn't show selected
|
||||
# radio buttons correctly. (It does if you remove the leading slash
|
||||
# from the value, but that causes them not to appear correctly in
|
||||
# Chrome/Firefox/Acrobat/etc).
|
||||
radio_fields_by_id[field_id]["radio_options"].append({
|
||||
"value": on_values[0],
|
||||
"rect": rect,
|
||||
})
|
||||
|
||||
# Some PDFs have form field definitions without corresponding annotations,
|
||||
# so we can't tell where they are. Ignore these fields for now.
|
||||
fields_with_location = []
|
||||
for field_info in field_info_by_id.values():
|
||||
if "page" in field_info:
|
||||
fields_with_location.append(field_info)
|
||||
else:
|
||||
print(f"Unable to determine location for field id: {field_info.get('field_id')}, ignoring")
|
||||
|
||||
# Sort by page number, then Y position (flipped in PDF coordinate system), then X.
|
||||
def sort_key(f):
|
||||
if "radio_options" in f:
|
||||
rect = f["radio_options"][0]["rect"] or [0, 0, 0, 0]
|
||||
else:
|
||||
rect = f.get("rect") or [0, 0, 0, 0]
|
||||
adjusted_position = [-rect[1], rect[0]]
|
||||
return [f.get("page"), adjusted_position]
|
||||
|
||||
sorted_fields = fields_with_location + list(radio_fields_by_id.values())
|
||||
sorted_fields.sort(key=sort_key)
|
||||
|
||||
return sorted_fields
|
||||
|
||||
|
||||
def write_field_info(pdf_path: str, json_output_path: str):
|
||||
reader = PdfReader(pdf_path)
|
||||
field_info = get_field_info(reader)
|
||||
with open(json_output_path, "w") as f:
|
||||
json.dump(field_info, f, indent=2)
|
||||
print(f"Wrote {len(field_info)} fields to {json_output_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) != 3:
|
||||
print("Usage: extract_form_field_info.py [input pdf] [output json]")
|
||||
sys.exit(1)
|
||||
write_field_info(sys.argv[1], sys.argv[2])
|
||||
Reference in New Issue
Block a user