Initial commit

This commit is contained in:
Zhongwei Li
2025-11-29 18:03:26 +08:00
commit 3cf7af5962
134 changed files with 53723 additions and 0 deletions

View File

@@ -0,0 +1,108 @@
import json
import sys
from pypdf import PdfReader, PdfWriter
from pypdf.annotations import FreeText
# Fills a PDF by adding text annotations defined in `fields.json`. See FORMS.md.
def transform_coordinates(bbox, image_width, image_height, pdf_width, pdf_height):
    """Transform a bounding box from image coordinates to PDF coordinates.

    Image coordinates have their origin at the top-left with y increasing
    downward; PDF coordinates have their origin at the bottom-left with y
    increasing upward. Each axis is scaled by the PDF/image size ratio and
    the y values are then flipped against ``pdf_height``.

    Args:
        bbox: Sequence whose first four items are read as
            ``(left, top, right, bottom)`` in image coordinates.
        image_width: Width of the image the box was measured on.
        image_height: Height of the image the box was measured on.
        pdf_width: Width of the target PDF page.
        pdf_height: Height of the target PDF page.

    Returns:
        A ``(left, bottom, right, top)`` tuple in PDF coordinates.

    Raises:
        ValueError: If ``image_width`` or ``image_height`` is not positive.
    """
    # A zero size would divide by zero and a negative one would silently
    # mirror the box, so reject both with an explicit message.
    if image_width <= 0 or image_height <= 0:
        raise ValueError(
            f"image dimensions must be positive, got {image_width}x{image_height}"
        )

    x_scale = pdf_width / image_width
    y_scale = pdf_height / image_height

    left = bbox[0] * x_scale
    right = bbox[2] * x_scale

    # Flip Y: the image's top edge (small y) becomes the PDF's larger y.
    top = pdf_height - (bbox[1] * y_scale)
    bottom = pdf_height - (bbox[3] * y_scale)

    return left, bottom, right, top
def fill_pdf_form(input_pdf_path, fields_json_path, output_pdf_path):
    """Fill a PDF by adding a FreeText annotation for every non-empty field.

    The field definitions are read from ``fields_json_path`` (format
    described in FORMS.md). All pages of ``input_pdf_path`` are copied into
    a new writer, one annotation is added for each field whose
    ``entry_text.text`` is non-empty, and the result is written to
    ``output_pdf_path``. A short summary is printed on success.

    Args:
        input_pdf_path: Path of the PDF to read.
        fields_json_path: Path of the JSON file describing the fields.
        output_pdf_path: Path the annotated PDF is written to.

    Raises:
        ValueError: If a non-empty field refers to a page number that has no
            entry in the JSON ``pages`` list or that does not exist in the
            PDF.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's
    # default text encoding (which would corrupt non-ASCII field text).
    with open(fields_json_path, "r", encoding="utf-8") as f:
        fields_data = json.load(f)

    # Open the PDF and copy all of its pages into the writer.
    reader = PdfReader(input_pdf_path)
    writer = PdfWriter()
    writer.append(reader)

    # 1-based page number -> (width, height) of that page's media box.
    # NOTE(review): only the media box size is used; a media box that does
    # not start at (0, 0), or a rotated page, would presumably need extra
    # handling -- confirm against the PDFs this is used with.
    pdf_dimensions = {
        page_index: (page.mediabox.width, page.mediabox.height)
        for page_index, page in enumerate(reader.pages, start=1)
    }

    annotation_count = 0
    for field in fields_data["form_fields"]:
        # Skip empty fields first so they need neither a bounding box nor
        # page metadata.
        entry_text = field.get("entry_text")
        if not isinstance(entry_text, dict):
            continue
        text = entry_text.get("text")
        if not text:
            continue

        page_num = field["page_number"]

        # Image size for this page, as recorded in the JSON (first match).
        page_info = next(
            (p for p in fields_data["pages"] if p["page_number"] == page_num),
            None,
        )
        if page_info is None:
            raise ValueError(
                f"Field refers to page {page_num}, which has no entry in 'pages'"
            )
        if page_num not in pdf_dimensions:
            raise ValueError(
                f"Field refers to page {page_num}, but the PDF has "
                f"{len(pdf_dimensions)} page(s)"
            )
        pdf_width, pdf_height = pdf_dimensions[page_num]

        transformed_entry_box = transform_coordinates(
            field["entry_bounding_box"],
            page_info["image_width"],
            page_info["image_height"],
            pdf_width,
            pdf_height,
        )

        font_name = entry_text.get("font", "Arial")
        font_size = str(entry_text.get("font_size", 14)) + "pt"
        font_color = entry_text.get("font_color", "000000")

        # Font size/color seems to not work reliably across viewers:
        # https://github.com/py-pdf/pypdf/issues/2084
        annotation = FreeText(
            text=text,
            rect=transformed_entry_box,
            font=font_name,
            font_size=font_size,
            font_color=font_color,
            border_color=None,
            background_color=None,
        )
        # page_number is 0-based for pypdf
        writer.add_annotation(page_number=page_num - 1, annotation=annotation)
        annotation_count += 1

    # Save the filled PDF
    with open(output_pdf_path, "wb") as output:
        writer.write(output)

    print(f"Successfully filled PDF form and saved to {output_pdf_path}")
    print(f"Added {annotation_count} text annotations")
if __name__ == "__main__":
    # Command-line entry point: exactly three arguments are expected after
    # the script name; anything else prints the usage line and exits with 1.
    if len(sys.argv) != 4:
        print("Usage: fill_pdf_form_with_annotations.py [input pdf] [fields.json] [output pdf]")
        sys.exit(1)
    input_pdf, fields_json, output_pdf = sys.argv[1:]
    fill_pdf_form(input_pdf, fields_json, output_pdf)