import json import sys from pypdf import PdfReader # Extracts data for the fillable form fields in a PDF and outputs JSON that # Claude uses to fill the fields. See forms.md. # This matches the format used by PdfReader `get_fields` and `update_page_form_field_values` methods. def get_full_annotation_field_id(annotation): components = [] while annotation: field_name = annotation.get('/T') if field_name: components.append(field_name) annotation = annotation.get('/Parent') return ".".join(reversed(components)) if components else None def make_field_dict(field, field_id): field_dict = {"field_id": field_id} ft = field.get('/FT') if ft == "/Tx": field_dict["type"] = "text" elif ft == "/Btn": field_dict["type"] = "checkbox" # radio groups handled separately states = field.get("/_States_", []) if len(states) == 2: # "/Off" seems to always be the unchecked value, as suggested by # https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf#page=448 # It can be either first or second in the "/_States_" list. if "/Off" in states: field_dict["checked_value"] = states[0] if states[0] != "/Off" else states[1] field_dict["unchecked_value"] = "/Off" else: print(f"Unexpected state values for checkbox `${field_id}`. Its checked and unchecked values may not be correct; if you're trying to check it, visually verify the results.") field_dict["checked_value"] = states[0] field_dict["unchecked_value"] = states[1] elif ft == "/Ch": field_dict["type"] = "choice" states = field.get("/_States_", []) field_dict["choice_options"] = [{ "value": state[0], "text": state[1], } for state in states] else: field_dict["type"] = f"unknown ({ft})" return field_dict # Returns a list of fillable PDF fields: # [ # { # "field_id": "name", # "page": 1, # "type": ("text", "checkbox", "radio_group", or "choice") # // Per-type additional fields described in forms.md # }, # ] def get_field_info(reader: PdfReader): fields = reader.get_fields() field_info_by_id = {} possible_radio_names = set() for field_id, field in fields.items(): # Skip if this is a container field with children, except that it might be # a parent group for radio button options. if field.get("/Kids"): if field.get("/FT") == "/Btn": possible_radio_names.add(field_id) continue field_info_by_id[field_id] = make_field_dict(field, field_id) # Bounding rects are stored in annotations in page objects. # Radio button options have a separate annotation for each choice; # all choices have the same field name. # See https://westhealth.github.io/exploring-fillable-forms-with-pdfrw.html radio_fields_by_id = {} for page_index, page in enumerate(reader.pages): annotations = page.get('/Annots', []) for ann in annotations: field_id = get_full_annotation_field_id(ann) if field_id in field_info_by_id: field_info_by_id[field_id]["page"] = page_index + 1 field_info_by_id[field_id]["rect"] = ann.get('/Rect') elif field_id in possible_radio_names: try: # ann['/AP']['/N'] should have two items. One of them is '/Off', # the other is the active value. on_values = [v for v in ann["/AP"]["/N"] if v != "/Off"] except KeyError: continue if len(on_values) == 1: rect = ann.get("/Rect") if field_id not in radio_fields_by_id: radio_fields_by_id[field_id] = { "field_id": field_id, "type": "radio_group", "page": page_index + 1, "radio_options": [], } # Note: at least on macOS 15.7, Preview.app doesn't show selected # radio buttons correctly. (It does if you remove the leading slash # from the value, but that causes them not to appear correctly in # Chrome/Firefox/Acrobat/etc). radio_fields_by_id[field_id]["radio_options"].append({ "value": on_values[0], "rect": rect, }) # Some PDFs have form field definitions without corresponding annotations, # so we can't tell where they are. Ignore these fields for now. fields_with_location = [] for field_info in field_info_by_id.values(): if "page" in field_info: fields_with_location.append(field_info) else: print(f"Unable to determine location for field id: {field_info.get('field_id')}, ignoring") # Sort by page number, then Y position (flipped in PDF coordinate system), then X. def sort_key(f): if "radio_options" in f: rect = f["radio_options"][0]["rect"] or [0, 0, 0, 0] else: rect = f.get("rect") or [0, 0, 0, 0] adjusted_position = [-rect[1], rect[0]] return [f.get("page"), adjusted_position] sorted_fields = fields_with_location + list(radio_fields_by_id.values()) sorted_fields.sort(key=sort_key) return sorted_fields def write_field_info(pdf_path: str, json_output_path: str): reader = PdfReader(pdf_path) field_info = get_field_info(reader) with open(json_output_path, "w") as f: json.dump(field_info, f, indent=2) print(f"Wrote {len(field_info)} fields to {json_output_path}") if __name__ == "__main__": if len(sys.argv) != 3: print("Usage: extract_form_field_info.py [input pdf] [output json]") sys.exit(1) write_field_info(sys.argv[1], sys.argv[2])