#!/usr/bin/env python3 """ Tool to pack a directory into a .docx, .pptx, or .xlsx file with XML formatting undone. Example usage: python pack.py [--force] """ import argparse import shutil import subprocess import sys import tempfile import defusedxml.minidom import zipfile from pathlib import Path def main(): parser = argparse.ArgumentParser(description="Pack a directory into an Office file") parser.add_argument("input_directory", help="Unpacked Office document directory") parser.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)") parser.add_argument("--force", action="store_true", help="Skip validation") args = parser.parse_args() try: success = pack_document( args.input_directory, args.output_file, validate=not args.force ) # Show warning if validation was skipped if args.force: print("Warning: Skipped validation, file may be corrupt", file=sys.stderr) # Exit with error if validation failed elif not success: print("Contents would produce a corrupt file.", file=sys.stderr) print("Please validate XML before repacking.", file=sys.stderr) print("Use --force to skip validation and pack anyway.", file=sys.stderr) sys.exit(1) except ValueError as e: sys.exit(f"Error: {e}") def pack_document(input_dir, output_file, validate=False): """Pack a directory into an Office file (.docx/.pptx/.xlsx). Args: input_dir: Path to unpacked Office document directory output_file: Path to output Office file validate: If True, validates with soffice (default: False) Returns: bool: True if successful, False if validation failed """ input_dir = Path(input_dir) output_file = Path(output_file) if not input_dir.is_dir(): raise ValueError(f"{input_dir} is not a directory") if output_file.suffix.lower() not in {".docx", ".pptx", ".xlsx"}: raise ValueError(f"{output_file} must be a .docx, .pptx, or .xlsx file") # Work in temporary directory to avoid modifying original with tempfile.TemporaryDirectory() as temp_dir: temp_content_dir = Path(temp_dir) / "content" shutil.copytree(input_dir, temp_content_dir) # Process XML files to remove pretty-printing whitespace for pattern in ["*.xml", "*.rels"]: for xml_file in temp_content_dir.rglob(pattern): condense_xml(xml_file) # Create final Office file as zip archive output_file.parent.mkdir(parents=True, exist_ok=True) with zipfile.ZipFile(output_file, "w", zipfile.ZIP_DEFLATED) as zf: for f in temp_content_dir.rglob("*"): if f.is_file(): zf.write(f, f.relative_to(temp_content_dir)) # Validate if requested if validate: if not validate_document(output_file): output_file.unlink() # Delete the corrupt file return False return True def validate_document(doc_path): """Validate document by converting to HTML with soffice.""" # Determine the correct filter based on file extension match doc_path.suffix.lower(): case ".docx": filter_name = "html:HTML" case ".pptx": filter_name = "html:impress_html_Export" case ".xlsx": filter_name = "html:HTML (StarCalc)" with tempfile.TemporaryDirectory() as temp_dir: try: result = subprocess.run( [ "soffice", "--headless", "--convert-to", filter_name, "--outdir", temp_dir, str(doc_path), ], capture_output=True, timeout=10, text=True, ) if not (Path(temp_dir) / f"{doc_path.stem}.html").exists(): error_msg = result.stderr.strip() or "Document validation failed" print(f"Validation error: {error_msg}", file=sys.stderr) return False return True except FileNotFoundError: print("Warning: soffice not found. Skipping validation.", file=sys.stderr) return True except subprocess.TimeoutExpired: print("Validation error: Timeout during conversion", file=sys.stderr) return False except Exception as e: print(f"Validation error: {e}", file=sys.stderr) return False def condense_xml(xml_file): """Strip unnecessary whitespace and remove comments.""" with open(xml_file, "r", encoding="utf-8") as f: dom = defusedxml.minidom.parse(f) # Process each element to remove whitespace and comments for element in dom.getElementsByTagName("*"): # Skip w:t elements and their processing if element.tagName.endswith(":t"): continue # Remove whitespace-only text nodes and comment nodes for child in list(element.childNodes): if ( child.nodeType == child.TEXT_NODE and child.nodeValue and child.nodeValue.strip() == "" ) or child.nodeType == child.COMMENT_NODE: element.removeChild(child) # Write back the condensed XML with open(xml_file, "wb") as f: f.write(dom.toxml(encoding="UTF-8")) if __name__ == "__main__": main()