Files
gh-wielandbrendel-research-…/skills/pptx/ooxml/scripts/pack.py
2025-11-30 09:06:10 +08:00

160 lines
5.5 KiB
Python

#!/usr/bin/env python3
"""
Tool to pack a directory into a .docx, .pptx, or .xlsx file with XML formatting undone.
Example usage:
python pack.py <input_directory> <office_file> [--force]
"""
import argparse
import shutil
import subprocess
import sys
import tempfile
import defusedxml.minidom
import zipfile
from pathlib import Path
def main():
parser = argparse.ArgumentParser(description="Pack a directory into an Office file")
parser.add_argument("input_directory", help="Unpacked Office document directory")
parser.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)")
parser.add_argument("--force", action="store_true", help="Skip validation")
args = parser.parse_args()
try:
success = pack_document(
args.input_directory, args.output_file, validate=not args.force
)
# Show warning if validation was skipped
if args.force:
print("Warning: Skipped validation, file may be corrupt", file=sys.stderr)
# Exit with error if validation failed
elif not success:
print("Contents would produce a corrupt file.", file=sys.stderr)
print("Please validate XML before repacking.", file=sys.stderr)
print("Use --force to skip validation and pack anyway.", file=sys.stderr)
sys.exit(1)
except ValueError as e:
sys.exit(f"Error: {e}")
def pack_document(input_dir, output_file, validate=False):
"""Pack a directory into an Office file (.docx/.pptx/.xlsx).
Args:
input_dir: Path to unpacked Office document directory
output_file: Path to output Office file
validate: If True, validates with soffice (default: False)
Returns:
bool: True if successful, False if validation failed
"""
input_dir = Path(input_dir)
output_file = Path(output_file)
if not input_dir.is_dir():
raise ValueError(f"{input_dir} is not a directory")
if output_file.suffix.lower() not in {".docx", ".pptx", ".xlsx"}:
raise ValueError(f"{output_file} must be a .docx, .pptx, or .xlsx file")
# Work in temporary directory to avoid modifying original
with tempfile.TemporaryDirectory() as temp_dir:
temp_content_dir = Path(temp_dir) / "content"
shutil.copytree(input_dir, temp_content_dir)
# Process XML files to remove pretty-printing whitespace
for pattern in ["*.xml", "*.rels"]:
for xml_file in temp_content_dir.rglob(pattern):
condense_xml(xml_file)
# Create final Office file as zip archive
output_file.parent.mkdir(parents=True, exist_ok=True)
with zipfile.ZipFile(output_file, "w", zipfile.ZIP_DEFLATED) as zf:
for f in temp_content_dir.rglob("*"):
if f.is_file():
zf.write(f, f.relative_to(temp_content_dir))
# Validate if requested
if validate:
if not validate_document(output_file):
output_file.unlink() # Delete the corrupt file
return False
return True
def validate_document(doc_path):
"""Validate document by converting to HTML with soffice."""
# Determine the correct filter based on file extension
match doc_path.suffix.lower():
case ".docx":
filter_name = "html:HTML"
case ".pptx":
filter_name = "html:impress_html_Export"
case ".xlsx":
filter_name = "html:HTML (StarCalc)"
with tempfile.TemporaryDirectory() as temp_dir:
try:
result = subprocess.run(
[
"soffice",
"--headless",
"--convert-to",
filter_name,
"--outdir",
temp_dir,
str(doc_path),
],
capture_output=True,
timeout=10,
text=True,
)
if not (Path(temp_dir) / f"{doc_path.stem}.html").exists():
error_msg = result.stderr.strip() or "Document validation failed"
print(f"Validation error: {error_msg}", file=sys.stderr)
return False
return True
except FileNotFoundError:
print("Warning: soffice not found. Skipping validation.", file=sys.stderr)
return True
except subprocess.TimeoutExpired:
print("Validation error: Timeout during conversion", file=sys.stderr)
return False
except Exception as e:
print(f"Validation error: {e}", file=sys.stderr)
return False
def condense_xml(xml_file):
"""Strip unnecessary whitespace and remove comments."""
with open(xml_file, "r", encoding="utf-8") as f:
dom = defusedxml.minidom.parse(f)
# Process each element to remove whitespace and comments
for element in dom.getElementsByTagName("*"):
# Skip w:t elements and their processing
if element.tagName.endswith(":t"):
continue
# Remove whitespace-only text nodes and comment nodes
for child in list(element.childNodes):
if (
child.nodeType == child.TEXT_NODE
and child.nodeValue
and child.nodeValue.strip() == ""
) or child.nodeType == child.COMMENT_NODE:
element.removeChild(child)
# Write back the condensed XML
with open(xml_file, "wb") as f:
f.write(dom.toxml(encoding="UTF-8"))
if __name__ == "__main__":
main()