160 lines
5.5 KiB
Python
160 lines
5.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Tool to pack a directory into a .docx, .pptx, or .xlsx file with XML formatting undone.
|
|
|
|
Example usage:
|
|
python pack.py <input_directory> <office_file> [--force]
|
|
"""
|
|
|
|
import argparse
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
import defusedxml.minidom
|
|
import zipfile
|
|
from pathlib import Path
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description="Pack a directory into an Office file")
|
|
parser.add_argument("input_directory", help="Unpacked Office document directory")
|
|
parser.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)")
|
|
parser.add_argument("--force", action="store_true", help="Skip validation")
|
|
args = parser.parse_args()
|
|
|
|
try:
|
|
success = pack_document(
|
|
args.input_directory, args.output_file, validate=not args.force
|
|
)
|
|
|
|
# Show warning if validation was skipped
|
|
if args.force:
|
|
print("Warning: Skipped validation, file may be corrupt", file=sys.stderr)
|
|
# Exit with error if validation failed
|
|
elif not success:
|
|
print("Contents would produce a corrupt file.", file=sys.stderr)
|
|
print("Please validate XML before repacking.", file=sys.stderr)
|
|
print("Use --force to skip validation and pack anyway.", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
except ValueError as e:
|
|
sys.exit(f"Error: {e}")
|
|
|
|
|
|
def pack_document(input_dir, output_file, validate=False):
|
|
"""Pack a directory into an Office file (.docx/.pptx/.xlsx).
|
|
|
|
Args:
|
|
input_dir: Path to unpacked Office document directory
|
|
output_file: Path to output Office file
|
|
validate: If True, validates with soffice (default: False)
|
|
|
|
Returns:
|
|
bool: True if successful, False if validation failed
|
|
"""
|
|
input_dir = Path(input_dir)
|
|
output_file = Path(output_file)
|
|
|
|
if not input_dir.is_dir():
|
|
raise ValueError(f"{input_dir} is not a directory")
|
|
if output_file.suffix.lower() not in {".docx", ".pptx", ".xlsx"}:
|
|
raise ValueError(f"{output_file} must be a .docx, .pptx, or .xlsx file")
|
|
|
|
# Work in temporary directory to avoid modifying original
|
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
temp_content_dir = Path(temp_dir) / "content"
|
|
shutil.copytree(input_dir, temp_content_dir)
|
|
|
|
# Process XML files to remove pretty-printing whitespace
|
|
for pattern in ["*.xml", "*.rels"]:
|
|
for xml_file in temp_content_dir.rglob(pattern):
|
|
condense_xml(xml_file)
|
|
|
|
# Create final Office file as zip archive
|
|
output_file.parent.mkdir(parents=True, exist_ok=True)
|
|
with zipfile.ZipFile(output_file, "w", zipfile.ZIP_DEFLATED) as zf:
|
|
for f in temp_content_dir.rglob("*"):
|
|
if f.is_file():
|
|
zf.write(f, f.relative_to(temp_content_dir))
|
|
|
|
# Validate if requested
|
|
if validate:
|
|
if not validate_document(output_file):
|
|
output_file.unlink() # Delete the corrupt file
|
|
return False
|
|
|
|
return True
|
|
|
|
|
|
def validate_document(doc_path):
|
|
"""Validate document by converting to HTML with soffice."""
|
|
# Determine the correct filter based on file extension
|
|
match doc_path.suffix.lower():
|
|
case ".docx":
|
|
filter_name = "html:HTML"
|
|
case ".pptx":
|
|
filter_name = "html:impress_html_Export"
|
|
case ".xlsx":
|
|
filter_name = "html:HTML (StarCalc)"
|
|
|
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
try:
|
|
result = subprocess.run(
|
|
[
|
|
"soffice",
|
|
"--headless",
|
|
"--convert-to",
|
|
filter_name,
|
|
"--outdir",
|
|
temp_dir,
|
|
str(doc_path),
|
|
],
|
|
capture_output=True,
|
|
timeout=10,
|
|
text=True,
|
|
)
|
|
if not (Path(temp_dir) / f"{doc_path.stem}.html").exists():
|
|
error_msg = result.stderr.strip() or "Document validation failed"
|
|
print(f"Validation error: {error_msg}", file=sys.stderr)
|
|
return False
|
|
return True
|
|
except FileNotFoundError:
|
|
print("Warning: soffice not found. Skipping validation.", file=sys.stderr)
|
|
return True
|
|
except subprocess.TimeoutExpired:
|
|
print("Validation error: Timeout during conversion", file=sys.stderr)
|
|
return False
|
|
except Exception as e:
|
|
print(f"Validation error: {e}", file=sys.stderr)
|
|
return False
|
|
|
|
|
|
def condense_xml(xml_file):
|
|
"""Strip unnecessary whitespace and remove comments."""
|
|
with open(xml_file, "r", encoding="utf-8") as f:
|
|
dom = defusedxml.minidom.parse(f)
|
|
|
|
# Process each element to remove whitespace and comments
|
|
for element in dom.getElementsByTagName("*"):
|
|
# Skip w:t elements and their processing
|
|
if element.tagName.endswith(":t"):
|
|
continue
|
|
|
|
# Remove whitespace-only text nodes and comment nodes
|
|
for child in list(element.childNodes):
|
|
if (
|
|
child.nodeType == child.TEXT_NODE
|
|
and child.nodeValue
|
|
and child.nodeValue.strip() == ""
|
|
) or child.nodeType == child.COMMENT_NODE:
|
|
element.removeChild(child)
|
|
|
|
# Write back the condensed XML
|
|
with open(xml_file, "wb") as f:
|
|
f.write(dom.toxml(encoding="UTF-8"))
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|