Initial commit
This commit is contained in:
159
skills/docx/ooxml/scripts/pack.py
Executable file
159
skills/docx/ooxml/scripts/pack.py
Executable file
@@ -0,0 +1,159 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tool to pack a directory into a .docx, .pptx, or .xlsx file with XML formatting undone.
|
||||
|
||||
Example usage:
|
||||
python pack.py <input_directory> <office_file> [--force]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import defusedxml.minidom
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Pack a directory into an Office file")
|
||||
parser.add_argument("input_directory", help="Unpacked Office document directory")
|
||||
parser.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)")
|
||||
parser.add_argument("--force", action="store_true", help="Skip validation")
|
||||
args = parser.parse_args()
|
||||
|
||||
try:
|
||||
success = pack_document(
|
||||
args.input_directory, args.output_file, validate=not args.force
|
||||
)
|
||||
|
||||
# Show warning if validation was skipped
|
||||
if args.force:
|
||||
print("Warning: Skipped validation, file may be corrupt", file=sys.stderr)
|
||||
# Exit with error if validation failed
|
||||
elif not success:
|
||||
print("Contents would produce a corrupt file.", file=sys.stderr)
|
||||
print("Please validate XML before repacking.", file=sys.stderr)
|
||||
print("Use --force to skip validation and pack anyway.", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
except ValueError as e:
|
||||
sys.exit(f"Error: {e}")
|
||||
|
||||
|
||||
def pack_document(input_dir, output_file, validate=False):
|
||||
"""Pack a directory into an Office file (.docx/.pptx/.xlsx).
|
||||
|
||||
Args:
|
||||
input_dir: Path to unpacked Office document directory
|
||||
output_file: Path to output Office file
|
||||
validate: If True, validates with soffice (default: False)
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False if validation failed
|
||||
"""
|
||||
input_dir = Path(input_dir)
|
||||
output_file = Path(output_file)
|
||||
|
||||
if not input_dir.is_dir():
|
||||
raise ValueError(f"{input_dir} is not a directory")
|
||||
if output_file.suffix.lower() not in {".docx", ".pptx", ".xlsx"}:
|
||||
raise ValueError(f"{output_file} must be a .docx, .pptx, or .xlsx file")
|
||||
|
||||
# Work in temporary directory to avoid modifying original
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
temp_content_dir = Path(temp_dir) / "content"
|
||||
shutil.copytree(input_dir, temp_content_dir)
|
||||
|
||||
# Process XML files to remove pretty-printing whitespace
|
||||
for pattern in ["*.xml", "*.rels"]:
|
||||
for xml_file in temp_content_dir.rglob(pattern):
|
||||
condense_xml(xml_file)
|
||||
|
||||
# Create final Office file as zip archive
|
||||
output_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
with zipfile.ZipFile(output_file, "w", zipfile.ZIP_DEFLATED) as zf:
|
||||
for f in temp_content_dir.rglob("*"):
|
||||
if f.is_file():
|
||||
zf.write(f, f.relative_to(temp_content_dir))
|
||||
|
||||
# Validate if requested
|
||||
if validate:
|
||||
if not validate_document(output_file):
|
||||
output_file.unlink() # Delete the corrupt file
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def validate_document(doc_path):
|
||||
"""Validate document by converting to HTML with soffice."""
|
||||
# Determine the correct filter based on file extension
|
||||
match doc_path.suffix.lower():
|
||||
case ".docx":
|
||||
filter_name = "html:HTML"
|
||||
case ".pptx":
|
||||
filter_name = "html:impress_html_Export"
|
||||
case ".xlsx":
|
||||
filter_name = "html:HTML (StarCalc)"
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[
|
||||
"soffice",
|
||||
"--headless",
|
||||
"--convert-to",
|
||||
filter_name,
|
||||
"--outdir",
|
||||
temp_dir,
|
||||
str(doc_path),
|
||||
],
|
||||
capture_output=True,
|
||||
timeout=10,
|
||||
text=True,
|
||||
)
|
||||
if not (Path(temp_dir) / f"{doc_path.stem}.html").exists():
|
||||
error_msg = result.stderr.strip() or "Document validation failed"
|
||||
print(f"Validation error: {error_msg}", file=sys.stderr)
|
||||
return False
|
||||
return True
|
||||
except FileNotFoundError:
|
||||
print("Warning: soffice not found. Skipping validation.", file=sys.stderr)
|
||||
return True
|
||||
except subprocess.TimeoutExpired:
|
||||
print("Validation error: Timeout during conversion", file=sys.stderr)
|
||||
return False
|
||||
except Exception as e:
|
||||
print(f"Validation error: {e}", file=sys.stderr)
|
||||
return False
|
||||
|
||||
|
||||
def condense_xml(xml_file):
|
||||
"""Strip unnecessary whitespace and remove comments."""
|
||||
with open(xml_file, "r", encoding="utf-8") as f:
|
||||
dom = defusedxml.minidom.parse(f)
|
||||
|
||||
# Process each element to remove whitespace and comments
|
||||
for element in dom.getElementsByTagName("*"):
|
||||
# Skip w:t elements and their processing
|
||||
if element.tagName.endswith(":t"):
|
||||
continue
|
||||
|
||||
# Remove whitespace-only text nodes and comment nodes
|
||||
for child in list(element.childNodes):
|
||||
if (
|
||||
child.nodeType == child.TEXT_NODE
|
||||
and child.nodeValue
|
||||
and child.nodeValue.strip() == ""
|
||||
) or child.nodeType == child.COMMENT_NODE:
|
||||
element.removeChild(child)
|
||||
|
||||
# Write back the condensed XML
|
||||
with open(xml_file, "wb") as f:
|
||||
f.write(dom.toxml(encoding="UTF-8"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user