Initial commit

2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions
--- a/skills/docx/ooxml/scripts/pack.py
+++ b/skills/docx/ooxml/scripts/pack.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+"""
+Tool to pack a directory into a .docx, .pptx, or .xlsx file with XML formatting undone.
+
+Example usage:
+    python pack.py <input_directory> <office_file> [--force]
+"""
+
+import argparse
+import shutil
+import subprocess
+import sys
+import tempfile
+import defusedxml.minidom
+import zipfile
+from pathlib import Path
+
+
+def main():
+    """Command-line entry point: parse arguments and pack the directory.
+
+    Validation runs unless ``--force`` is given. Exits with status 1 when
+    validation fails, and exits with an ``Error: ...`` message when
+    pack_document raises ValueError.
+    """
+    cli = argparse.ArgumentParser(description="Pack a directory into an Office file")
+    cli.add_argument("input_directory", help="Unpacked Office document directory")
+    cli.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)")
+    cli.add_argument("--force", action="store_true", help="Skip validation")
+    options = cli.parse_args()
+
+    try:
+        packed_ok = pack_document(
+            options.input_directory,
+            options.output_file,
+            validate=not options.force,
+        )
+
+        if options.force:
+            # Validation was bypassed on request; tell the user the risk.
+            print("Warning: Skipped validation, file may be corrupt", file=sys.stderr)
+        elif not packed_ok:
+            # Validation ran and rejected the archive.
+            failure_lines = (
+                "Contents would produce a corrupt file.",
+                "Please validate XML before repacking.",
+                "Use --force to skip validation and pack anyway.",
+            )
+            for line in failure_lines:
+                print(line, file=sys.stderr)
+            sys.exit(1)
+
+    except ValueError as err:
+        sys.exit(f"Error: {err}")
+
+
+def pack_document(input_dir, output_file, validate=False):
+    """Build an Office archive (.docx/.pptx/.xlsx) from an unpacked directory.
+
+    The directory is copied to a scratch location, every ``*.xml`` and
+    ``*.rels`` file in the copy is condensed, and the copy is zipped into
+    ``output_file``. The input directory itself is never modified.
+
+    Args:
+        input_dir: Directory holding the unpacked Office document
+        output_file: Destination path; must end in .docx, .pptx, or .xlsx
+        validate: If True, run validate_document on the result and delete
+            the output when it fails (default: False)
+
+    Returns:
+        bool: False if validation was requested and failed, True otherwise
+
+    Raises:
+        ValueError: If input_dir is not a directory or output_file has an
+            unsupported extension
+    """
+    source_root = Path(input_dir)
+    target = Path(output_file)
+
+    if not source_root.is_dir():
+        raise ValueError(f"{source_root} is not a directory")
+    if target.suffix.lower() not in {".docx", ".pptx", ".xlsx"}:
+        raise ValueError(f"{target} must be a .docx, .pptx, or .xlsx file")
+
+    # All rewriting happens on a throwaway copy of the input tree
+    with tempfile.TemporaryDirectory() as scratch:
+        staging = Path(scratch) / "content"
+        shutil.copytree(source_root, staging)
+
+        # Undo pretty-printing in every XML part and relationship file
+        for glob in ("*.xml", "*.rels"):
+            for part in staging.rglob(glob):
+                condense_xml(part)
+
+        # Zip the staged tree, storing member names relative to its root
+        target.parent.mkdir(parents=True, exist_ok=True)
+        with zipfile.ZipFile(target, "w", zipfile.ZIP_DEFLATED) as archive:
+            for entry in staging.rglob("*"):
+                if not entry.is_file():
+                    continue
+                archive.write(entry, entry.relative_to(staging))
+
+        # A file that fails validation is removed rather than left behind
+        if validate and not validate_document(target):
+            target.unlink()
+            return False
+
+    return True
+
+
+def validate_document(doc_path, timeout=10):
+    """Validate a document by converting it to HTML with soffice.
+
+    Args:
+        doc_path: Path (str or Path) to the .docx, .pptx, or .xlsx file
+        timeout: Seconds to wait for soffice before giving up (default: 10)
+
+    Returns:
+        bool: True if soffice produced the HTML output, or if soffice is not
+            installed (validation is skipped with a warning). False if no
+            output was produced, the conversion timed out, or another error
+            occurred while running soffice.
+
+    Raises:
+        ValueError: If doc_path does not end in .docx, .pptx, or .xlsx
+    """
+    doc_path = Path(doc_path)
+
+    # Export filter to request from soffice for each supported extension
+    export_filters = {
+        ".docx": "html:HTML",
+        ".pptx": "html:impress_html_Export",
+        ".xlsx": "html:HTML (StarCalc)",
+    }
+    filter_name = export_filters.get(doc_path.suffix.lower())
+    if filter_name is None:
+        # Fail with a clear message instead of an unbound-variable error
+        raise ValueError(f"{doc_path} must be a .docx, .pptx, or .xlsx file")
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        try:
+            result = subprocess.run(
+                [
+                    "soffice",
+                    "--headless",
+                    "--convert-to",
+                    filter_name,
+                    "--outdir",
+                    temp_dir,
+                    str(doc_path),
+                ],
+                capture_output=True,
+                timeout=timeout,
+                text=True,
+            )
+            # Success is judged by the output file existing, not the exit code
+            if not (Path(temp_dir) / f"{doc_path.stem}.html").exists():
+                error_msg = result.stderr.strip() or "Document validation failed"
+                print(f"Validation error: {error_msg}", file=sys.stderr)
+                return False
+            return True
+        except FileNotFoundError:
+            # Missing soffice is deliberately treated as "cannot validate"
+            print("Warning: soffice not found. Skipping validation.", file=sys.stderr)
+            return True
+        except subprocess.TimeoutExpired:
+            print("Validation error: Timeout during conversion", file=sys.stderr)
+            return False
+        except Exception as e:
+            print(f"Validation error: {e}", file=sys.stderr)
+            return False
+
+
+def condense_xml(xml_file):
+    """Rewrite an XML file in place without pretty-printing whitespace or comments.
+
+    Whitespace-only text nodes and comment nodes are removed from every
+    element, except that the following elements are left untouched so that
+    significant whitespace survives:
+
+    * elements whose local name is ``t`` (``w:t``, ``a:t``, or an unprefixed
+      ``t`` when the namespace is the document default)
+    * elements that carry ``xml:space="preserve"`` themselves
+
+    Args:
+        xml_file: Path to the XML file; it is overwritten as UTF-8
+    """
+    # Binary mode lets the parser apply the encoding from the XML declaration
+    with open(xml_file, "rb") as f:
+        dom = defusedxml.minidom.parse(f)
+
+    # Process each element to remove whitespace and comments
+    for element in dom.getElementsByTagName("*"):
+        # Compare the local name so unprefixed <t> is protected like <w:t>
+        local_name = element.tagName.rpartition(":")[2]
+        if local_name == "t" or element.getAttribute("xml:space") == "preserve":
+            continue
+
+        # Iterate over a copy because children are removed during the loop
+        for child in list(element.childNodes):
+            is_blank_text = (
+                child.nodeType == child.TEXT_NODE
+                and child.nodeValue
+                and child.nodeValue.strip() == ""
+            )
+            if is_blank_text or child.nodeType == child.COMMENT_NODE:
+                element.removeChild(child)
+
+    # Write back the condensed XML
+    with open(xml_file, "wb") as f:
+        f.write(dom.toxml(encoding="UTF-8"))
+
+
+if __name__ == "__main__":
+    main()