Initial commit

2025-11-30 08:32:07 +08:00
commit 555bb31091
133 changed files with 54100 additions and 0 deletions
--- a/skills/docx/ooxml/scripts/validation/docx.py
+++ b/skills/docx/ooxml/scripts/validation/docx.py
@@ -0,0 +1,274 @@
+"""
+Validator for Word document XML files against XSD schemas.
+"""
+
+import re
+import tempfile
+import zipfile
+
+import lxml.etree
+
+from .base import BaseSchemaValidator
+
+
+class DOCXSchemaValidator(BaseSchemaValidator):
+    """Validator for Word document XML files against XSD schemas."""
+
+    # Word-specific namespace
+    WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+
+    # Word-specific element to relationship type mappings
+    # Start with empty mapping - add specific cases as we discover them
+    ELEMENT_RELATIONSHIP_TYPES = {}
+
+    def validate(self):
+        """Run all validation checks and return True if all pass."""
+        # Test 0: XML well-formedness
+        if not self.validate_xml():
+            return False
+
+        # Test 1: Namespace declarations
+        all_valid = True
+        if not self.validate_namespaces():
+            all_valid = False
+
+        # Test 2: Unique IDs
+        if not self.validate_unique_ids():
+            all_valid = False
+
+        # Test 3: Relationship and file reference validation
+        if not self.validate_file_references():
+            all_valid = False
+
+        # Test 4: Content type declarations
+        if not self.validate_content_types():
+            all_valid = False
+
+        # Test 5: XSD schema validation
+        if not self.validate_against_xsd():
+            all_valid = False
+
+        # Test 6: Whitespace preservation
+        if not self.validate_whitespace_preservation():
+            all_valid = False
+
+        # Test 7: Deletion validation
+        if not self.validate_deletions():
+            all_valid = False
+
+        # Test 8: Insertion validation
+        if not self.validate_insertions():
+            all_valid = False
+
+        # Test 9: Relationship ID reference validation
+        if not self.validate_all_relationship_ids():
+            all_valid = False
+
+        # Count and compare paragraphs
+        self.compare_paragraph_counts()
+
+        return all_valid
+
+    def validate_whitespace_preservation(self):
+        """
+        Validate that w:t elements with whitespace have xml:space='preserve'.
+        """
+        errors = []
+
+        for xml_file in self.xml_files:
+            # Only check document.xml files
+            if xml_file.name != "document.xml":
+                continue
+
+            try:
+                root = lxml.etree.parse(str(xml_file)).getroot()
+
+                # Find all w:t elements
+                for elem in root.iter(f"{{{self.WORD_2006_NAMESPACE}}}t"):
+                    if elem.text:
+                        text = elem.text
+                        # Check if text starts or ends with whitespace
+                        if re.match(r"^\s.*", text) or re.match(r".*\s$", text):
+                            # Check if xml:space="preserve" attribute exists
+                            xml_space_attr = f"{{{self.XML_NAMESPACE}}}space"
+                            if (
+                                xml_space_attr not in elem.attrib
+                                or elem.attrib[xml_space_attr] != "preserve"
+                            ):
+                                # Show a preview of the text
+                                text_preview = (
+                                    repr(text)[:50] + "..."
+                                    if len(repr(text)) > 50
+                                    else repr(text)
+                                )
+                                errors.append(
+                                    f"  {xml_file.relative_to(self.unpacked_dir)}: "
+                                    f"Line {elem.sourceline}: w:t element with whitespace missing xml:space='preserve': {text_preview}"
+                                )
+
+            except (lxml.etree.XMLSyntaxError, Exception) as e:
+                errors.append(
+                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
+                )
+
+        if errors:
+            print(f"FAILED - Found {len(errors)} whitespace preservation violations:")
+            for error in errors:
+                print(error)
+            return False
+        else:
+            if self.verbose:
+                print("PASSED - All whitespace is properly preserved")
+            return True
+
+    def validate_deletions(self):
+        """
+        Validate that w:t elements are not within w:del elements.
+        For some reason, XSD validation does not catch this, so we do it manually.
+        """
+        errors = []
+
+        for xml_file in self.xml_files:
+            # Only check document.xml files
+            if xml_file.name != "document.xml":
+                continue
+
+            try:
+                root = lxml.etree.parse(str(xml_file)).getroot()
+
+                # Find all w:t elements that are descendants of w:del elements
+                namespaces = {"w": self.WORD_2006_NAMESPACE}
+                xpath_expression = ".//w:del//w:t"
+                problematic_t_elements = root.xpath(
+                    xpath_expression, namespaces=namespaces
+                )
+                for t_elem in problematic_t_elements:
+                    if t_elem.text:
+                        # Show a preview of the text
+                        text_preview = (
+                            repr(t_elem.text)[:50] + "..."
+                            if len(repr(t_elem.text)) > 50
+                            else repr(t_elem.text)
+                        )
+                        errors.append(
+                            f"  {xml_file.relative_to(self.unpacked_dir)}: "
+                            f"Line {t_elem.sourceline}: <w:t> found within <w:del>: {text_preview}"
+                        )
+
+            except (lxml.etree.XMLSyntaxError, Exception) as e:
+                errors.append(
+                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
+                )
+
+        if errors:
+            print(f"FAILED - Found {len(errors)} deletion validation violations:")
+            for error in errors:
+                print(error)
+            return False
+        else:
+            if self.verbose:
+                print("PASSED - No w:t elements found within w:del elements")
+            return True
+
+    def count_paragraphs_in_unpacked(self):
+        """Count the number of paragraphs in the unpacked document."""
+        count = 0
+
+        for xml_file in self.xml_files:
+            # Only check document.xml files
+            if xml_file.name != "document.xml":
+                continue
+
+            try:
+                root = lxml.etree.parse(str(xml_file)).getroot()
+                # Count all w:p elements
+                paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p")
+                count = len(paragraphs)
+            except Exception as e:
+                print(f"Error counting paragraphs in unpacked document: {e}")
+
+        return count
+
+    def count_paragraphs_in_original(self):
+        """Count the number of paragraphs in the original docx file."""
+        count = 0
+
+        try:
+            # Create temporary directory to unpack original
+            with tempfile.TemporaryDirectory() as temp_dir:
+                # Unpack original docx
+                with zipfile.ZipFile(self.original_file, "r") as zip_ref:
+                    zip_ref.extractall(temp_dir)
+
+                # Parse document.xml
+                doc_xml_path = temp_dir + "/word/document.xml"
+                root = lxml.etree.parse(doc_xml_path).getroot()
+
+                # Count all w:p elements
+                paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p")
+                count = len(paragraphs)
+
+        except Exception as e:
+            print(f"Error counting paragraphs in original document: {e}")
+
+        return count
+
+    def validate_insertions(self):
+        """
+        Validate that w:delText elements are not within w:ins elements.
+        w:delText is only allowed in w:ins if nested within a w:del.
+        """
+        errors = []
+
+        for xml_file in self.xml_files:
+            if xml_file.name != "document.xml":
+                continue
+
+            try:
+                root = lxml.etree.parse(str(xml_file)).getroot()
+                namespaces = {"w": self.WORD_2006_NAMESPACE}
+
+                # Find w:delText in w:ins that are NOT within w:del
+                invalid_elements = root.xpath(
+                    ".//w:ins//w:delText[not(ancestor::w:del)]",
+                    namespaces=namespaces
+                )
+
+                for elem in invalid_elements:
+                    text_preview = (
+                        repr(elem.text or "")[:50] + "..."
+                        if len(repr(elem.text or "")) > 50
+                        else repr(elem.text or "")
+                    )
+                    errors.append(
+                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
+                        f"Line {elem.sourceline}: <w:delText> within <w:ins>: {text_preview}"
+                    )
+
+            except (lxml.etree.XMLSyntaxError, Exception) as e:
+                errors.append(
+                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
+                )
+
+        if errors:
+            print(f"FAILED - Found {len(errors)} insertion validation violations:")
+            for error in errors:
+                print(error)
+            return False
+        else:
+            if self.verbose:
+                print("PASSED - No w:delText elements within w:ins elements")
+            return True
+
+    def compare_paragraph_counts(self):
+        """Compare paragraph counts between original and new document."""
+        original_count = self.count_paragraphs_in_original()
+        new_count = self.count_paragraphs_in_unpacked()
+
+        diff = new_count - original_count
+        diff_str = f"+{diff}" if diff > 0 else str(diff)
+        print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})")
+
+
+if __name__ == "__main__":
+    raise RuntimeError("This module should not be run directly.")