"""
Validator for Word document XML files against XSD schemas.
"""

import re
import tempfile
import zipfile

import lxml.etree

from .base import BaseSchemaValidator

# Matches text whose first or last character is whitespace. \A and \Z anchor to
# the real string boundaries, so text that contains embedded newlines is still
# checked correctly (a `.*\s$` pattern under re.match would miss "a\nb ").
_EDGE_WHITESPACE_RE = re.compile(r"\A\s|\s\Z")

# Longest repr() shown in an error message before it is cut and "..." appended.
_PREVIEW_LIMIT = 50


def _preview_text(text):
    """Return repr(text), truncated to _PREVIEW_LIMIT characters plus "..." if longer."""
    shown = repr(text)
    if len(shown) > _PREVIEW_LIMIT:
        return shown[:_PREVIEW_LIMIT] + "..."
    return shown


def _document_xml_files(xml_files):
    """Yield only the entries of xml_files whose file name is exactly "document.xml"."""
    for xml_file in xml_files:
        if xml_file.name == "document.xml":
            yield xml_file


def _report(errors, label, success_message, verbose):
    """Print the outcome of one check and return True when errors is empty.

    On failure the header "FAILED - Found N <label> violations:" is printed,
    followed by every collected error line. On success, success_message is
    printed only when verbose is truthy.
    """
    if errors:
        print(f"FAILED - Found {len(errors)} {label} violations:")
        for error in errors:
            print(error)
        return False
    if verbose:
        print(success_message)
    return True


class DOCXSchemaValidator(BaseSchemaValidator):
    """Validator for Word document XML files against XSD schemas.

    Adds Word-specific checks (whitespace preservation, tracked deletions and
    insertions, paragraph counting) on top of BaseSchemaValidator.

    NOTE(review): xml_files, unpacked_dir, original_file, verbose,
    XML_NAMESPACE and the validate_* methods not defined below are used here
    but not defined in this file; presumably BaseSchemaValidator provides
    them - confirm against .base.
    """

    # Word-specific namespace
    WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"

    # Word-specific element to relationship type mappings
    # Start with empty mapping - add specific cases as we discover them
    ELEMENT_RELATIONSHIP_TYPES = {}

    def validate(self):
        """Run all validation checks and return True if all pass.

        A failed well-formedness check returns False immediately. Every other
        check is always executed, even after an earlier one fails, so that all
        problems get reported in a single run.
        """
        # Test 0: XML well-formedness
        if not self.validate_xml():
            return False

        checks = (
            self.validate_namespaces,  # Test 1: Namespace declarations
            self.validate_unique_ids,  # Test 2: Unique IDs
            self.validate_file_references,  # Test 3: Relationship and file references
            self.validate_content_types,  # Test 4: Content type declarations
            self.validate_against_xsd,  # Test 5: XSD schema validation
            self.validate_whitespace_preservation,  # Test 6: Whitespace preservation
            self.validate_deletions,  # Test 7: Deletion validation
            self.validate_insertions,  # Test 8: Insertion validation
            self.validate_all_relationship_ids,  # Test 9: Relationship ID references
        )

        # Build a list (not a generator) so that no check is short-circuited.
        results = [bool(check()) for check in checks]

        # Count and compare paragraphs (informational only; never fails validation)
        self.compare_paragraph_counts()

        return all(results)

    def validate_whitespace_preservation(self):
        """
        Validate that w:t elements with whitespace have xml:space='preserve'.

        Only files named document.xml are inspected. A w:t element is reported
        when its text starts or ends with whitespace and its xml:space
        attribute is missing or not equal to "preserve". Returns True when no
        violation (and no parse error) was recorded, False otherwise.
        """
        errors = []
        text_tag = f"{{{self.WORD_2006_NAMESPACE}}}t"

        for xml_file in _document_xml_files(self.xml_files):
            try:
                xml_space_attr = f"{{{self.XML_NAMESPACE}}}space"
                root = lxml.etree.parse(str(xml_file)).getroot()

                for elem in root.iter(text_tag):
                    text = elem.text
                    # Empty / missing text, or no edge whitespace: nothing to preserve
                    if not text or not _EDGE_WHITESPACE_RE.search(text):
                        continue
                    if elem.get(xml_space_attr) == "preserve":
                        continue

                    errors.append(
                        f" {xml_file.relative_to(self.unpacked_dir)}: "
                        f"Line {elem.sourceline}: w:t element with whitespace missing xml:space='preserve': {_preview_text(text)}"
                    )
            except Exception as e:
                # Best-effort boundary: any failure for this file (including
                # lxml.etree.XMLSyntaxError) is reported as a violation
                # instead of aborting the whole validation run.
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        return _report(
            errors,
            "whitespace preservation",
            "PASSED - All whitespace is properly preserved",
            self.verbose,
        )

    def validate_deletions(self):
        """
        Validate that w:t elements are not within w:del elements.
        For some reason, XSD validation does not catch this, so we do it manually.

        Only files named document.xml are inspected, and only w:t elements
        with non-empty text are reported. Returns True when nothing was
        reported, False otherwise.
        """
        errors = []
        namespaces = {"w": self.WORD_2006_NAMESPACE}

        for xml_file in _document_xml_files(self.xml_files):
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()

                # Find all w:t elements that are descendants of w:del elements
                for t_elem in root.xpath(".//w:del//w:t", namespaces=namespaces):
                    if not t_elem.text:
                        continue
                    errors.append(
                        f" {xml_file.relative_to(self.unpacked_dir)}: "
                        f"Line {t_elem.sourceline}: <w:t> found within <w:del>: {_preview_text(t_elem.text)}"
                    )
            except Exception as e:
                # Best-effort boundary: report the failure for this file and move on.
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        return _report(
            errors,
            "deletion validation",
            "PASSED - No w:t elements found within w:del elements",
            self.verbose,
        )

    def count_paragraphs_in_unpacked(self):
        """Count the number of paragraphs in the unpacked document.

        Returns the number of w:p elements in document.xml, or 0 when no such
        file is listed or it cannot be parsed (the error is printed). If
        several files are named document.xml, the last one parsed wins.
        """
        count = 0

        for xml_file in _document_xml_files(self.xml_files):
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()
                # Count all w:p elements
                paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p")
                count = len(paragraphs)
            except Exception as e:
                print(f"Error counting paragraphs in unpacked document: {e}")

        return count

    def count_paragraphs_in_original(self):
        """Count the number of paragraphs in the original docx file.

        The archive at self.original_file is extracted into a temporary
        directory and the w:p elements of word/document.xml are counted.
        Returns 0 when anything goes wrong (the error is printed).
        """
        count = 0

        try:
            # Create temporary directory to unpack original
            with tempfile.TemporaryDirectory() as temp_dir:
                # Unpack original docx
                with zipfile.ZipFile(self.original_file, "r") as zip_ref:
                    zip_ref.extractall(temp_dir)

                # Parse document.xml
                doc_xml_path = temp_dir + "/word/document.xml"
                root = lxml.etree.parse(doc_xml_path).getroot()

                # Count all w:p elements
                paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p")
                count = len(paragraphs)
        except Exception as e:
            print(f"Error counting paragraphs in original document: {e}")

        return count

    def validate_insertions(self):
        """
        Validate that w:delText elements are not within w:ins elements.
        w:delText is only allowed in w:ins if nested within a w:del.

        Only files named document.xml are inspected. Returns True when nothing
        was reported, False otherwise.
        """
        errors = []
        namespaces = {"w": self.WORD_2006_NAMESPACE}

        for xml_file in _document_xml_files(self.xml_files):
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()

                # Find w:delText in w:ins that are NOT within w:del
                invalid_elements = root.xpath(
                    ".//w:ins//w:delText[not(ancestor::w:del)]", namespaces=namespaces
                )

                for elem in invalid_elements:
                    errors.append(
                        f" {xml_file.relative_to(self.unpacked_dir)}: "
                        f"Line {elem.sourceline}: <w:delText> within <w:ins>: {_preview_text(elem.text or '')}"
                    )
            except Exception as e:
                # Best-effort boundary: report the failure for this file and move on.
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        return _report(
            errors,
            "insertion validation",
            "PASSED - No w:delText elements within w:ins elements",
            self.verbose,
        )

    def compare_paragraph_counts(self):
        """Compare paragraph counts between original and new document.

        Prints "Paragraphs: <original> → <new> (<diff>)", where a positive
        difference is shown with a leading "+". Returns None.
        """
        original_count = self.count_paragraphs_in_original()
        new_count = self.count_paragraphs_in_unpacked()

        diff = new_count - original_count
        diff_str = f"+{diff}" if diff > 0 else str(diff)
        print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})")


if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")