Initial commit

skills/pptx/ooxml/scripts/pack.py (new file, 159 lines)
@@ -0,0 +1,159 @@
#!/usr/bin/env python3
"""
Tool to pack a directory into a .docx, .pptx, or .xlsx file with XML formatting undone.

Example usage:
    python pack.py <input_directory> <office_file> [--force]
"""

import argparse
import shutil
import subprocess
import sys
import tempfile
import defusedxml.minidom
import zipfile
from pathlib import Path


def main():
    parser = argparse.ArgumentParser(description="Pack a directory into an Office file")
    parser.add_argument("input_directory", help="Unpacked Office document directory")
    parser.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)")
    parser.add_argument("--force", action="store_true", help="Skip validation")
    args = parser.parse_args()

    try:
        success = pack_document(
            args.input_directory, args.output_file, validate=not args.force
        )

        # Show warning if validation was skipped
        if args.force:
            print("Warning: Skipped validation, file may be corrupt", file=sys.stderr)
        # Exit with error if validation failed
        elif not success:
            print("Contents would produce a corrupt file.", file=sys.stderr)
            print("Please validate XML before repacking.", file=sys.stderr)
            print("Use --force to skip validation and pack anyway.", file=sys.stderr)
            sys.exit(1)

    except ValueError as e:
        sys.exit(f"Error: {e}")


def pack_document(input_dir, output_file, validate=False):
    """Pack a directory into an Office file (.docx/.pptx/.xlsx).

    Args:
        input_dir: Path to unpacked Office document directory
        output_file: Path to output Office file
        validate: If True, validates with soffice (default: False)

    Returns:
        bool: True if successful, False if validation failed
    """
    input_dir = Path(input_dir)
    output_file = Path(output_file)

    if not input_dir.is_dir():
        raise ValueError(f"{input_dir} is not a directory")
    if output_file.suffix.lower() not in {".docx", ".pptx", ".xlsx"}:
        raise ValueError(f"{output_file} must be a .docx, .pptx, or .xlsx file")

    # Work in temporary directory to avoid modifying original
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_content_dir = Path(temp_dir) / "content"
        shutil.copytree(input_dir, temp_content_dir)

        # Process XML files to remove pretty-printing whitespace
        for pattern in ["*.xml", "*.rels"]:
            for xml_file in temp_content_dir.rglob(pattern):
                condense_xml(xml_file)

        # Create final Office file as zip archive
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(output_file, "w", zipfile.ZIP_DEFLATED) as zf:
            for f in temp_content_dir.rglob("*"):
                if f.is_file():
                    zf.write(f, f.relative_to(temp_content_dir))

    # Validate if requested
    if validate:
        if not validate_document(output_file):
            output_file.unlink()  # Delete the corrupt file
            return False

    return True


def validate_document(doc_path):
    """Validate document by converting to HTML with soffice."""
    # Determine the correct filter based on file extension
    match doc_path.suffix.lower():
        case ".docx":
            filter_name = "html:HTML"
        case ".pptx":
            filter_name = "html:impress_html_Export"
        case ".xlsx":
            filter_name = "html:HTML (StarCalc)"

    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            result = subprocess.run(
                [
                    "soffice",
                    "--headless",
                    "--convert-to",
                    filter_name,
                    "--outdir",
                    temp_dir,
                    str(doc_path),
                ],
                capture_output=True,
                timeout=10,
                text=True,
            )
            if not (Path(temp_dir) / f"{doc_path.stem}.html").exists():
                error_msg = result.stderr.strip() or "Document validation failed"
                print(f"Validation error: {error_msg}", file=sys.stderr)
                return False
            return True
        except FileNotFoundError:
            print("Warning: soffice not found. Skipping validation.", file=sys.stderr)
            return True
        except subprocess.TimeoutExpired:
            print("Validation error: Timeout during conversion", file=sys.stderr)
            return False
        except Exception as e:
            print(f"Validation error: {e}", file=sys.stderr)
            return False


def condense_xml(xml_file):
    """Strip unnecessary whitespace and remove comments."""
    with open(xml_file, "r", encoding="utf-8") as f:
        dom = defusedxml.minidom.parse(f)

    # Process each element to remove whitespace and comments
    for element in dom.getElementsByTagName("*"):
        # Skip w:t elements and their processing
        if element.tagName.endswith(":t"):
            continue

        # Remove whitespace-only text nodes and comment nodes
        for child in list(element.childNodes):
            if (
                child.nodeType == child.TEXT_NODE
                and child.nodeValue
                and child.nodeValue.strip() == ""
            ) or child.nodeType == child.COMMENT_NODE:
                element.removeChild(child)

    # Write back the condensed XML
    with open(xml_file, "wb") as f:
        f.write(dom.toxml(encoding="UTF-8"))


if __name__ == "__main__":
    main()
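
Note: a minimal sketch of driving pack_document programmatically instead of via the CLI; the paths are hypothetical, and pack.py is assumed to be importable from the scripts directory:

    from pack import pack_document

    # Hypothetical paths: an unpacked deck directory repacked into a .pptx.
    ok = pack_document("unpacked_deck", "out/deck.pptx", validate=True)
    if not ok:
        raise SystemExit("soffice validation failed; fix the XML and repack")

When validate=True and validation fails, pack_document deletes the freshly written file, so callers only ever see an output file that converted cleanly.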

skills/pptx/ooxml/scripts/unpack.py (new file, 29 lines)
@@ -0,0 +1,29 @@
#!/usr/bin/env python3
"""Unpack and format XML contents of Office files (.docx, .pptx, .xlsx)"""

import random
import sys
import defusedxml.minidom
import zipfile
from pathlib import Path

# Get command line arguments
assert len(sys.argv) == 3, "Usage: python unpack.py <office_file> <output_dir>"
input_file, output_dir = sys.argv[1], sys.argv[2]

# Extract and format
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
zipfile.ZipFile(input_file).extractall(output_path)

# Pretty print all XML files
xml_files = list(output_path.rglob("*.xml")) + list(output_path.rglob("*.rels"))
for xml_file in xml_files:
    content = xml_file.read_text(encoding="utf-8")
    dom = defusedxml.minidom.parseString(content)
    xml_file.write_bytes(dom.toprettyxml(indent="  ", encoding="ascii"))

# For .docx files, suggest an RSID for tracked changes
if input_file.endswith(".docx"):
    suggested_rsid = "".join(random.choices("0123456789ABCDEF", k=8))
    print(f"Suggested RSID for edit session: {suggested_rsid}")

skills/pptx/ooxml/scripts/validate.py (new file, 69 lines)
@@ -0,0 +1,69 @@
#!/usr/bin/env python3
"""
Command line tool to validate Office document XML files against XSD schemas and tracked changes.

Usage:
    python validate.py <dir> --original <original_file>
"""

import argparse
import sys
from pathlib import Path

from validation import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator


def main():
    parser = argparse.ArgumentParser(description="Validate Office document XML files")
    parser.add_argument(
        "unpacked_dir",
        help="Path to unpacked Office document directory",
    )
    parser.add_argument(
        "--original",
        required=True,
        help="Path to original file (.docx/.pptx/.xlsx)",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable verbose output",
    )
    args = parser.parse_args()

    # Validate paths
    unpacked_dir = Path(args.unpacked_dir)
    original_file = Path(args.original)
    file_extension = original_file.suffix.lower()
    assert unpacked_dir.is_dir(), f"Error: {unpacked_dir} is not a directory"
    assert original_file.is_file(), f"Error: {original_file} is not a file"
    assert file_extension in [".docx", ".pptx", ".xlsx"], (
        f"Error: {original_file} must be a .docx, .pptx, or .xlsx file"
    )

    # Select the validators for this file type
    match file_extension:
        case ".docx":
            validators = [DOCXSchemaValidator, RedliningValidator]
        case ".pptx":
            validators = [PPTXSchemaValidator]
        case _:
            print(f"Error: Validation not supported for file type {file_extension}")
            sys.exit(1)

    # Run validators
    success = True
    for V in validators:
        validator = V(unpacked_dir, original_file, verbose=args.verbose)
        if not validator.validate():
            success = False

    if success:
        print("All validations PASSED!")

    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
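
Note: the same validators can be driven without the CLI; a sketch mirroring main() above, with hypothetical paths:

    from pathlib import Path
    from validation import DOCXSchemaValidator, RedliningValidator

    unpacked = Path("unpacked_report")  # hypothetical unpacked .docx directory
    original = Path("report.docx")      # hypothetical original file
    ok = all(
        V(unpacked, original, verbose=True).validate()
        for V in (DOCXSchemaValidator, RedliningValidator)
    )

Each validator prints its own PASSED/FAILED lines, so the boolean is only the aggregate result.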

skills/pptx/ooxml/scripts/validation/__init__.py (new file, 15 lines)
@@ -0,0 +1,15 @@
"""
Validation modules for Office document processing.
"""

from .base import BaseSchemaValidator
from .docx import DOCXSchemaValidator
from .pptx import PPTXSchemaValidator
from .redlining import RedliningValidator

__all__ = [
    "BaseSchemaValidator",
    "DOCXSchemaValidator",
    "PPTXSchemaValidator",
    "RedliningValidator",
]

skills/pptx/ooxml/scripts/validation/base.py (new file, 951 lines)
@@ -0,0 +1,951 @@
"""
Base validator with common validation logic for document files.
"""

import re
from pathlib import Path

import lxml.etree


class BaseSchemaValidator:
    """Base validator with common validation logic for document files."""

    # Elements whose 'id' attributes must be unique within their file
    # Format: element_name -> (attribute_name, scope)
    # scope can be 'file' (unique within file) or 'global' (unique across all files)
    UNIQUE_ID_REQUIREMENTS = {
        # Word elements
        "comment": ("id", "file"),  # Comment IDs in comments.xml
        "commentrangestart": ("id", "file"),  # Must match comment IDs
        "commentrangeend": ("id", "file"),  # Must match comment IDs
        "bookmarkstart": ("id", "file"),  # Bookmark start IDs
        "bookmarkend": ("id", "file"),  # Bookmark end IDs
        # Note: ins and del (track changes) can share IDs when part of same revision
        # PowerPoint elements
        "sldid": ("id", "file"),  # Slide IDs in presentation.xml
        "sldmasterid": ("id", "global"),  # Slide master IDs must be globally unique
        "sldlayoutid": ("id", "global"),  # Slide layout IDs must be globally unique
        "cm": ("authorid", "file"),  # Comment author IDs
        # Excel elements
        "sheet": ("sheetid", "file"),  # Sheet IDs in workbook.xml
        "definedname": ("id", "file"),  # Named range IDs
        # Drawing/Shape elements (all formats)
        "cxnsp": ("id", "file"),  # Connection shape IDs
        "sp": ("id", "file"),  # Shape IDs
        "pic": ("id", "file"),  # Picture IDs
        "grpsp": ("id", "file"),  # Group shape IDs
    }

    # Mapping of element names to expected relationship types
    # Subclasses should override this with format-specific mappings
    ELEMENT_RELATIONSHIP_TYPES = {}

    # Unified schema mappings for all Office document types
    SCHEMA_MAPPINGS = {
        # Document type specific schemas
        "word": "ISO-IEC29500-4_2016/wml.xsd",  # Word documents
        "ppt": "ISO-IEC29500-4_2016/pml.xsd",  # PowerPoint presentations
        "xl": "ISO-IEC29500-4_2016/sml.xsd",  # Excel spreadsheets
        # Common file types
        "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd",
        "app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd",
        "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd",
        "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd",
        ".rels": "ecma/fouth-edition/opc-relationships.xsd",
        # Word-specific files
        "people.xml": "microsoft/wml-2012.xsd",
        "commentsIds.xml": "microsoft/wml-cid-2016.xsd",
        "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd",
        "commentsExtended.xml": "microsoft/wml-2012.xsd",
        # Chart files (common across document types)
        "chart": "ISO-IEC29500-4_2016/dml-chart.xsd",
        # Theme files (common across document types)
        "theme": "ISO-IEC29500-4_2016/dml-main.xsd",
        # Drawing and media files
        "drawing": "ISO-IEC29500-4_2016/dml-main.xsd",
    }

    # Unified namespace constants
    MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006"
    XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"

    # Common OOXML namespaces used across validators
    PACKAGE_RELATIONSHIPS_NAMESPACE = (
        "http://schemas.openxmlformats.org/package/2006/relationships"
    )
    OFFICE_RELATIONSHIPS_NAMESPACE = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
    )
    CONTENT_TYPES_NAMESPACE = (
        "http://schemas.openxmlformats.org/package/2006/content-types"
    )

    # Folders where we should clean ignorable namespaces
    MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"}

    # All allowed OOXML namespaces (superset of all document types)
    OOXML_NAMESPACES = {
        "http://schemas.openxmlformats.org/officeDocument/2006/math",
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
        "http://schemas.openxmlformats.org/schemaLibrary/2006/main",
        "http://schemas.openxmlformats.org/drawingml/2006/main",
        "http://schemas.openxmlformats.org/drawingml/2006/chart",
        "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing",
        "http://schemas.openxmlformats.org/drawingml/2006/diagram",
        "http://schemas.openxmlformats.org/drawingml/2006/picture",
        "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing",
        "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
        "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
        "http://schemas.openxmlformats.org/presentationml/2006/main",
        "http://schemas.openxmlformats.org/spreadsheetml/2006/main",
        "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes",
        "http://www.w3.org/XML/1998/namespace",
    }

    def __init__(self, unpacked_dir, original_file, verbose=False):
        self.unpacked_dir = Path(unpacked_dir).resolve()
        self.original_file = Path(original_file)
        self.verbose = verbose

        # Set schemas directory
        self.schemas_dir = Path(__file__).parent.parent.parent / "schemas"

        # Get all XML and .rels files
        patterns = ["*.xml", "*.rels"]
        self.xml_files = [
            f for pattern in patterns for f in self.unpacked_dir.rglob(pattern)
        ]

        if not self.xml_files:
            print(f"Warning: No XML files found in {self.unpacked_dir}")

    def validate(self):
        """Run all validation checks and return True if all pass."""
        raise NotImplementedError("Subclasses must implement the validate method")

    def validate_xml(self):
        """Validate that all XML files are well-formed."""
        errors = []

        for xml_file in self.xml_files:
            try:
                # Try to parse the XML file
                lxml.etree.parse(str(xml_file))
            except lxml.etree.XMLSyntaxError as e:
                errors.append(
                    f"  {xml_file.relative_to(self.unpacked_dir)}: "
                    f"Line {e.lineno}: {e.msg}"
                )
            except Exception as e:
                errors.append(
                    f"  {xml_file.relative_to(self.unpacked_dir)}: "
                    f"Unexpected error: {str(e)}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} XML violations:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print("PASSED - All XML files are well-formed")
            return True

    def validate_namespaces(self):
        """Validate that namespace prefixes in Ignorable attributes are declared."""
        errors = []

        for xml_file in self.xml_files:
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()
                declared = set(root.nsmap.keys()) - {None}  # Exclude default namespace

                for attr_val in [
                    v for k, v in root.attrib.items() if k.endswith("Ignorable")
                ]:
                    undeclared = set(attr_val.split()) - declared
                    errors.extend(
                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
                        f"Namespace '{ns}' in Ignorable but not declared"
                        for ns in undeclared
                    )
            except lxml.etree.XMLSyntaxError:
                continue

        if errors:
            print(f"FAILED - {len(errors)} namespace issues:")
            for error in errors:
                print(error)
            return False
        if self.verbose:
            print("PASSED - All namespace prefixes properly declared")
        return True

    def validate_unique_ids(self):
        """Validate that specific IDs are unique according to OOXML requirements."""
        errors = []
        global_ids = {}  # Track globally unique IDs across all files

        for xml_file in self.xml_files:
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()
                file_ids = {}  # Track IDs that must be unique within this file

                # Remove all mc:AlternateContent elements from the tree
                mc_elements = root.xpath(
                    ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE}
                )
                for elem in mc_elements:
                    elem.getparent().remove(elem)

                # Now check IDs in the cleaned tree
                for elem in root.iter():
                    # Get the element name without namespace
                    tag = (
                        elem.tag.split("}")[-1].lower()
                        if "}" in elem.tag
                        else elem.tag.lower()
                    )

                    # Check if this element type has ID uniqueness requirements
                    if tag in self.UNIQUE_ID_REQUIREMENTS:
                        attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag]

                        # Look for the specified attribute
                        id_value = None
                        for attr, value in elem.attrib.items():
                            attr_local = (
                                attr.split("}")[-1].lower()
                                if "}" in attr
                                else attr.lower()
                            )
                            if attr_local == attr_name:
                                id_value = value
                                break

                        if id_value is not None:
                            if scope == "global":
                                # Check global uniqueness
                                if id_value in global_ids:
                                    prev_file, prev_line, prev_tag = global_ids[
                                        id_value
                                    ]
                                    errors.append(
                                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
                                        f"Line {elem.sourceline}: Global ID '{id_value}' in <{tag}> "
                                        f"already used in {prev_file} at line {prev_line} in <{prev_tag}>"
                                    )
                                else:
                                    global_ids[id_value] = (
                                        xml_file.relative_to(self.unpacked_dir),
                                        elem.sourceline,
                                        tag,
                                    )
                            elif scope == "file":
                                # Check file-level uniqueness
                                key = (tag, attr_name)
                                if key not in file_ids:
                                    file_ids[key] = {}

                                if id_value in file_ids[key]:
                                    prev_line = file_ids[key][id_value]
                                    errors.append(
                                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
                                        f"Line {elem.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> "
                                        f"(first occurrence at line {prev_line})"
                                    )
                                else:
                                    file_ids[key][id_value] = elem.sourceline

            except Exception as e:
                errors.append(
                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} ID uniqueness violations:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print("PASSED - All required IDs are unique")
            return True

    def validate_file_references(self):
        """
        Validate that all .rels files properly reference files and that all files are referenced.
        """
        errors = []

        # Find all .rels files
        rels_files = list(self.unpacked_dir.rglob("*.rels"))

        if not rels_files:
            if self.verbose:
                print("PASSED - No .rels files found")
            return True

        # Get all files in the unpacked directory, excluding bookkeeping files
        # ([Content_Types].xml and the .rels files themselves are never
        # referenced by a .rels file)
        all_files = []
        for file_path in self.unpacked_dir.rglob("*"):
            if (
                file_path.is_file()
                and file_path.name != "[Content_Types].xml"
                and not file_path.name.endswith(".rels")
            ):
                all_files.append(file_path.resolve())

        # Track all files that are referenced by any .rels file
        all_referenced_files = set()

        if self.verbose:
            print(
                f"Found {len(rels_files)} .rels files and {len(all_files)} target files"
            )

        # Check each .rels file
        for rels_file in rels_files:
            try:
                # Parse relationships file
                rels_root = lxml.etree.parse(str(rels_file)).getroot()

                # Get the directory where this .rels file is located
                rels_dir = rels_file.parent

                # Find all relationships and their targets
                referenced_files = set()
                broken_refs = []

                for rel in rels_root.findall(
                    ".//ns:Relationship",
                    namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE},
                ):
                    target = rel.get("Target")
                    if target and not target.startswith(
                        ("http", "mailto:")
                    ):  # Skip external URLs
                        # Resolve the target path relative to the .rels file location
                        if rels_file.name == ".rels":
                            # Root .rels file - targets are relative to unpacked_dir
                            target_path = self.unpacked_dir / target
                        else:
                            # Other .rels files - targets are relative to their parent's parent
                            # e.g., word/_rels/document.xml.rels -> targets relative to word/
                            base_dir = rels_dir.parent
                            target_path = base_dir / target

                        # Normalize the path and check if it exists
                        try:
                            target_path = target_path.resolve()
                            if target_path.exists() and target_path.is_file():
                                referenced_files.add(target_path)
                                all_referenced_files.add(target_path)
                            else:
                                broken_refs.append((target, rel.sourceline))
                        except (OSError, ValueError):
                            broken_refs.append((target, rel.sourceline))

                # Report broken references
                if broken_refs:
                    rel_path = rels_file.relative_to(self.unpacked_dir)
                    for broken_ref, line_num in broken_refs:
                        errors.append(
                            f"  {rel_path}: Line {line_num}: Broken reference to {broken_ref}"
                        )

            except Exception as e:
                rel_path = rels_file.relative_to(self.unpacked_dir)
                errors.append(f"  Error parsing {rel_path}: {e}")

        # Check for unreferenced files (files that exist but are not referenced anywhere)
        unreferenced_files = set(all_files) - all_referenced_files

        if unreferenced_files:
            for unref_file in sorted(unreferenced_files):
                unref_rel_path = unref_file.relative_to(self.unpacked_dir)
                errors.append(f"  Unreferenced file: {unref_rel_path}")

        if errors:
            print(f"FAILED - Found {len(errors)} relationship validation errors:")
            for error in errors:
                print(error)
            print(
                "CRITICAL: These errors will cause the document to appear corrupt. "
                "Broken references MUST be fixed, "
                "and unreferenced files MUST be referenced or removed."
            )
            return False
        else:
            if self.verbose:
                print(
                    "PASSED - All references are valid and all files are properly referenced"
                )
            return True

    def validate_all_relationship_ids(self):
        """
        Validate that all r:id attributes in XML files reference existing IDs
        in their corresponding .rels files, and optionally validate relationship types.
        """
        errors = []

        # Process each XML file that might contain r:id references
        for xml_file in self.xml_files:
            # Skip .rels files themselves
            if xml_file.suffix == ".rels":
                continue

            # Determine the corresponding .rels file
            # For dir/file.xml, it's dir/_rels/file.xml.rels
            rels_dir = xml_file.parent / "_rels"
            rels_file = rels_dir / f"{xml_file.name}.rels"

            # Skip if there's no corresponding .rels file (that's okay)
            if not rels_file.exists():
                continue

            try:
                # Parse the .rels file to get valid relationship IDs and their types
                rels_root = lxml.etree.parse(str(rels_file)).getroot()
                rid_to_type = {}

                for rel in rels_root.findall(
                    f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                ):
                    rid = rel.get("Id")
                    rel_type = rel.get("Type", "")
                    if rid:
                        # Check for duplicate rIds
                        if rid in rid_to_type:
                            rels_rel_path = rels_file.relative_to(self.unpacked_dir)
                            errors.append(
                                f"  {rels_rel_path}: Line {rel.sourceline}: "
                                f"Duplicate relationship ID '{rid}' (IDs must be unique)"
                            )
                        # Extract just the type name from the full URL
                        type_name = (
                            rel_type.split("/")[-1] if "/" in rel_type else rel_type
                        )
                        rid_to_type[rid] = type_name

                # Parse the XML file to find all r:id references
                xml_root = lxml.etree.parse(str(xml_file)).getroot()

                # Find all elements with r:id attributes
                for elem in xml_root.iter():
                    # Check for r:id attribute (relationship ID)
                    rid_attr = elem.get(f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id")
                    if rid_attr:
                        xml_rel_path = xml_file.relative_to(self.unpacked_dir)
                        elem_name = (
                            elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag
                        )

                        # Check if the ID exists
                        if rid_attr not in rid_to_type:
                            errors.append(
                                f"  {xml_rel_path}: Line {elem.sourceline}: "
                                f"<{elem_name}> references non-existent relationship '{rid_attr}' "
                                f"(valid IDs: {', '.join(sorted(rid_to_type.keys())[:5])}{'...' if len(rid_to_type) > 5 else ''})"
                            )
                        # Check if we have type expectations for this element
                        elif self.ELEMENT_RELATIONSHIP_TYPES:
                            expected_type = self._get_expected_relationship_type(
                                elem_name
                            )
                            if expected_type:
                                actual_type = rid_to_type[rid_attr]
                                # Check if the actual type matches or contains the expected type
                                if expected_type not in actual_type.lower():
                                    errors.append(
                                        f"  {xml_rel_path}: Line {elem.sourceline}: "
                                        f"<{elem_name}> references '{rid_attr}' which points to '{actual_type}' "
                                        f"but should point to a '{expected_type}' relationship"
                                    )

            except Exception as e:
                xml_rel_path = xml_file.relative_to(self.unpacked_dir)
                errors.append(f"  Error processing {xml_rel_path}: {e}")

        if errors:
            print(f"FAILED - Found {len(errors)} relationship ID reference errors:")
            for error in errors:
                print(error)
            print("\nThese ID mismatches will cause the document to appear corrupt!")
            return False
        else:
            if self.verbose:
                print("PASSED - All relationship ID references are valid")
            return True

    def _get_expected_relationship_type(self, element_name):
        """
        Get the expected relationship type for an element.
        First checks the explicit mapping, then tries pattern detection.
        """
        # Normalize element name to lowercase
        elem_lower = element_name.lower()

        # Check explicit mapping first
        if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES:
            return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower]

        # Try pattern detection for common patterns
        # Pattern 1: Elements ending in "Id" often expect a relationship of the prefix type
        if elem_lower.endswith("id") and len(elem_lower) > 2:
            # e.g., "sldId" -> "sld", "sldMasterId" -> "sldMaster"
            prefix = elem_lower[:-2]  # Remove "id"
            # Check if this might be a compound like "sldMasterId"
            if prefix.endswith("master"):
                return prefix.lower()
            elif prefix.endswith("layout"):
                return prefix.lower()
            else:
                # Simple case like "sldId" -> "slide"
                # Common transformations
                if prefix == "sld":
                    return "slide"
                return prefix.lower()

        # Pattern 2: Elements ending in "Reference" expect a relationship of the prefix type
        if elem_lower.endswith("reference") and len(elem_lower) > 9:
            prefix = elem_lower[:-9]  # Remove "reference"
            return prefix.lower()

        return None

    def validate_content_types(self):
        """Validate that all content files are properly declared in [Content_Types].xml."""
        errors = []

        # Find [Content_Types].xml file
        content_types_file = self.unpacked_dir / "[Content_Types].xml"
        if not content_types_file.exists():
            print("FAILED - [Content_Types].xml file not found")
            return False

        try:
            # Parse and get all declared parts and extensions
            root = lxml.etree.parse(str(content_types_file)).getroot()
            declared_parts = set()
            declared_extensions = set()

            # Get Override declarations (specific files)
            for override in root.findall(
                f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override"
            ):
                part_name = override.get("PartName")
                if part_name is not None:
                    declared_parts.add(part_name.lstrip("/"))

            # Get Default declarations (by extension)
            for default in root.findall(
                f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default"
            ):
                extension = default.get("Extension")
                if extension is not None:
                    declared_extensions.add(extension.lower())

            # Root elements that require content type declaration
            declarable_roots = {
                "sld",
                "sldLayout",
                "sldMaster",
                "presentation",  # PowerPoint
                "document",  # Word
                "workbook",
                "worksheet",  # Excel
                "theme",  # Common
            }

            # Common media file extensions that should be declared
            media_extensions = {
                "png": "image/png",
                "jpg": "image/jpeg",
                "jpeg": "image/jpeg",
                "gif": "image/gif",
                "bmp": "image/bmp",
                "tiff": "image/tiff",
                "wmf": "image/x-wmf",
                "emf": "image/x-emf",
            }

            # Get all files in the unpacked directory
            all_files = list(self.unpacked_dir.rglob("*"))
            all_files = [f for f in all_files if f.is_file()]

            # Check all XML files for Override declarations
            for xml_file in self.xml_files:
                path_str = str(xml_file.relative_to(self.unpacked_dir)).replace(
                    "\\", "/"
                )

                # Skip non-content files
                if any(
                    skip in path_str
                    for skip in [".rels", "[Content_Types]", "docProps/", "_rels/"]
                ):
                    continue

                try:
                    root_tag = lxml.etree.parse(str(xml_file)).getroot().tag
                    root_name = root_tag.split("}")[-1] if "}" in root_tag else root_tag

                    if root_name in declarable_roots and path_str not in declared_parts:
                        errors.append(
                            f"  {path_str}: File with <{root_name}> root not declared in [Content_Types].xml"
                        )

                except Exception:
                    continue  # Skip unparseable files

            # Check all non-XML files for Default extension declarations
            for file_path in all_files:
                # Skip XML files and metadata files (already checked above)
                if file_path.suffix.lower() in {".xml", ".rels"}:
                    continue
                if file_path.name == "[Content_Types].xml":
                    continue
                if "_rels" in file_path.parts or "docProps" in file_path.parts:
                    continue

                extension = file_path.suffix.lstrip(".").lower()
                if extension and extension not in declared_extensions:
                    # Check if it's a known media extension that should be declared
                    if extension in media_extensions:
                        relative_path = file_path.relative_to(self.unpacked_dir)
                        errors.append(
                            f"  {relative_path}: File with extension '{extension}' not declared in "
                            f"[Content_Types].xml - should add: "
                            f'<Default Extension="{extension}" ContentType="{media_extensions[extension]}"/>'
                        )

        except Exception as e:
            errors.append(f"  Error parsing [Content_Types].xml: {e}")

        if errors:
            print(f"FAILED - Found {len(errors)} content type declaration errors:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print(
                    "PASSED - All content files are properly declared in [Content_Types].xml"
                )
            return True

    def validate_file_against_xsd(self, xml_file, verbose=False):
        """Validate a single XML file against XSD schema, comparing with original.

        Args:
            xml_file: Path to XML file to validate
            verbose: Enable verbose output

        Returns:
            tuple: (is_valid, new_errors_set) where is_valid is True/False/None (skipped)
        """
        # Resolve both paths to handle symlinks
        xml_file = Path(xml_file).resolve()
        unpacked_dir = self.unpacked_dir.resolve()

        # Validate current file
        is_valid, current_errors = self._validate_single_file_xsd(
            xml_file, unpacked_dir
        )

        if is_valid is None:
            return None, set()  # Skipped
        elif is_valid:
            return True, set()  # Valid, no errors

        # Get errors from original file for this specific file
        original_errors = self._get_original_file_errors(xml_file)

        # Compare with original (both are guaranteed to be sets here)
        assert current_errors is not None
        new_errors = current_errors - original_errors

        if new_errors:
            if verbose:
                relative_path = xml_file.relative_to(unpacked_dir)
                print(f"FAILED - {relative_path}: {len(new_errors)} new error(s)")
                for error in list(new_errors)[:3]:
                    truncated = error[:250] + "..." if len(error) > 250 else error
                    print(f"  - {truncated}")
            return False, new_errors
        else:
            # All errors existed in original
            if verbose:
                print(
                    f"PASSED - No new errors (original had {len(current_errors)} errors)"
                )
            return True, set()

    def validate_against_xsd(self):
        """Validate XML files against XSD schemas, showing only new errors compared to original."""
        new_errors = []
        files_with_new_errors = 0
        original_error_count = 0
        valid_count = 0
        skipped_count = 0

        for xml_file in self.xml_files:
            relative_path = str(xml_file.relative_to(self.unpacked_dir))
            is_valid, new_file_errors = self.validate_file_against_xsd(
                xml_file, verbose=False
            )

            if is_valid is None:
                skipped_count += 1
                continue
            elif is_valid and not new_file_errors:
                valid_count += 1
                continue
            elif is_valid:
                # Had errors but all existed in original
                original_error_count += 1
                valid_count += 1
                continue

            # Has new errors
            files_with_new_errors += 1
            new_errors.append(f"  {relative_path}: {len(new_file_errors)} new error(s)")
            for error in list(new_file_errors)[:3]:  # Show first 3 errors
                new_errors.append(
                    f"    - {error[:250]}..." if len(error) > 250 else f"    - {error}"
                )

        # Print summary
        if self.verbose:
            print(f"Validated {len(self.xml_files)} files:")
            print(f"  - Valid: {valid_count}")
            print(f"  - Skipped (no schema): {skipped_count}")
            if original_error_count:
                print(f"  - With original errors (ignored): {original_error_count}")
            print(f"  - With NEW errors: {files_with_new_errors}")

        if new_errors:
            print("\nFAILED - Found NEW validation errors:")
            for error in new_errors:
                print(error)
            return False
        else:
            if self.verbose:
                print("\nPASSED - No new XSD validation errors introduced")
            return True

    def _get_schema_path(self, xml_file):
        """Determine the appropriate schema path for an XML file."""
        # Check exact filename match
        if xml_file.name in self.SCHEMA_MAPPINGS:
            return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name]

        # Check .rels files
        if xml_file.suffix == ".rels":
            return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"]

        # Check chart files
        if "charts/" in str(xml_file) and xml_file.name.startswith("chart"):
            return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"]

        # Check theme files
        if "theme/" in str(xml_file) and xml_file.name.startswith("theme"):
            return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"]

        # Check if file is in a main content folder and use appropriate schema
        if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS:
            return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name]

        return None

    def _clean_ignorable_namespaces(self, xml_doc):
        """Remove attributes and elements not in allowed namespaces."""
        # Create a clean copy
        xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
        xml_copy = lxml.etree.fromstring(xml_string)

        # Remove attributes not in allowed namespaces
        for elem in xml_copy.iter():
            attrs_to_remove = []

            for attr in elem.attrib:
                # Check if attribute is from a namespace other than allowed ones
                if "{" in attr:
                    ns = attr.split("}")[0][1:]
                    if ns not in self.OOXML_NAMESPACES:
                        attrs_to_remove.append(attr)

            # Remove collected attributes
            for attr in attrs_to_remove:
                del elem.attrib[attr]

        # Remove elements not in allowed namespaces
        self._remove_ignorable_elements(xml_copy)

        return lxml.etree.ElementTree(xml_copy)

    def _remove_ignorable_elements(self, root):
        """Recursively remove all elements not in allowed namespaces."""
        elements_to_remove = []

        # Find elements to remove
        for elem in list(root):
            # Skip non-element nodes (comments, processing instructions, etc.)
            if not hasattr(elem, "tag") or callable(elem.tag):
                continue

            tag_str = str(elem.tag)
            if tag_str.startswith("{"):
                ns = tag_str.split("}")[0][1:]
                if ns not in self.OOXML_NAMESPACES:
                    elements_to_remove.append(elem)
                    continue

            # Recursively clean child elements
            self._remove_ignorable_elements(elem)

        # Remove collected elements
        for elem in elements_to_remove:
            root.remove(elem)

    def _preprocess_for_mc_ignorable(self, xml_doc):
        """Preprocess XML to handle mc:Ignorable attribute properly."""
        # Remove mc:Ignorable attributes before validation
        root = xml_doc.getroot()

        # Remove mc:Ignorable attribute from root
        if f"{{{self.MC_NAMESPACE}}}Ignorable" in root.attrib:
            del root.attrib[f"{{{self.MC_NAMESPACE}}}Ignorable"]

        return xml_doc

    def _validate_single_file_xsd(self, xml_file, base_path):
        """Validate a single XML file against XSD schema. Returns (is_valid, errors_set)."""
        schema_path = self._get_schema_path(xml_file)
        if not schema_path:
            return None, None  # Skip file

        try:
            # Load schema
            with open(schema_path, "rb") as xsd_file:
                parser = lxml.etree.XMLParser()
                xsd_doc = lxml.etree.parse(
                    xsd_file, parser=parser, base_url=str(schema_path)
                )
                schema = lxml.etree.XMLSchema(xsd_doc)

            # Load and preprocess XML
            with open(xml_file, "r") as f:
                xml_doc = lxml.etree.parse(f)

            xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc)
            xml_doc = self._preprocess_for_mc_ignorable(xml_doc)

            # Clean ignorable namespaces if needed
            relative_path = xml_file.relative_to(base_path)
            if (
                relative_path.parts
                and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS
            ):
                xml_doc = self._clean_ignorable_namespaces(xml_doc)

            # Validate
            if schema.validate(xml_doc):
                return True, set()
            else:
                errors = set()
                for error in schema.error_log:
                    # Store normalized error message (without line numbers for comparison)
                    errors.add(error.message)
                return False, errors

        except Exception as e:
            return False, {str(e)}

    def _get_original_file_errors(self, xml_file):
        """Get XSD validation errors from a single file in the original document.

        Args:
            xml_file: Path to the XML file in unpacked_dir to check

        Returns:
            set: Set of error messages from the original file
        """
        import tempfile
        import zipfile

        # Resolve both paths to handle symlinks (e.g., /var vs /private/var on macOS)
        xml_file = Path(xml_file).resolve()
        unpacked_dir = self.unpacked_dir.resolve()
        relative_path = xml_file.relative_to(unpacked_dir)

        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Extract original file
            with zipfile.ZipFile(self.original_file, "r") as zip_ref:
                zip_ref.extractall(temp_path)

            # Find corresponding file in original
            original_xml_file = temp_path / relative_path

            if not original_xml_file.exists():
                # File didn't exist in original, so no original errors
                return set()

            # Validate the specific file in original
            is_valid, errors = self._validate_single_file_xsd(
                original_xml_file, temp_path
            )
            return errors if errors else set()

    def _remove_template_tags_from_text_nodes(self, xml_doc):
        """Remove template tags from XML text nodes and collect warnings.

        Template tags follow the pattern {{ ... }} and are used as placeholders
        for content replacement. They should be removed from text content before
        XSD validation while preserving XML structure.

        Returns:
            tuple: (cleaned_xml_doc, warnings_list)
        """
        warnings = []
        template_pattern = re.compile(r"\{\{[^}]*\}\}")

        # Create a copy of the document to avoid modifying the original
        xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
        xml_copy = lxml.etree.fromstring(xml_string)

        def process_text_content(text, content_type):
            if not text:
                return text
            matches = list(template_pattern.finditer(text))
            if matches:
                for match in matches:
                    warnings.append(
                        f"Found template tag in {content_type}: {match.group()}"
                    )
                return template_pattern.sub("", text)
            return text

        # Process all text nodes in the document
        for elem in xml_copy.iter():
            # Skip non-element nodes (comments, processing instructions, etc.)
            if not hasattr(elem, "tag") or callable(elem.tag):
                continue
            # Skip processing if this is a w:t (or other *:t) element
            tag_str = str(elem.tag)
            if tag_str.endswith("}t") or tag_str == "t":
                continue

            elem.text = process_text_content(elem.text, "text content")
            elem.tail = process_text_content(elem.tail, "tail content")

        return lxml.etree.ElementTree(xml_copy), warnings


if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")
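
Note: the target-resolution rule in validate_file_references is the easiest part to get wrong, so a worked example of what the code does (paths hypothetical). For word/_rels/document.xml.rels, targets resolve against the parent of the _rels directory:

    from pathlib import Path

    rels_file = Path("word/_rels/document.xml.rels")  # hypothetical part
    base_dir = rels_file.parent.parent                # -> word/
    print(base_dir / "media/image1.png")              # -> word/media/image1.png

For the package-root .rels, targets resolve against the package root instead.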

skills/pptx/ooxml/scripts/validation/docx.py (new file, 274 lines)
@@ -0,0 +1,274 @@
"""
Validator for Word document XML files against XSD schemas.
"""

import re
import tempfile
import zipfile

import lxml.etree

from .base import BaseSchemaValidator


class DOCXSchemaValidator(BaseSchemaValidator):
    """Validator for Word document XML files against XSD schemas."""

    # Word-specific namespace
    WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"

    # Word-specific element to relationship type mappings
    # Start with empty mapping - add specific cases as we discover them
    ELEMENT_RELATIONSHIP_TYPES = {}

    def validate(self):
        """Run all validation checks and return True if all pass."""
        # Test 0: XML well-formedness
        if not self.validate_xml():
            return False

        # Test 1: Namespace declarations
        all_valid = True
        if not self.validate_namespaces():
            all_valid = False

        # Test 2: Unique IDs
        if not self.validate_unique_ids():
            all_valid = False

        # Test 3: Relationship and file reference validation
        if not self.validate_file_references():
            all_valid = False

        # Test 4: Content type declarations
        if not self.validate_content_types():
            all_valid = False

        # Test 5: XSD schema validation
        if not self.validate_against_xsd():
            all_valid = False

        # Test 6: Whitespace preservation
        if not self.validate_whitespace_preservation():
            all_valid = False

        # Test 7: Deletion validation
        if not self.validate_deletions():
            all_valid = False

        # Test 8: Insertion validation
        if not self.validate_insertions():
            all_valid = False

        # Test 9: Relationship ID reference validation
        if not self.validate_all_relationship_ids():
            all_valid = False

        # Count and compare paragraphs
        self.compare_paragraph_counts()

        return all_valid

    def validate_whitespace_preservation(self):
        """
        Validate that w:t elements with leading or trailing whitespace have xml:space='preserve'.
        """
        errors = []

        for xml_file in self.xml_files:
            # Only check document.xml files
            if xml_file.name != "document.xml":
                continue

            try:
                root = lxml.etree.parse(str(xml_file)).getroot()

                # Find all w:t elements
                for elem in root.iter(f"{{{self.WORD_2006_NAMESPACE}}}t"):
                    if elem.text:
                        text = elem.text
                        # Check if text starts or ends with whitespace
                        if re.match(r"^\s.*", text) or re.match(r".*\s$", text):
                            # Check if xml:space="preserve" attribute exists
                            xml_space_attr = f"{{{self.XML_NAMESPACE}}}space"
                            if (
                                xml_space_attr not in elem.attrib
                                or elem.attrib[xml_space_attr] != "preserve"
                            ):
                                # Show a preview of the text
                                text_preview = (
                                    repr(text)[:50] + "..."
                                    if len(repr(text)) > 50
                                    else repr(text)
                                )
                                errors.append(
                                    f"  {xml_file.relative_to(self.unpacked_dir)}: "
                                    f"Line {elem.sourceline}: w:t element with whitespace missing xml:space='preserve': {text_preview}"
                                )

            except Exception as e:
                errors.append(
                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} whitespace preservation violations:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print("PASSED - All whitespace is properly preserved")
            return True

    def validate_deletions(self):
        """
        Validate that w:t elements are not within w:del elements.
        For some reason, XSD validation does not catch this, so we do it manually.
        """
        errors = []

        for xml_file in self.xml_files:
            # Only check document.xml files
            if xml_file.name != "document.xml":
                continue

            try:
                root = lxml.etree.parse(str(xml_file)).getroot()

                # Find all w:t elements that are descendants of w:del elements
                namespaces = {"w": self.WORD_2006_NAMESPACE}
                xpath_expression = ".//w:del//w:t"
                problematic_t_elements = root.xpath(
                    xpath_expression, namespaces=namespaces
                )
                for t_elem in problematic_t_elements:
                    if t_elem.text:
                        # Show a preview of the text
                        text_preview = (
                            repr(t_elem.text)[:50] + "..."
                            if len(repr(t_elem.text)) > 50
                            else repr(t_elem.text)
                        )
                        errors.append(
                            f"  {xml_file.relative_to(self.unpacked_dir)}: "
                            f"Line {t_elem.sourceline}: <w:t> found within <w:del>: {text_preview}"
                        )

            except Exception as e:
                errors.append(
                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} deletion validation violations:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print("PASSED - No w:t elements found within w:del elements")
            return True

    def count_paragraphs_in_unpacked(self):
        """Count the number of paragraphs in the unpacked document."""
        count = 0

        for xml_file in self.xml_files:
            # Only check document.xml files
            if xml_file.name != "document.xml":
                continue

            try:
                root = lxml.etree.parse(str(xml_file)).getroot()
                # Count all w:p elements
                paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p")
                count = len(paragraphs)
            except Exception as e:
                print(f"Error counting paragraphs in unpacked document: {e}")

        return count

    def count_paragraphs_in_original(self):
        """Count the number of paragraphs in the original docx file."""
        count = 0

        try:
            # Create temporary directory to unpack original
            with tempfile.TemporaryDirectory() as temp_dir:
                # Unpack original docx
                with zipfile.ZipFile(self.original_file, "r") as zip_ref:
                    zip_ref.extractall(temp_dir)

                # Parse document.xml
                doc_xml_path = temp_dir + "/word/document.xml"
                root = lxml.etree.parse(doc_xml_path).getroot()

                # Count all w:p elements
                paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p")
                count = len(paragraphs)

        except Exception as e:
            print(f"Error counting paragraphs in original document: {e}")

        return count

    def validate_insertions(self):
        """
        Validate that w:delText elements are not within w:ins elements.
        w:delText is only allowed in w:ins if nested within a w:del.
        """
        errors = []

        for xml_file in self.xml_files:
            if xml_file.name != "document.xml":
                continue

            try:
                root = lxml.etree.parse(str(xml_file)).getroot()
                namespaces = {"w": self.WORD_2006_NAMESPACE}

                # Find w:delText in w:ins that are NOT within w:del
                invalid_elements = root.xpath(
                    ".//w:ins//w:delText[not(ancestor::w:del)]",
                    namespaces=namespaces,
                )

                for elem in invalid_elements:
                    text_preview = (
                        repr(elem.text or "")[:50] + "..."
                        if len(repr(elem.text or "")) > 50
                        else repr(elem.text or "")
                    )
                    errors.append(
                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
                        f"Line {elem.sourceline}: <w:delText> within <w:ins>: {text_preview}"
                    )

            except Exception as e:
                errors.append(
                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} insertion validation violations:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print("PASSED - No w:delText elements within w:ins elements")
            return True

    def compare_paragraph_counts(self):
        """Compare paragraph counts between original and new document."""
        original_count = self.count_paragraphs_in_original()
        new_count = self.count_paragraphs_in_unpacked()

        diff = new_count - original_count
        diff_str = f"+{diff}" if diff > 0 else str(diff)
        print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})")


if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")
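
Note: the whitespace check exists because Word drops leading/trailing spaces in w:t runs unless xml:space="preserve" is set; a standalone sketch of the same condition the validator applies:

    import re

    def needs_preserve(text):
        # Same test validate_whitespace_preservation applies to w:t text.
        return bool(re.match(r"^\s.*", text) or re.match(r".*\s$", text))

    assert needs_preserve("Hello ")    # '<w:t>Hello </w:t>' needs xml:space='preserve'
    assert not needs_preserve("Hello")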

skills/pptx/ooxml/scripts/validation/pptx.py (new file, 315 lines)
@@ -0,0 +1,315 @@
|
||||
"""
|
||||
Validator for PowerPoint presentation XML files against XSD schemas.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from .base import BaseSchemaValidator
|
||||
|
||||
|
||||
class PPTXSchemaValidator(BaseSchemaValidator):
|
||||
"""Validator for PowerPoint presentation XML files against XSD schemas."""
|
||||
|
||||
# PowerPoint presentation namespace
|
||||
PRESENTATIONML_NAMESPACE = (
|
||||
"http://schemas.openxmlformats.org/presentationml/2006/main"
|
||||
)
|
||||
|
||||
# PowerPoint-specific element to relationship type mappings
|
||||
ELEMENT_RELATIONSHIP_TYPES = {
|
||||
"sldid": "slide",
|
||||
"sldmasterid": "slidemaster",
|
||||
"notesmasterid": "notesmaster",
|
||||
"sldlayoutid": "slidelayout",
|
||||
"themeid": "theme",
|
||||
"tablestyleid": "tablestyles",
|
||||
}
|
||||
|
||||
def validate(self):
|
||||
"""Run all validation checks and return True if all pass."""
|
||||
# Test 0: XML well-formedness
|
||||
if not self.validate_xml():
|
||||
return False
|
||||
|
||||
# Test 1: Namespace declarations
|
||||
all_valid = True
|
||||
if not self.validate_namespaces():
|
||||
all_valid = False
|
||||
|
||||
# Test 2: Unique IDs
|
||||
if not self.validate_unique_ids():
|
||||
all_valid = False
|
||||
|
||||
# Test 3: UUID ID validation
|
||||
if not self.validate_uuid_ids():
|
||||
all_valid = False
|
||||
|
||||
# Test 4: Relationship and file reference validation
|
||||
if not self.validate_file_references():
|
||||
all_valid = False
|
||||
|
||||
# Test 5: Slide layout ID validation
|
||||
if not self.validate_slide_layout_ids():
|
||||
all_valid = False
|
||||
|
||||
# Test 6: Content type declarations
|
||||
if not self.validate_content_types():
|
||||
all_valid = False
|
||||
|
||||
# Test 7: XSD schema validation
|
||||
if not self.validate_against_xsd():
|
||||
all_valid = False
|
||||
|
||||
# Test 8: Notes slide reference validation
|
||||
if not self.validate_notes_slide_references():
|
||||
all_valid = False
|
||||
|
||||
# Test 9: Relationship ID reference validation
|
||||
if not self.validate_all_relationship_ids():
|
||||
all_valid = False
|
||||
|
||||
# Test 10: Duplicate slide layout references validation
|
||||
if not self.validate_no_duplicate_slide_layouts():
|
||||
all_valid = False
|
||||
|
||||
return all_valid
|
||||
|
||||
    def validate_uuid_ids(self):
        """Validate that ID attributes that look like UUIDs contain only hex values."""
        import lxml.etree

        errors = []
        # UUID pattern: 8-4-4-4-12 hex digits with optional braces/hyphens
        uuid_pattern = re.compile(
            r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$"
        )

        for xml_file in self.xml_files:
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()

                # Check all elements for ID attributes
                for elem in root.iter():
                    for attr, value in elem.attrib.items():
                        # Check if this is an ID attribute
                        attr_name = attr.split("}")[-1].lower()
                        if attr_name == "id" or attr_name.endswith("id"):
                            # Check if value looks like a UUID (right length and structure)
                            if self._looks_like_uuid(value):
                                # Validate that it contains only hex characters in the right positions
                                if not uuid_pattern.match(value):
                                    errors.append(
                                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
                                        f"Line {elem.sourceline}: ID '{value}' appears to be a UUID but contains invalid hex characters"
                                    )

            except Exception as e:
                errors.append(
                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} UUID ID validation errors:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print("PASSED - All UUID-like IDs contain valid hex values")
            return True
    def _looks_like_uuid(self, value):
        """Check if a value has the general structure of a UUID."""
        # Remove common UUID delimiters
        clean_value = value.strip("{}()").replace("-", "")
        # Check if it's 32 hex-like characters (could include invalid hex chars)
        return len(clean_value) == 32 and all(c.isalnum() for c in clean_value)
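    # Worked example (hypothetical values): "{3C9E2E5B-7A4D-4B8E-9F01-ABCDEF012345}"
    # passes both _looks_like_uuid and uuid_pattern, so it is accepted.
    # "{3C9E2E5B-7A4D-4B8E-9F01-GHIJKL012345}" strips to 32 alphanumeric
    # characters, so _looks_like_uuid returns True, but G-L are not hex
    # digits, so uuid_pattern fails and validate_uuid_ids reports an error.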
    def validate_slide_layout_ids(self):
        """Validate that sldLayoutId elements in slide masters reference valid slide layouts."""
        import lxml.etree

        errors = []

        # Find all slide master files
        slide_masters = list(self.unpacked_dir.glob("ppt/slideMasters/*.xml"))

        if not slide_masters:
            if self.verbose:
                print("PASSED - No slide masters found")
            return True

        for slide_master in slide_masters:
            try:
                # Parse the slide master file
                root = lxml.etree.parse(str(slide_master)).getroot()

                # Find the corresponding _rels file for this slide master
                rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels"

                if not rels_file.exists():
                    errors.append(
                        f"  {slide_master.relative_to(self.unpacked_dir)}: "
                        f"Missing relationships file: {rels_file.relative_to(self.unpacked_dir)}"
                    )
                    continue

                # Parse the relationships file
                rels_root = lxml.etree.parse(str(rels_file)).getroot()

                # Build a set of valid relationship IDs that point to slide layouts
                valid_layout_rids = set()
                for rel in rels_root.findall(
                    f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                ):
                    rel_type = rel.get("Type", "")
                    if "slideLayout" in rel_type:
                        valid_layout_rids.add(rel.get("Id"))

                # Find all sldLayoutId elements in the slide master
                for sld_layout_id in root.findall(
                    f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId"
                ):
                    r_id = sld_layout_id.get(
                        f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id"
                    )
                    layout_id = sld_layout_id.get("id")

                    if r_id and r_id not in valid_layout_rids:
                        errors.append(
                            f"  {slide_master.relative_to(self.unpacked_dir)}: "
                            f"Line {sld_layout_id.sourceline}: sldLayoutId with id='{layout_id}' "
                            f"references r:id='{r_id}' which is not found in slide layout relationships"
                        )

            except Exception as e:
                errors.append(
                    f"  {slide_master.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} slide layout ID validation errors:")
            for error in errors:
                print(error)
            print(
                "Remove invalid references or add missing slide layouts to the relationships file."
            )
            return False
        else:
            if self.verbose:
                print("PASSED - All slide layout IDs reference valid slide layouts")
            return True
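    # For reference, the cross-check above ties together two files, roughly
    # as follows (IDs illustrative; r: is the officeDocument relationships
    # namespace):
    #
    #   ppt/slideMasters/slideMaster1.xml:
    #       <p:sldLayoutIdLst>
    #           <p:sldLayoutId id="2147483649" r:id="rId1"/>
    #       </p:sldLayoutIdLst>
    #   ppt/slideMasters/_rels/slideMaster1.xml.rels:
    #       <Relationship Id="rId1" Type=".../relationships/slideLayout"
    #                     Target="../slideLayouts/slideLayout1.xml"/>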
    def validate_no_duplicate_slide_layouts(self):
        """Validate that no slide references more than one slideLayout."""
        import lxml.etree

        errors = []
        slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels"))

        for rels_file in slide_rels_files:
            try:
                root = lxml.etree.parse(str(rels_file)).getroot()

                # Find all slideLayout relationships
                layout_rels = [
                    rel
                    for rel in root.findall(
                        f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                    )
                    if "slideLayout" in rel.get("Type", "")
                ]

                if len(layout_rels) > 1:
                    errors.append(
                        f"  {rels_file.relative_to(self.unpacked_dir)}: has {len(layout_rels)} slideLayout references"
                    )

            except Exception as e:
                errors.append(
                    f"  {rels_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print("FAILED - Found slides with duplicate slideLayout references:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print("PASSED - No slides have duplicate slideLayout references")
            return True
    def validate_notes_slide_references(self):
        """Validate that each notesSlide file is referenced by only one slide."""
        import lxml.etree

        errors = []
        notes_slide_references = {}  # Track which slides reference each notesSlide

        # Find all slide relationship files
        slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels"))

        if not slide_rels_files:
            if self.verbose:
                print("PASSED - No slide relationship files found")
            return True

        for rels_file in slide_rels_files:
            try:
                # Parse the relationships file
                root = lxml.etree.parse(str(rels_file)).getroot()

                # Find all notesSlide relationships
                for rel in root.findall(
                    f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                ):
                    rel_type = rel.get("Type", "")
                    if "notesSlide" in rel_type:
                        target = rel.get("Target", "")
                        if target:
                            # Normalize the target path to handle relative paths
                            normalized_target = target.replace("../", "")

                            # Track which slide references this notesSlide
                            slide_name = rels_file.stem.replace(
                                ".xml", ""
                            )  # e.g., "slide1"

                            if normalized_target not in notes_slide_references:
                                notes_slide_references[normalized_target] = []
                            notes_slide_references[normalized_target].append(
                                (slide_name, rels_file)
                            )

            except Exception as e:
                errors.append(
                    f"  {rels_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        # Check for duplicate references
        for target, references in notes_slide_references.items():
            if len(references) > 1:
                slide_names = [ref[0] for ref in references]
                errors.append(
                    f"  Notes slide '{target}' is referenced by multiple slides: {', '.join(slide_names)}"
                )
                for slide_name, rels_file in references:
                    errors.append(f"    - {rels_file.relative_to(self.unpacked_dir)}")

        if errors:
            print(
                f"FAILED - Found {len([e for e in errors if not e.startswith('    ')])} notes slide reference validation errors:"
            )
            for error in errors:
                print(error)
            print("Each slide may optionally have its own notes slide file, but a notes slide must not be shared between slides.")
            return False
        else:
            if self.verbose:
                print("PASSED - All notes slide references are unique")
            return True
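    # A failing case would be two slide .rels files whose notesSlide
    # relationships point at the same target (illustrative):
    #
    #     <Relationship Id="rId2" Type=".../relationships/notesSlide"
    #                   Target="../notesSlides/notesSlide1.xml"/>
    #
    # appearing in both slide1.xml.rels and slide2.xml.rels.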


if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")
279
skills/pptx/ooxml/scripts/validation/redlining.py
Normal file
@@ -0,0 +1,279 @@
"""
Validator for tracked changes in Word documents.
"""

import subprocess
import tempfile
import zipfile
from pathlib import Path


class RedliningValidator:
    """Validator for tracked changes in Word documents."""

    def __init__(self, unpacked_dir, original_docx, verbose=False):
        self.unpacked_dir = Path(unpacked_dir)
        self.original_docx = Path(original_docx)
        self.verbose = verbose
        self.namespaces = {
            "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
        }
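    # Usage sketch (hypothetical paths): compare an edited, unpacked document
    # against the pristine .docx it was extracted from.
    #
    #     validator = RedliningValidator("unpacked_docx", "original.docx", verbose=True)
    #     ok = validator.validate()  # False if any untracked edits slipped in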
    def validate(self):
        """Main validation method that returns True if valid, False otherwise."""
        import xml.etree.ElementTree as ET

        # Verify unpacked directory exists and has correct structure
        modified_file = self.unpacked_dir / "word" / "document.xml"
        if not modified_file.exists():
            print(f"FAILED - Modified document.xml not found at {modified_file}")
            return False

        # First, check if there are any tracked changes by Claude to validate
        try:
            tree = ET.parse(modified_file)
            root = tree.getroot()

            # Check for w:del or w:ins tags authored by Claude
            del_elements = root.findall(".//w:del", self.namespaces)
            ins_elements = root.findall(".//w:ins", self.namespaces)

            # Filter to only include changes by Claude
            claude_del_elements = [
                elem
                for elem in del_elements
                if elem.get(f"{{{self.namespaces['w']}}}author") == "Claude"
            ]
            claude_ins_elements = [
                elem
                for elem in ins_elements
                if elem.get(f"{{{self.namespaces['w']}}}author") == "Claude"
            ]

            # Redlining validation is only needed if tracked changes by Claude have been used.
            if not claude_del_elements and not claude_ins_elements:
                if self.verbose:
                    print("PASSED - No tracked changes by Claude found.")
                return True

        except Exception:
            # If we can't parse the XML, continue with full validation
            pass

        # Create temporary directory for unpacking original docx
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Unpack original docx
            try:
                with zipfile.ZipFile(self.original_docx, "r") as zip_ref:
                    zip_ref.extractall(temp_path)
            except Exception as e:
                print(f"FAILED - Error unpacking original docx: {e}")
                return False

            original_file = temp_path / "word" / "document.xml"
            if not original_file.exists():
                print(
                    f"FAILED - Original document.xml not found in {self.original_docx}"
                )
                return False

            # Parse both XML files for redlining validation
            try:
                modified_tree = ET.parse(modified_file)
                modified_root = modified_tree.getroot()
                original_tree = ET.parse(original_file)
                original_root = original_tree.getroot()
            except ET.ParseError as e:
                print(f"FAILED - Error parsing XML files: {e}")
                return False

            # Remove Claude's tracked changes from both documents
            self._remove_claude_tracked_changes(original_root)
            self._remove_claude_tracked_changes(modified_root)

            # Extract and compare text content
            modified_text = self._extract_text_content(modified_root)
            original_text = self._extract_text_content(original_root)

            if modified_text != original_text:
                # Show detailed character-level differences for each paragraph
                error_message = self._generate_detailed_diff(
                    original_text, modified_text
                )
                print(error_message)
                return False

        if self.verbose:
            print("PASSED - All changes by Claude are properly tracked")
        return True
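    # The invariant checked above, on a minimal illustrative fragment: given
    #
    #     <w:ins w:author="Claude"><w:r><w:t>new</w:t></w:r></w:ins>
    #     <w:del w:author="Claude"><w:r><w:delText>old</w:delText></w:r></w:del>
    #
    # removing the w:ins and unwrapping the w:del (w:delText becomes w:t)
    # must reproduce the original text "old" exactly; any difference means
    # an edit was made outside tracked changes.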
    def _generate_detailed_diff(self, original_text, modified_text):
        """Generate detailed word-level differences using git word diff."""
        error_parts = [
            "FAILED - Document text doesn't match after removing Claude's tracked changes",
            "",
            "Likely causes:",
            "  1. Modified text inside another author's <w:ins> or <w:del> tags",
            "  2. Made edits without proper tracked changes",
            "  3. Didn't nest <w:del> inside <w:ins> when deleting another's insertion",
            "",
            "For pre-redlined documents, use correct patterns:",
            "  - To reject another's INSERTION: Nest <w:del> inside their <w:ins>",
            "  - To restore another's DELETION: Add new <w:ins> AFTER their <w:del>",
            "",
        ]

        # Show git word diff
        git_diff = self._get_git_word_diff(original_text, modified_text)
        if git_diff:
            error_parts.extend(["Differences:", "============", git_diff])
        else:
            error_parts.append("Unable to generate word diff (git not available)")

        return "\n".join(error_parts)
    def _get_git_word_diff(self, original_text, modified_text):
        """Generate word diff using git with character-level precision."""

        def strip_diff_headers(output):
            # Keep only the content after the first @@ hunk marker, skipping
            # git's "diff --git", "index", "---", and "+++" header lines.
            content_lines = []
            in_content = False
            for line in output.split("\n"):
                if line.startswith("@@"):
                    in_content = True
                    continue
                if in_content and line.strip():
                    content_lines.append(line)
            return content_lines

        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = Path(temp_dir)

                # Create two files to diff
                original_file = temp_path / "original.txt"
                modified_file = temp_path / "modified.txt"
                original_file.write_text(original_text, encoding="utf-8")
                modified_file.write_text(modified_text, encoding="utf-8")

                # Try character-level diff first for precise differences
                result = subprocess.run(
                    [
                        "git",
                        "diff",
                        "--word-diff=plain",
                        "--word-diff-regex=.",  # Character-by-character diff
                        "-U0",  # Zero lines of context - show only changed lines
                        "--no-index",
                        str(original_file),
                        str(modified_file),
                    ],
                    capture_output=True,
                    text=True,
                )

                if result.stdout.strip():
                    content_lines = strip_diff_headers(result.stdout)
                    if content_lines:
                        return "\n".join(content_lines)

                # Fall back to word-level diff if the character-level pass
                # produced no usable content
                result = subprocess.run(
                    [
                        "git",
                        "diff",
                        "--word-diff=plain",
                        "-U0",  # Zero lines of context
                        "--no-index",
                        str(original_file),
                        str(modified_file),
                    ],
                    capture_output=True,
                    text=True,
                )

                if result.stdout.strip():
                    return "\n".join(strip_diff_headers(result.stdout))

        except Exception:
            # Git not available or other error; return None so the caller
            # falls back to a plain message
            pass

        return None
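    # Equivalent command, runnable by hand when debugging (paths illustrative):
    #
    #     git diff --word-diff=plain --word-diff-regex=. -U0 --no-index \
    #         original.txt modified.txt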
    def _remove_claude_tracked_changes(self, root):
        """Remove tracked changes authored by Claude from the XML root."""
        ins_tag = f"{{{self.namespaces['w']}}}ins"
        del_tag = f"{{{self.namespaces['w']}}}del"
        author_attr = f"{{{self.namespaces['w']}}}author"

        # Remove w:ins elements authored by Claude
        for parent in root.iter():
            to_remove = []
            for child in parent:
                if child.tag == ins_tag and child.get(author_attr) == "Claude":
                    to_remove.append(child)
            for elem in to_remove:
                parent.remove(elem)

        # Unwrap content in w:del elements where author is "Claude"
        deltext_tag = f"{{{self.namespaces['w']}}}delText"
        t_tag = f"{{{self.namespaces['w']}}}t"

        for parent in root.iter():
            to_process = []
            for child in parent:
                if child.tag == del_tag and child.get(author_attr) == "Claude":
                    to_process.append((child, list(parent).index(child)))

            # Process in reverse order to maintain indices
            for del_elem, del_index in reversed(to_process):
                # Convert w:delText to w:t before moving
                for elem in del_elem.iter():
                    if elem.tag == deltext_tag:
                        elem.tag = t_tag

                # Move all children of w:del to its parent before removing w:del
                for child in reversed(list(del_elem)):
                    parent.insert(del_index, child)
                parent.remove(del_elem)
    def _extract_text_content(self, root):
        """Extract text content from Word XML, preserving paragraph structure.

        Empty paragraphs are skipped to avoid false positives when tracked
        insertions add only structural elements without text content.
        """
        p_tag = f"{{{self.namespaces['w']}}}p"
        t_tag = f"{{{self.namespaces['w']}}}t"

        paragraphs = []
        for p_elem in root.findall(f".//{p_tag}"):
            # Get all text elements within this paragraph
            text_parts = []
            for t_elem in p_elem.findall(f".//{t_tag}"):
                if t_elem.text:
                    text_parts.append(t_elem.text)
            paragraph_text = "".join(text_parts)
            # Skip empty paragraphs - they don't affect content validation
            if paragraph_text:
                paragraphs.append(paragraph_text)

        return "\n".join(paragraphs)


if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")