Initial commit

skills/pptx/ooxml/scripts/pack.py (new file, 159 lines)
@@ -0,0 +1,159 @@
#!/usr/bin/env python3
"""
Tool to pack a directory into a .docx, .pptx, or .xlsx file with XML formatting undone.

Example usage:
    python pack.py <input_directory> <office_file> [--force]
"""

import argparse
import shutil
import subprocess
import sys
import tempfile
import defusedxml.minidom
import zipfile
from pathlib import Path


def main():
    parser = argparse.ArgumentParser(description="Pack a directory into an Office file")
    parser.add_argument("input_directory", help="Unpacked Office document directory")
    parser.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)")
    parser.add_argument("--force", action="store_true", help="Skip validation")
    args = parser.parse_args()

    try:
        success = pack_document(
            args.input_directory, args.output_file, validate=not args.force
        )

        # Show warning if validation was skipped
        if args.force:
            print("Warning: Skipped validation, file may be corrupt", file=sys.stderr)
        # Exit with error if validation failed
        elif not success:
            print("Contents would produce a corrupt file.", file=sys.stderr)
            print("Please validate XML before repacking.", file=sys.stderr)
            print("Use --force to skip validation and pack anyway.", file=sys.stderr)
            sys.exit(1)

    except ValueError as e:
        sys.exit(f"Error: {e}")


def pack_document(input_dir, output_file, validate=False):
    """Pack a directory into an Office file (.docx/.pptx/.xlsx).

    Args:
        input_dir: Path to unpacked Office document directory
        output_file: Path to output Office file
        validate: If True, validates with soffice (default: False)

    Returns:
        bool: True if successful, False if validation failed
    """
    input_dir = Path(input_dir)
    output_file = Path(output_file)

    if not input_dir.is_dir():
        raise ValueError(f"{input_dir} is not a directory")
    if output_file.suffix.lower() not in {".docx", ".pptx", ".xlsx"}:
        raise ValueError(f"{output_file} must be a .docx, .pptx, or .xlsx file")

    # Work in temporary directory to avoid modifying original
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_content_dir = Path(temp_dir) / "content"
        shutil.copytree(input_dir, temp_content_dir)

        # Process XML files to remove pretty-printing whitespace
        for pattern in ["*.xml", "*.rels"]:
            for xml_file in temp_content_dir.rglob(pattern):
                condense_xml(xml_file)

        # Create final Office file as zip archive
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(output_file, "w", zipfile.ZIP_DEFLATED) as zf:
            for f in temp_content_dir.rglob("*"):
                if f.is_file():
                    zf.write(f, f.relative_to(temp_content_dir))

    # Validate if requested
    if validate:
        if not validate_document(output_file):
            output_file.unlink()  # Delete the corrupt file
            return False

    return True


def validate_document(doc_path):
    """Validate document by converting to HTML with soffice."""
    # Determine the correct filter based on file extension
    match doc_path.suffix.lower():
        case ".docx":
            filter_name = "html:HTML"
        case ".pptx":
            filter_name = "html:impress_html_Export"
        case ".xlsx":
            filter_name = "html:HTML (StarCalc)"

    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            result = subprocess.run(
                [
                    "soffice",
                    "--headless",
                    "--convert-to",
                    filter_name,
                    "--outdir",
                    temp_dir,
                    str(doc_path),
                ],
                capture_output=True,
                timeout=10,
                text=True,
            )
            if not (Path(temp_dir) / f"{doc_path.stem}.html").exists():
                error_msg = result.stderr.strip() or "Document validation failed"
                print(f"Validation error: {error_msg}", file=sys.stderr)
                return False
            return True
        except FileNotFoundError:
            print("Warning: soffice not found. Skipping validation.", file=sys.stderr)
            return True
        except subprocess.TimeoutExpired:
            print("Validation error: Timeout during conversion", file=sys.stderr)
            return False
        except Exception as e:
            print(f"Validation error: {e}", file=sys.stderr)
            return False


def condense_xml(xml_file):
    """Strip unnecessary whitespace and remove comments."""
    with open(xml_file, "r", encoding="utf-8") as f:
        dom = defusedxml.minidom.parse(f)

    # Process each element to remove whitespace and comments
    for element in dom.getElementsByTagName("*"):
        # Skip w:t elements and their processing
        if element.tagName.endswith(":t"):
            continue

        # Remove whitespace-only text nodes and comment nodes
        for child in list(element.childNodes):
            if (
                child.nodeType == child.TEXT_NODE
                and child.nodeValue
                and child.nodeValue.strip() == ""
            ) or child.nodeType == child.COMMENT_NODE:
                element.removeChild(child)

    # Write back the condensed XML
    with open(xml_file, "wb") as f:
        f.write(dom.toxml(encoding="UTF-8"))


if __name__ == "__main__":
    main()
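
Note: a minimal sketch of driving pack_document programmatically instead of via the CLI; the paths are hypothetical, and pack.py is assumed to be importable from the scripts directory:

    from pack import pack_document

    # Hypothetical paths: an unpacked deck directory repacked into a .pptx.
    ok = pack_document("unpacked_deck", "out/deck.pptx", validate=True)
    if not ok:
        raise SystemExit("soffice validation failed; fix the XML and repack")

When validate=True and validation fails, pack_document deletes the freshly written file, so callers only ever see an output file that converted cleanly.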

skills/pptx/ooxml/scripts/unpack.py (new file, 29 lines)
@@ -0,0 +1,29 @@
#!/usr/bin/env python3
"""Unpack and format XML contents of Office files (.docx, .pptx, .xlsx)"""

import random
import sys
import defusedxml.minidom
import zipfile
from pathlib import Path

# Get command line arguments
assert len(sys.argv) == 3, "Usage: python unpack.py <office_file> <output_dir>"
input_file, output_dir = sys.argv[1], sys.argv[2]

# Extract and format
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
zipfile.ZipFile(input_file).extractall(output_path)

# Pretty print all XML files
xml_files = list(output_path.rglob("*.xml")) + list(output_path.rglob("*.rels"))
for xml_file in xml_files:
    content = xml_file.read_text(encoding="utf-8")
    dom = defusedxml.minidom.parseString(content)
    xml_file.write_bytes(dom.toprettyxml(indent="  ", encoding="ascii"))

# For .docx files, suggest an RSID for tracked changes
if input_file.endswith(".docx"):
    suggested_rsid = "".join(random.choices("0123456789ABCDEF", k=8))
    print(f"Suggested RSID for edit session: {suggested_rsid}")

skills/pptx/ooxml/scripts/validate.py (new file, 69 lines)
@@ -0,0 +1,69 @@
#!/usr/bin/env python3
"""
Command line tool to validate Office document XML files against XSD schemas and tracked changes.

Usage:
    python validate.py <dir> --original <original_file>
"""

import argparse
import sys
from pathlib import Path

from validation import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator


def main():
    parser = argparse.ArgumentParser(description="Validate Office document XML files")
    parser.add_argument(
        "unpacked_dir",
        help="Path to unpacked Office document directory",
    )
    parser.add_argument(
        "--original",
        required=True,
        help="Path to original file (.docx/.pptx/.xlsx)",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable verbose output",
    )
    args = parser.parse_args()

    # Validate paths
    unpacked_dir = Path(args.unpacked_dir)
    original_file = Path(args.original)
    file_extension = original_file.suffix.lower()
    assert unpacked_dir.is_dir(), f"Error: {unpacked_dir} is not a directory"
    assert original_file.is_file(), f"Error: {original_file} is not a file"
    assert file_extension in [".docx", ".pptx", ".xlsx"], (
        f"Error: {original_file} must be a .docx, .pptx, or .xlsx file"
    )

    # Select the validators for this file type
    match file_extension:
        case ".docx":
            validators = [DOCXSchemaValidator, RedliningValidator]
        case ".pptx":
            validators = [PPTXSchemaValidator]
        case _:
            print(f"Error: Validation not supported for file type {file_extension}")
            sys.exit(1)

    # Run validators
    success = True
    for V in validators:
        validator = V(unpacked_dir, original_file, verbose=args.verbose)
        if not validator.validate():
            success = False

    if success:
        print("All validations PASSED!")

    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
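
Note: the same validators can be driven without the CLI; a sketch mirroring main() above, with hypothetical paths:

    from pathlib import Path
    from validation import DOCXSchemaValidator, RedliningValidator

    unpacked = Path("unpacked_report")  # hypothetical unpacked .docx directory
    original = Path("report.docx")      # hypothetical original file
    ok = all(
        V(unpacked, original, verbose=True).validate()
        for V in (DOCXSchemaValidator, RedliningValidator)
    )

Each validator prints its own PASSED/FAILED lines, so the boolean is only the aggregate result.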

skills/pptx/ooxml/scripts/validation/__init__.py (new file, 15 lines)
@@ -0,0 +1,15 @@
"""
Validation modules for Office document processing.
"""

from .base import BaseSchemaValidator
from .docx import DOCXSchemaValidator
from .pptx import PPTXSchemaValidator
from .redlining import RedliningValidator

__all__ = [
    "BaseSchemaValidator",
    "DOCXSchemaValidator",
    "PPTXSchemaValidator",
    "RedliningValidator",
]

skills/pptx/ooxml/scripts/validation/base.py (new file, 951 lines)
@@ -0,0 +1,951 @@
"""
Base validator with common validation logic for document files.
"""

import re
from pathlib import Path

import lxml.etree


class BaseSchemaValidator:
    """Base validator with common validation logic for document files."""

    # Elements whose 'id' attributes must be unique within their file
    # Format: element_name -> (attribute_name, scope)
    # scope can be 'file' (unique within file) or 'global' (unique across all files)
    UNIQUE_ID_REQUIREMENTS = {
        # Word elements
        "comment": ("id", "file"),  # Comment IDs in comments.xml
        "commentrangestart": ("id", "file"),  # Must match comment IDs
        "commentrangeend": ("id", "file"),  # Must match comment IDs
        "bookmarkstart": ("id", "file"),  # Bookmark start IDs
        "bookmarkend": ("id", "file"),  # Bookmark end IDs
        # Note: ins and del (track changes) can share IDs when part of same revision
        # PowerPoint elements
        "sldid": ("id", "file"),  # Slide IDs in presentation.xml
        "sldmasterid": ("id", "global"),  # Slide master IDs must be globally unique
        "sldlayoutid": ("id", "global"),  # Slide layout IDs must be globally unique
        "cm": ("authorid", "file"),  # Comment author IDs
        # Excel elements
        "sheet": ("sheetid", "file"),  # Sheet IDs in workbook.xml
        "definedname": ("id", "file"),  # Named range IDs
        # Drawing/Shape elements (all formats)
        "cxnsp": ("id", "file"),  # Connection shape IDs
        "sp": ("id", "file"),  # Shape IDs
        "pic": ("id", "file"),  # Picture IDs
        "grpsp": ("id", "file"),  # Group shape IDs
    }

    # Mapping of element names to expected relationship types
    # Subclasses should override this with format-specific mappings
    ELEMENT_RELATIONSHIP_TYPES = {}

    # Unified schema mappings for all Office document types
    SCHEMA_MAPPINGS = {
        # Document type specific schemas
        "word": "ISO-IEC29500-4_2016/wml.xsd",  # Word documents
        "ppt": "ISO-IEC29500-4_2016/pml.xsd",  # PowerPoint presentations
        "xl": "ISO-IEC29500-4_2016/sml.xsd",  # Excel spreadsheets
        # Common file types
        "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd",
        "app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd",
        "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd",
        "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd",
        ".rels": "ecma/fouth-edition/opc-relationships.xsd",
        # Word-specific files
        "people.xml": "microsoft/wml-2012.xsd",
        "commentsIds.xml": "microsoft/wml-cid-2016.xsd",
        "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd",
        "commentsExtended.xml": "microsoft/wml-2012.xsd",
        # Chart files (common across document types)
        "chart": "ISO-IEC29500-4_2016/dml-chart.xsd",
        # Theme files (common across document types)
        "theme": "ISO-IEC29500-4_2016/dml-main.xsd",
        # Drawing and media files
        "drawing": "ISO-IEC29500-4_2016/dml-main.xsd",
    }

    # Unified namespace constants
    MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006"
    XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"

    # Common OOXML namespaces used across validators
    PACKAGE_RELATIONSHIPS_NAMESPACE = (
        "http://schemas.openxmlformats.org/package/2006/relationships"
    )
    OFFICE_RELATIONSHIPS_NAMESPACE = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
    )
    CONTENT_TYPES_NAMESPACE = (
        "http://schemas.openxmlformats.org/package/2006/content-types"
    )

    # Folders where we should clean ignorable namespaces
    MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"}

    # All allowed OOXML namespaces (superset of all document types)
    OOXML_NAMESPACES = {
        "http://schemas.openxmlformats.org/officeDocument/2006/math",
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
        "http://schemas.openxmlformats.org/schemaLibrary/2006/main",
        "http://schemas.openxmlformats.org/drawingml/2006/main",
        "http://schemas.openxmlformats.org/drawingml/2006/chart",
        "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing",
        "http://schemas.openxmlformats.org/drawingml/2006/diagram",
        "http://schemas.openxmlformats.org/drawingml/2006/picture",
        "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing",
        "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
        "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
        "http://schemas.openxmlformats.org/presentationml/2006/main",
        "http://schemas.openxmlformats.org/spreadsheetml/2006/main",
        "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes",
        "http://www.w3.org/XML/1998/namespace",
    }

    def __init__(self, unpacked_dir, original_file, verbose=False):
        self.unpacked_dir = Path(unpacked_dir).resolve()
        self.original_file = Path(original_file)
        self.verbose = verbose

        # Set schemas directory
        self.schemas_dir = Path(__file__).parent.parent.parent / "schemas"

        # Get all XML and .rels files
        patterns = ["*.xml", "*.rels"]
        self.xml_files = [
            f for pattern in patterns for f in self.unpacked_dir.rglob(pattern)
        ]

        if not self.xml_files:
            print(f"Warning: No XML files found in {self.unpacked_dir}")

    def validate(self):
        """Run all validation checks and return True if all pass."""
        raise NotImplementedError("Subclasses must implement the validate method")

    def validate_xml(self):
        """Validate that all XML files are well-formed."""
        errors = []

        for xml_file in self.xml_files:
            try:
                # Try to parse the XML file
                lxml.etree.parse(str(xml_file))
            except lxml.etree.XMLSyntaxError as e:
                errors.append(
                    f"  {xml_file.relative_to(self.unpacked_dir)}: "
                    f"Line {e.lineno}: {e.msg}"
                )
            except Exception as e:
                errors.append(
                    f"  {xml_file.relative_to(self.unpacked_dir)}: "
                    f"Unexpected error: {str(e)}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} XML violations:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print("PASSED - All XML files are well-formed")
            return True

    def validate_namespaces(self):
        """Validate that namespace prefixes in Ignorable attributes are declared."""
        errors = []

        for xml_file in self.xml_files:
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()
                declared = set(root.nsmap.keys()) - {None}  # Exclude default namespace

                for attr_val in [
                    v for k, v in root.attrib.items() if k.endswith("Ignorable")
                ]:
                    undeclared = set(attr_val.split()) - declared
                    errors.extend(
                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
                        f"Namespace '{ns}' in Ignorable but not declared"
                        for ns in undeclared
                    )
            except lxml.etree.XMLSyntaxError:
                continue

        if errors:
            print(f"FAILED - {len(errors)} namespace issues:")
            for error in errors:
                print(error)
            return False
        if self.verbose:
            print("PASSED - All namespace prefixes properly declared")
        return True

    def validate_unique_ids(self):
        """Validate that specific IDs are unique according to OOXML requirements."""
        errors = []
        global_ids = {}  # Track globally unique IDs across all files

        for xml_file in self.xml_files:
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()
                file_ids = {}  # Track IDs that must be unique within this file

                # Remove all mc:AlternateContent elements from the tree
                mc_elements = root.xpath(
                    ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE}
                )
                for elem in mc_elements:
                    elem.getparent().remove(elem)

                # Now check IDs in the cleaned tree
                for elem in root.iter():
                    # Get the element name without namespace
                    tag = (
                        elem.tag.split("}")[-1].lower()
                        if "}" in elem.tag
                        else elem.tag.lower()
                    )

                    # Check if this element type has ID uniqueness requirements
                    if tag in self.UNIQUE_ID_REQUIREMENTS:
                        attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag]

                        # Look for the specified attribute
                        id_value = None
                        for attr, value in elem.attrib.items():
                            attr_local = (
                                attr.split("}")[-1].lower()
                                if "}" in attr
                                else attr.lower()
                            )
                            if attr_local == attr_name:
                                id_value = value
                                break

                        if id_value is not None:
                            if scope == "global":
                                # Check global uniqueness
                                if id_value in global_ids:
                                    prev_file, prev_line, prev_tag = global_ids[
                                        id_value
                                    ]
                                    errors.append(
                                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
                                        f"Line {elem.sourceline}: Global ID '{id_value}' in <{tag}> "
                                        f"already used in {prev_file} at line {prev_line} in <{prev_tag}>"
                                    )
                                else:
                                    global_ids[id_value] = (
                                        xml_file.relative_to(self.unpacked_dir),
                                        elem.sourceline,
                                        tag,
                                    )
                            elif scope == "file":
                                # Check file-level uniqueness
                                key = (tag, attr_name)
                                if key not in file_ids:
                                    file_ids[key] = {}

                                if id_value in file_ids[key]:
                                    prev_line = file_ids[key][id_value]
                                    errors.append(
                                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
                                        f"Line {elem.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> "
                                        f"(first occurrence at line {prev_line})"
                                    )
                                else:
                                    file_ids[key][id_value] = elem.sourceline

            except Exception as e:
                errors.append(
                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} ID uniqueness violations:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print("PASSED - All required IDs are unique")
            return True

    def validate_file_references(self):
        """
        Validate that all .rels files properly reference files and that all files are referenced.
        """
        errors = []

        # Find all .rels files
        rels_files = list(self.unpacked_dir.rglob("*.rels"))

        if not rels_files:
            if self.verbose:
                print("PASSED - No .rels files found")
            return True

        # Get all files in the unpacked directory, excluding bookkeeping files
        # ([Content_Types].xml and the .rels files themselves are never
        # referenced by a .rels file)
        all_files = []
        for file_path in self.unpacked_dir.rglob("*"):
            if (
                file_path.is_file()
                and file_path.name != "[Content_Types].xml"
                and not file_path.name.endswith(".rels")
            ):
                all_files.append(file_path.resolve())

        # Track all files that are referenced by any .rels file
        all_referenced_files = set()

        if self.verbose:
            print(
                f"Found {len(rels_files)} .rels files and {len(all_files)} target files"
            )

        # Check each .rels file
        for rels_file in rels_files:
            try:
                # Parse relationships file
                rels_root = lxml.etree.parse(str(rels_file)).getroot()

                # Get the directory where this .rels file is located
                rels_dir = rels_file.parent

                # Find all relationships and their targets
                referenced_files = set()
                broken_refs = []

                for rel in rels_root.findall(
                    ".//ns:Relationship",
                    namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE},
                ):
                    target = rel.get("Target")
                    if target and not target.startswith(
                        ("http", "mailto:")
                    ):  # Skip external URLs
                        # Resolve the target path relative to the .rels file location
                        if rels_file.name == ".rels":
                            # Root .rels file - targets are relative to unpacked_dir
                            target_path = self.unpacked_dir / target
                        else:
                            # Other .rels files - targets are relative to their parent's parent
                            # e.g., word/_rels/document.xml.rels -> targets relative to word/
                            base_dir = rels_dir.parent
                            target_path = base_dir / target

                        # Normalize the path and check if it exists
                        try:
                            target_path = target_path.resolve()
                            if target_path.exists() and target_path.is_file():
                                referenced_files.add(target_path)
                                all_referenced_files.add(target_path)
                            else:
                                broken_refs.append((target, rel.sourceline))
                        except (OSError, ValueError):
                            broken_refs.append((target, rel.sourceline))

                # Report broken references
                if broken_refs:
                    rel_path = rels_file.relative_to(self.unpacked_dir)
                    for broken_ref, line_num in broken_refs:
                        errors.append(
                            f"  {rel_path}: Line {line_num}: Broken reference to {broken_ref}"
                        )

            except Exception as e:
                rel_path = rels_file.relative_to(self.unpacked_dir)
                errors.append(f"  Error parsing {rel_path}: {e}")

        # Check for unreferenced files (files that exist but are not referenced anywhere)
        unreferenced_files = set(all_files) - all_referenced_files

        if unreferenced_files:
            for unref_file in sorted(unreferenced_files):
                unref_rel_path = unref_file.relative_to(self.unpacked_dir)
                errors.append(f"  Unreferenced file: {unref_rel_path}")

        if errors:
            print(f"FAILED - Found {len(errors)} relationship validation errors:")
            for error in errors:
                print(error)
            print(
                "CRITICAL: These errors will cause the document to appear corrupt. "
                "Broken references MUST be fixed, "
                "and unreferenced files MUST be referenced or removed."
            )
            return False
        else:
            if self.verbose:
                print(
                    "PASSED - All references are valid and all files are properly referenced"
                )
            return True

    def validate_all_relationship_ids(self):
        """
        Validate that all r:id attributes in XML files reference existing IDs
        in their corresponding .rels files, and optionally validate relationship types.
        """
        errors = []

        # Process each XML file that might contain r:id references
        for xml_file in self.xml_files:
            # Skip .rels files themselves
            if xml_file.suffix == ".rels":
                continue

            # Determine the corresponding .rels file
            # For dir/file.xml, it's dir/_rels/file.xml.rels
            rels_dir = xml_file.parent / "_rels"
            rels_file = rels_dir / f"{xml_file.name}.rels"

            # Skip if there's no corresponding .rels file (that's okay)
            if not rels_file.exists():
                continue

            try:
                # Parse the .rels file to get valid relationship IDs and their types
                rels_root = lxml.etree.parse(str(rels_file)).getroot()
                rid_to_type = {}

                for rel in rels_root.findall(
                    f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                ):
                    rid = rel.get("Id")
                    rel_type = rel.get("Type", "")
                    if rid:
                        # Check for duplicate rIds
                        if rid in rid_to_type:
                            rels_rel_path = rels_file.relative_to(self.unpacked_dir)
                            errors.append(
                                f"  {rels_rel_path}: Line {rel.sourceline}: "
                                f"Duplicate relationship ID '{rid}' (IDs must be unique)"
                            )
                        # Extract just the type name from the full URL
                        type_name = (
                            rel_type.split("/")[-1] if "/" in rel_type else rel_type
                        )
                        rid_to_type[rid] = type_name

                # Parse the XML file to find all r:id references
                xml_root = lxml.etree.parse(str(xml_file)).getroot()

                # Find all elements with r:id attributes
                for elem in xml_root.iter():
                    # Check for r:id attribute (relationship ID)
                    rid_attr = elem.get(f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id")
                    if rid_attr:
                        xml_rel_path = xml_file.relative_to(self.unpacked_dir)
                        elem_name = (
                            elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag
                        )

                        # Check if the ID exists
                        if rid_attr not in rid_to_type:
                            errors.append(
                                f"  {xml_rel_path}: Line {elem.sourceline}: "
                                f"<{elem_name}> references non-existent relationship '{rid_attr}' "
                                f"(valid IDs: {', '.join(sorted(rid_to_type.keys())[:5])}{'...' if len(rid_to_type) > 5 else ''})"
                            )
                        # Check if we have type expectations for this element
                        elif self.ELEMENT_RELATIONSHIP_TYPES:
                            expected_type = self._get_expected_relationship_type(
                                elem_name
                            )
                            if expected_type:
                                actual_type = rid_to_type[rid_attr]
                                # Check if the actual type matches or contains the expected type
                                if expected_type not in actual_type.lower():
                                    errors.append(
                                        f"  {xml_rel_path}: Line {elem.sourceline}: "
                                        f"<{elem_name}> references '{rid_attr}' which points to '{actual_type}' "
                                        f"but should point to a '{expected_type}' relationship"
                                    )

            except Exception as e:
                xml_rel_path = xml_file.relative_to(self.unpacked_dir)
                errors.append(f"  Error processing {xml_rel_path}: {e}")

        if errors:
            print(f"FAILED - Found {len(errors)} relationship ID reference errors:")
            for error in errors:
                print(error)
            print("\nThese ID mismatches will cause the document to appear corrupt!")
            return False
        else:
            if self.verbose:
                print("PASSED - All relationship ID references are valid")
            return True

    def _get_expected_relationship_type(self, element_name):
        """
        Get the expected relationship type for an element.
        First checks the explicit mapping, then tries pattern detection.
        """
        # Normalize element name to lowercase
        elem_lower = element_name.lower()

        # Check explicit mapping first
        if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES:
            return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower]

        # Try pattern detection for common patterns
        # Pattern 1: Elements ending in "Id" often expect a relationship of the prefix type
        if elem_lower.endswith("id") and len(elem_lower) > 2:
            # e.g., "sldId" -> "sld", "sldMasterId" -> "sldMaster"
            prefix = elem_lower[:-2]  # Remove "id"
            # Check if this might be a compound like "sldMasterId"
            if prefix.endswith("master"):
                return prefix.lower()
            elif prefix.endswith("layout"):
                return prefix.lower()
            else:
                # Simple case like "sldId" -> "slide"
                # Common transformations
                if prefix == "sld":
                    return "slide"
                return prefix.lower()

        # Pattern 2: Elements ending in "Reference" expect a relationship of the prefix type
        if elem_lower.endswith("reference") and len(elem_lower) > 9:
            prefix = elem_lower[:-9]  # Remove "reference"
            return prefix.lower()

        return None

    def validate_content_types(self):
        """Validate that all content files are properly declared in [Content_Types].xml."""
        errors = []

        # Find [Content_Types].xml file
        content_types_file = self.unpacked_dir / "[Content_Types].xml"
        if not content_types_file.exists():
            print("FAILED - [Content_Types].xml file not found")
            return False

        try:
            # Parse and get all declared parts and extensions
            root = lxml.etree.parse(str(content_types_file)).getroot()
            declared_parts = set()
            declared_extensions = set()

            # Get Override declarations (specific files)
            for override in root.findall(
                f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override"
            ):
                part_name = override.get("PartName")
                if part_name is not None:
                    declared_parts.add(part_name.lstrip("/"))

            # Get Default declarations (by extension)
            for default in root.findall(
                f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default"
            ):
                extension = default.get("Extension")
                if extension is not None:
                    declared_extensions.add(extension.lower())

            # Root elements that require content type declaration
            declarable_roots = {
                "sld",
                "sldLayout",
                "sldMaster",
                "presentation",  # PowerPoint
                "document",  # Word
                "workbook",
                "worksheet",  # Excel
                "theme",  # Common
            }

            # Common media file extensions that should be declared
            media_extensions = {
                "png": "image/png",
                "jpg": "image/jpeg",
                "jpeg": "image/jpeg",
                "gif": "image/gif",
                "bmp": "image/bmp",
                "tiff": "image/tiff",
                "wmf": "image/x-wmf",
                "emf": "image/x-emf",
            }

            # Get all files in the unpacked directory
            all_files = list(self.unpacked_dir.rglob("*"))
            all_files = [f for f in all_files if f.is_file()]

            # Check all XML files for Override declarations
            for xml_file in self.xml_files:
                path_str = str(xml_file.relative_to(self.unpacked_dir)).replace(
                    "\\", "/"
                )

                # Skip non-content files
                if any(
                    skip in path_str
                    for skip in [".rels", "[Content_Types]", "docProps/", "_rels/"]
                ):
                    continue

                try:
                    root_tag = lxml.etree.parse(str(xml_file)).getroot().tag
                    root_name = root_tag.split("}")[-1] if "}" in root_tag else root_tag

                    if root_name in declarable_roots and path_str not in declared_parts:
                        errors.append(
                            f"  {path_str}: File with <{root_name}> root not declared in [Content_Types].xml"
                        )

                except Exception:
                    continue  # Skip unparseable files

            # Check all non-XML files for Default extension declarations
            for file_path in all_files:
                # Skip XML files and metadata files (already checked above)
                if file_path.suffix.lower() in {".xml", ".rels"}:
                    continue
                if file_path.name == "[Content_Types].xml":
                    continue
                if "_rels" in file_path.parts or "docProps" in file_path.parts:
                    continue

                extension = file_path.suffix.lstrip(".").lower()
                if extension and extension not in declared_extensions:
                    # Check if it's a known media extension that should be declared
                    if extension in media_extensions:
                        relative_path = file_path.relative_to(self.unpacked_dir)
                        errors.append(
                            f"  {relative_path}: File with extension '{extension}' not declared in "
                            f"[Content_Types].xml - should add: "
                            f'<Default Extension="{extension}" ContentType="{media_extensions[extension]}"/>'
                        )

        except Exception as e:
            errors.append(f"  Error parsing [Content_Types].xml: {e}")

        if errors:
            print(f"FAILED - Found {len(errors)} content type declaration errors:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print(
                    "PASSED - All content files are properly declared in [Content_Types].xml"
                )
            return True

    def validate_file_against_xsd(self, xml_file, verbose=False):
        """Validate a single XML file against XSD schema, comparing with original.

        Args:
            xml_file: Path to XML file to validate
            verbose: Enable verbose output

        Returns:
            tuple: (is_valid, new_errors_set) where is_valid is True/False/None (skipped)
        """
        # Resolve both paths to handle symlinks
        xml_file = Path(xml_file).resolve()
        unpacked_dir = self.unpacked_dir.resolve()

        # Validate current file
        is_valid, current_errors = self._validate_single_file_xsd(
            xml_file, unpacked_dir
        )

        if is_valid is None:
            return None, set()  # Skipped
        elif is_valid:
            return True, set()  # Valid, no errors

        # Get errors from original file for this specific file
        original_errors = self._get_original_file_errors(xml_file)

        # Compare with original (both are guaranteed to be sets here)
        assert current_errors is not None
        new_errors = current_errors - original_errors

        if new_errors:
            if verbose:
                relative_path = xml_file.relative_to(unpacked_dir)
                print(f"FAILED - {relative_path}: {len(new_errors)} new error(s)")
                for error in list(new_errors)[:3]:
                    truncated = error[:250] + "..." if len(error) > 250 else error
                    print(f"  - {truncated}")
            return False, new_errors
        else:
            # All errors existed in original
            if verbose:
                print(
                    f"PASSED - No new errors (original had {len(current_errors)} errors)"
                )
            return True, set()

    def validate_against_xsd(self):
        """Validate XML files against XSD schemas, showing only new errors compared to original."""
        new_errors = []
        files_with_new_errors = 0
        original_error_count = 0
        valid_count = 0
        skipped_count = 0

        for xml_file in self.xml_files:
            relative_path = str(xml_file.relative_to(self.unpacked_dir))
            is_valid, new_file_errors = self.validate_file_against_xsd(
                xml_file, verbose=False
            )

            if is_valid is None:
                skipped_count += 1
                continue
            elif is_valid and not new_file_errors:
                valid_count += 1
                continue
            elif is_valid:
                # Had errors but all existed in original
                original_error_count += 1
                valid_count += 1
                continue

            # Has new errors
            files_with_new_errors += 1
            new_errors.append(f"  {relative_path}: {len(new_file_errors)} new error(s)")
            for error in list(new_file_errors)[:3]:  # Show first 3 errors
                new_errors.append(
                    f"    - {error[:250]}..." if len(error) > 250 else f"    - {error}"
                )

        # Print summary
        if self.verbose:
            print(f"Validated {len(self.xml_files)} files:")
            print(f"  - Valid: {valid_count}")
            print(f"  - Skipped (no schema): {skipped_count}")
            if original_error_count:
                print(f"  - With original errors (ignored): {original_error_count}")
            print(f"  - With NEW errors: {files_with_new_errors}")

        if new_errors:
            print("\nFAILED - Found NEW validation errors:")
            for error in new_errors:
                print(error)
            return False
        else:
            if self.verbose:
                print("\nPASSED - No new XSD validation errors introduced")
            return True

    def _get_schema_path(self, xml_file):
        """Determine the appropriate schema path for an XML file."""
        # Check exact filename match
        if xml_file.name in self.SCHEMA_MAPPINGS:
            return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name]

        # Check .rels files
        if xml_file.suffix == ".rels":
            return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"]

        # Check chart files
        if "charts/" in str(xml_file) and xml_file.name.startswith("chart"):
            return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"]

        # Check theme files
        if "theme/" in str(xml_file) and xml_file.name.startswith("theme"):
            return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"]

        # Check if file is in a main content folder and use appropriate schema
        if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS:
            return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name]

        return None

    def _clean_ignorable_namespaces(self, xml_doc):
        """Remove attributes and elements not in allowed namespaces."""
        # Create a clean copy
        xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
        xml_copy = lxml.etree.fromstring(xml_string)

        # Remove attributes not in allowed namespaces
        for elem in xml_copy.iter():
            attrs_to_remove = []

            for attr in elem.attrib:
                # Check if attribute is from a namespace other than allowed ones
                if "{" in attr:
                    ns = attr.split("}")[0][1:]
                    if ns not in self.OOXML_NAMESPACES:
                        attrs_to_remove.append(attr)

            # Remove collected attributes
            for attr in attrs_to_remove:
                del elem.attrib[attr]

        # Remove elements not in allowed namespaces
        self._remove_ignorable_elements(xml_copy)

        return lxml.etree.ElementTree(xml_copy)

    def _remove_ignorable_elements(self, root):
        """Recursively remove all elements not in allowed namespaces."""
        elements_to_remove = []

        # Find elements to remove
        for elem in list(root):
            # Skip non-element nodes (comments, processing instructions, etc.)
            if not hasattr(elem, "tag") or callable(elem.tag):
                continue

            tag_str = str(elem.tag)
            if tag_str.startswith("{"):
                ns = tag_str.split("}")[0][1:]
                if ns not in self.OOXML_NAMESPACES:
                    elements_to_remove.append(elem)
                    continue

            # Recursively clean child elements
            self._remove_ignorable_elements(elem)

        # Remove collected elements
        for elem in elements_to_remove:
            root.remove(elem)

    def _preprocess_for_mc_ignorable(self, xml_doc):
        """Preprocess XML to handle mc:Ignorable attribute properly."""
        # Remove mc:Ignorable attributes before validation
        root = xml_doc.getroot()

        # Remove mc:Ignorable attribute from root
        if f"{{{self.MC_NAMESPACE}}}Ignorable" in root.attrib:
            del root.attrib[f"{{{self.MC_NAMESPACE}}}Ignorable"]

        return xml_doc

    def _validate_single_file_xsd(self, xml_file, base_path):
        """Validate a single XML file against XSD schema. Returns (is_valid, errors_set)."""
        schema_path = self._get_schema_path(xml_file)
        if not schema_path:
            return None, None  # Skip file

        try:
            # Load schema
            with open(schema_path, "rb") as xsd_file:
                parser = lxml.etree.XMLParser()
                xsd_doc = lxml.etree.parse(
                    xsd_file, parser=parser, base_url=str(schema_path)
                )
                schema = lxml.etree.XMLSchema(xsd_doc)

            # Load and preprocess XML
            with open(xml_file, "r") as f:
                xml_doc = lxml.etree.parse(f)

            xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc)
            xml_doc = self._preprocess_for_mc_ignorable(xml_doc)

            # Clean ignorable namespaces if needed
            relative_path = xml_file.relative_to(base_path)
            if (
                relative_path.parts
                and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS
            ):
                xml_doc = self._clean_ignorable_namespaces(xml_doc)

            # Validate
            if schema.validate(xml_doc):
                return True, set()
            else:
                errors = set()
                for error in schema.error_log:
                    # Store normalized error message (without line numbers for comparison)
                    errors.add(error.message)
                return False, errors

        except Exception as e:
            return False, {str(e)}

    def _get_original_file_errors(self, xml_file):
        """Get XSD validation errors from a single file in the original document.

        Args:
            xml_file: Path to the XML file in unpacked_dir to check

        Returns:
            set: Set of error messages from the original file
        """
        import tempfile
        import zipfile

        # Resolve both paths to handle symlinks (e.g., /var vs /private/var on macOS)
        xml_file = Path(xml_file).resolve()
        unpacked_dir = self.unpacked_dir.resolve()
        relative_path = xml_file.relative_to(unpacked_dir)

        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Extract original file
            with zipfile.ZipFile(self.original_file, "r") as zip_ref:
                zip_ref.extractall(temp_path)

            # Find corresponding file in original
            original_xml_file = temp_path / relative_path

            if not original_xml_file.exists():
                # File didn't exist in original, so no original errors
                return set()

            # Validate the specific file in original
            is_valid, errors = self._validate_single_file_xsd(
                original_xml_file, temp_path
            )
            return errors if errors else set()

    def _remove_template_tags_from_text_nodes(self, xml_doc):
        """Remove template tags from XML text nodes and collect warnings.

        Template tags follow the pattern {{ ... }} and are used as placeholders
        for content replacement. They should be removed from text content before
        XSD validation while preserving XML structure.

        Returns:
            tuple: (cleaned_xml_doc, warnings_list)
        """
        warnings = []
        template_pattern = re.compile(r"\{\{[^}]*\}\}")

        # Create a copy of the document to avoid modifying the original
        xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
        xml_copy = lxml.etree.fromstring(xml_string)

        def process_text_content(text, content_type):
            if not text:
                return text
            matches = list(template_pattern.finditer(text))
            if matches:
                for match in matches:
                    warnings.append(
                        f"Found template tag in {content_type}: {match.group()}"
                    )
                return template_pattern.sub("", text)
            return text

        # Process all text nodes in the document
        for elem in xml_copy.iter():
            # Skip non-element nodes (comments, processing instructions, etc.)
            if not hasattr(elem, "tag") or callable(elem.tag):
                continue
            # Skip processing if this is a w:t (or other *:t) element
            tag_str = str(elem.tag)
            if tag_str.endswith("}t") or tag_str == "t":
                continue

            elem.text = process_text_content(elem.text, "text content")
            elem.tail = process_text_content(elem.tail, "tail content")

        return lxml.etree.ElementTree(xml_copy), warnings


if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")
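
Note: the target-resolution rule in validate_file_references is the easiest part to get wrong, so a worked example of what the code does (paths hypothetical). For word/_rels/document.xml.rels, targets resolve against the parent of the _rels directory:

    from pathlib import Path

    rels_file = Path("word/_rels/document.xml.rels")  # hypothetical part
    base_dir = rels_file.parent.parent                # -> word/
    print(base_dir / "media/image1.png")              # -> word/media/image1.png

For the package-root .rels, targets resolve against the package root instead.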

skills/pptx/ooxml/scripts/validation/docx.py (new file, 274 lines)
@@ -0,0 +1,274 @@
"""
Validator for Word document XML files against XSD schemas.
"""

import re
import tempfile
import zipfile

import lxml.etree

from .base import BaseSchemaValidator


class DOCXSchemaValidator(BaseSchemaValidator):
    """Validator for Word document XML files against XSD schemas."""

    # Word-specific namespace
    WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"

    # Word-specific element to relationship type mappings
    # Start with empty mapping - add specific cases as we discover them
    ELEMENT_RELATIONSHIP_TYPES = {}

    def validate(self):
        """Run all validation checks and return True if all pass."""
        # Test 0: XML well-formedness
        if not self.validate_xml():
            return False

        # Test 1: Namespace declarations
        all_valid = True
        if not self.validate_namespaces():
            all_valid = False

        # Test 2: Unique IDs
        if not self.validate_unique_ids():
            all_valid = False

        # Test 3: Relationship and file reference validation
        if not self.validate_file_references():
            all_valid = False

        # Test 4: Content type declarations
        if not self.validate_content_types():
            all_valid = False

        # Test 5: XSD schema validation
        if not self.validate_against_xsd():
            all_valid = False

        # Test 6: Whitespace preservation
        if not self.validate_whitespace_preservation():
            all_valid = False

        # Test 7: Deletion validation
        if not self.validate_deletions():
            all_valid = False

        # Test 8: Insertion validation
        if not self.validate_insertions():
            all_valid = False

        # Test 9: Relationship ID reference validation
        if not self.validate_all_relationship_ids():
            all_valid = False

        # Count and compare paragraphs
        self.compare_paragraph_counts()

        return all_valid

    def validate_whitespace_preservation(self):
        """
        Validate that w:t elements with leading or trailing whitespace have xml:space='preserve'.
        """
        errors = []

        for xml_file in self.xml_files:
            # Only check document.xml files
            if xml_file.name != "document.xml":
                continue

            try:
                root = lxml.etree.parse(str(xml_file)).getroot()

                # Find all w:t elements
                for elem in root.iter(f"{{{self.WORD_2006_NAMESPACE}}}t"):
                    if elem.text:
                        text = elem.text
                        # Check if text starts or ends with whitespace
                        if re.match(r"^\s.*", text) or re.match(r".*\s$", text):
                            # Check if xml:space="preserve" attribute exists
                            xml_space_attr = f"{{{self.XML_NAMESPACE}}}space"
                            if (
                                xml_space_attr not in elem.attrib
                                or elem.attrib[xml_space_attr] != "preserve"
                            ):
                                # Show a preview of the text
                                text_preview = (
                                    repr(text)[:50] + "..."
                                    if len(repr(text)) > 50
                                    else repr(text)
                                )
                                errors.append(
                                    f"  {xml_file.relative_to(self.unpacked_dir)}: "
                                    f"Line {elem.sourceline}: w:t element with whitespace missing xml:space='preserve': {text_preview}"
                                )

            except Exception as e:
                errors.append(
                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} whitespace preservation violations:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print("PASSED - All whitespace is properly preserved")
            return True

    def validate_deletions(self):
        """
        Validate that w:t elements are not within w:del elements.
        For some reason, XSD validation does not catch this, so we do it manually.
        """
        errors = []

        for xml_file in self.xml_files:
            # Only check document.xml files
            if xml_file.name != "document.xml":
                continue

            try:
                root = lxml.etree.parse(str(xml_file)).getroot()

                # Find all w:t elements that are descendants of w:del elements
                namespaces = {"w": self.WORD_2006_NAMESPACE}
                xpath_expression = ".//w:del//w:t"
                problematic_t_elements = root.xpath(
                    xpath_expression, namespaces=namespaces
                )
                for t_elem in problematic_t_elements:
                    if t_elem.text:
                        # Show a preview of the text
                        text_preview = (
                            repr(t_elem.text)[:50] + "..."
                            if len(repr(t_elem.text)) > 50
                            else repr(t_elem.text)
                        )
                        errors.append(
                            f"  {xml_file.relative_to(self.unpacked_dir)}: "
                            f"Line {t_elem.sourceline}: <w:t> found within <w:del>: {text_preview}"
                        )

            except Exception as e:
                errors.append(
                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} deletion validation violations:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print("PASSED - No w:t elements found within w:del elements")
            return True

    def count_paragraphs_in_unpacked(self):
        """Count the number of paragraphs in the unpacked document."""
        count = 0

        for xml_file in self.xml_files:
            # Only check document.xml files
            if xml_file.name != "document.xml":
                continue

            try:
                root = lxml.etree.parse(str(xml_file)).getroot()
                # Count all w:p elements
                paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p")
                count = len(paragraphs)
            except Exception as e:
                print(f"Error counting paragraphs in unpacked document: {e}")

        return count

    def count_paragraphs_in_original(self):
        """Count the number of paragraphs in the original docx file."""
        count = 0

        try:
            # Create temporary directory to unpack original
            with tempfile.TemporaryDirectory() as temp_dir:
                # Unpack original docx
                with zipfile.ZipFile(self.original_file, "r") as zip_ref:
                    zip_ref.extractall(temp_dir)

                # Parse document.xml
                doc_xml_path = temp_dir + "/word/document.xml"
                root = lxml.etree.parse(doc_xml_path).getroot()

                # Count all w:p elements
                paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p")
                count = len(paragraphs)

        except Exception as e:
            print(f"Error counting paragraphs in original document: {e}")

        return count

    def validate_insertions(self):
        """
        Validate that w:delText elements are not within w:ins elements.
        w:delText is only allowed in w:ins if nested within a w:del.
        """
        errors = []

        for xml_file in self.xml_files:
            if xml_file.name != "document.xml":
                continue

            try:
                root = lxml.etree.parse(str(xml_file)).getroot()
                namespaces = {"w": self.WORD_2006_NAMESPACE}

                # Find w:delText in w:ins that are NOT within w:del
                invalid_elements = root.xpath(
                    ".//w:ins//w:delText[not(ancestor::w:del)]",
                    namespaces=namespaces,
                )

                for elem in invalid_elements:
                    text_preview = (
                        repr(elem.text or "")[:50] + "..."
                        if len(repr(elem.text or "")) > 50
                        else repr(elem.text or "")
                    )
                    errors.append(
                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
                        f"Line {elem.sourceline}: <w:delText> within <w:ins>: {text_preview}"
                    )

            except Exception as e:
                errors.append(
                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} insertion validation violations:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print("PASSED - No w:delText elements within w:ins elements")
            return True

    def compare_paragraph_counts(self):
        """Compare paragraph counts between original and new document."""
        original_count = self.count_paragraphs_in_original()
        new_count = self.count_paragraphs_in_unpacked()

        diff = new_count - original_count
        diff_str = f"+{diff}" if diff > 0 else str(diff)
        print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})")


if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")
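
Note: the whitespace check exists because Word drops leading/trailing spaces in w:t runs unless xml:space="preserve" is set; a standalone sketch of the same condition the validator applies:

    import re

    def needs_preserve(text):
        # Same test validate_whitespace_preservation applies to w:t text.
        return bool(re.match(r"^\s.*", text) or re.match(r".*\s$", text))

    assert needs_preserve("Hello ")    # '<w:t>Hello </w:t>' needs xml:space='preserve'
    assert not needs_preserve("Hello")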

skills/pptx/ooxml/scripts/validation/pptx.py (new file, 315 lines)
@@ -0,0 +1,315 @@
|
||||
"""
|
||||
Validator for PowerPoint presentation XML files against XSD schemas.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from .base import BaseSchemaValidator
|
||||
|
||||
|
||||
class PPTXSchemaValidator(BaseSchemaValidator):
|
||||
"""Validator for PowerPoint presentation XML files against XSD schemas."""
|
||||
|
||||
# PowerPoint presentation namespace
|
||||
PRESENTATIONML_NAMESPACE = (
|
||||
"http://schemas.openxmlformats.org/presentationml/2006/main"
|
||||
)
|
||||
|
||||
# PowerPoint-specific element to relationship type mappings
|
||||
ELEMENT_RELATIONSHIP_TYPES = {
|
||||
"sldid": "slide",
|
||||
"sldmasterid": "slidemaster",
|
||||
"notesmasterid": "notesmaster",
|
||||
"sldlayoutid": "slidelayout",
|
||||
"themeid": "theme",
|
||||
"tablestyleid": "tablestyles",
|
||||
}
|
||||
|
||||
def validate(self):
|
||||
"""Run all validation checks and return True if all pass."""
|
||||
# Test 0: XML well-formedness
|
||||
if not self.validate_xml():
|
||||
return False
|
||||
|
||||
# Test 1: Namespace declarations
|
||||
all_valid = True
|
||||
if not self.validate_namespaces():
|
||||
all_valid = False
|
||||
|
||||
# Test 2: Unique IDs
|
||||
if not self.validate_unique_ids():
|
||||
all_valid = False
|
||||
|
||||
# Test 3: UUID ID validation
|
||||
if not self.validate_uuid_ids():
|
||||
all_valid = False
|
||||
|
||||
# Test 4: Relationship and file reference validation
|
||||
if not self.validate_file_references():
|
||||
all_valid = False
|
||||
|
||||
# Test 5: Slide layout ID validation
|
||||
if not self.validate_slide_layout_ids():
|
||||
all_valid = False
|
||||
|
||||
# Test 6: Content type declarations
|
||||
if not self.validate_content_types():
|
||||
all_valid = False
|
||||
|
||||
# Test 7: XSD schema validation
|
||||
if not self.validate_against_xsd():
|
||||
all_valid = False
|
||||
|
||||
# Test 8: Notes slide reference validation
|
||||
if not self.validate_notes_slide_references():
|
||||
all_valid = False
|
||||
|
||||
# Test 9: Relationship ID reference validation
|
||||
if not self.validate_all_relationship_ids():
|
||||
all_valid = False
|
||||
|
||||
# Test 10: Duplicate slide layout references validation
|
||||
if not self.validate_no_duplicate_slide_layouts():
|
||||
all_valid = False
|
||||
|
||||
return all_valid
|
||||
|
||||
    def validate_uuid_ids(self):
        """Validate that ID attributes that look like UUIDs contain only hex values."""
        import lxml.etree

        errors = []
        # UUID pattern: 8-4-4-4-12 hex digits with optional braces/hyphens
        uuid_pattern = re.compile(
            r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$"
        )

        for xml_file in self.xml_files:
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()

                # Check all elements for ID attributes
                for elem in root.iter():
                    for attr, value in elem.attrib.items():
                        # Check if this is an ID attribute
                        attr_name = attr.split("}")[-1].lower()
                        if attr_name == "id" or attr_name.endswith("id"):
                            # Check if value looks like a UUID (right length and structure)
                            if self._looks_like_uuid(value):
                                # Validate that it contains only hex characters in the right positions
                                if not uuid_pattern.match(value):
                                    errors.append(
                                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
                                        f"Line {elem.sourceline}: ID '{value}' appears to be a UUID but contains invalid hex characters"
                                    )

            except Exception as e:
                errors.append(
                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} UUID ID validation errors:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print("PASSED - All UUID-like IDs contain valid hex values")
            return True
    def _looks_like_uuid(self, value):
        """Check if a value has the general structure of a UUID."""
        # Remove common UUID delimiters
        clean_value = value.strip("{}()").replace("-", "")
        # Check if it's 32 hex-like characters (could include invalid hex chars)
        return len(clean_value) == 32 and all(c.isalnum() for c in clean_value)
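    # Worked example (hypothetical values): "{3C9E2E5B-7A4D-4B8E-9F01-ABCDEF012345}"
    # passes both _looks_like_uuid and uuid_pattern, so it is accepted.
    # "{3C9E2E5B-7A4D-4B8E-9F01-GHIJKL012345}" strips to 32 alphanumeric
    # characters, so _looks_like_uuid returns True, but G-L are not hex
    # digits, so uuid_pattern fails and validate_uuid_ids reports an error.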
    def validate_slide_layout_ids(self):
        """Validate that sldLayoutId elements in slide masters reference valid slide layouts."""
        import lxml.etree

        errors = []

        # Find all slide master files
        slide_masters = list(self.unpacked_dir.glob("ppt/slideMasters/*.xml"))

        if not slide_masters:
            if self.verbose:
                print("PASSED - No slide masters found")
            return True

        for slide_master in slide_masters:
            try:
                # Parse the slide master file
                root = lxml.etree.parse(str(slide_master)).getroot()

                # Find the corresponding _rels file for this slide master
                rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels"

                if not rels_file.exists():
                    errors.append(
                        f"  {slide_master.relative_to(self.unpacked_dir)}: "
                        f"Missing relationships file: {rels_file.relative_to(self.unpacked_dir)}"
                    )
                    continue

                # Parse the relationships file
                rels_root = lxml.etree.parse(str(rels_file)).getroot()

                # Build a set of valid relationship IDs that point to slide layouts
                valid_layout_rids = set()
                for rel in rels_root.findall(
                    f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                ):
                    rel_type = rel.get("Type", "")
                    if "slideLayout" in rel_type:
                        valid_layout_rids.add(rel.get("Id"))

                # Find all sldLayoutId elements in the slide master
                for sld_layout_id in root.findall(
                    f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId"
                ):
                    r_id = sld_layout_id.get(
                        f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id"
                    )
                    layout_id = sld_layout_id.get("id")

                    if r_id and r_id not in valid_layout_rids:
                        errors.append(
                            f"  {slide_master.relative_to(self.unpacked_dir)}: "
                            f"Line {sld_layout_id.sourceline}: sldLayoutId with id='{layout_id}' "
                            f"references r:id='{r_id}' which is not found in slide layout relationships"
                        )

            except Exception as e:
                errors.append(
                    f"  {slide_master.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} slide layout ID validation errors:")
            for error in errors:
                print(error)
            print(
                "Remove invalid references or add missing slide layouts to the relationships file."
            )
            return False
        else:
            if self.verbose:
                print("PASSED - All slide layout IDs reference valid slide layouts")
            return True
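    # For reference, the cross-check above ties together two files, roughly
    # as follows (IDs illustrative; r: is the officeDocument relationships
    # namespace):
    #
    #   ppt/slideMasters/slideMaster1.xml:
    #       <p:sldLayoutIdLst>
    #           <p:sldLayoutId id="2147483649" r:id="rId1"/>
    #       </p:sldLayoutIdLst>
    #   ppt/slideMasters/_rels/slideMaster1.xml.rels:
    #       <Relationship Id="rId1" Type=".../relationships/slideLayout"
    #                     Target="../slideLayouts/slideLayout1.xml"/>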
    def validate_no_duplicate_slide_layouts(self):
        """Validate that no slide references more than one slideLayout."""
        import lxml.etree

        errors = []
        slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels"))

        for rels_file in slide_rels_files:
            try:
                root = lxml.etree.parse(str(rels_file)).getroot()

                # Find all slideLayout relationships
                layout_rels = [
                    rel
                    for rel in root.findall(
                        f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                    )
                    if "slideLayout" in rel.get("Type", "")
                ]

                if len(layout_rels) > 1:
                    errors.append(
                        f"  {rels_file.relative_to(self.unpacked_dir)}: has {len(layout_rels)} slideLayout references"
                    )

            except Exception as e:
                errors.append(
                    f"  {rels_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print("FAILED - Found slides with duplicate slideLayout references:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print("PASSED - No slides have duplicate slideLayout references")
            return True
    def validate_notes_slide_references(self):
        """Validate that each notesSlide file is referenced by only one slide."""
        import lxml.etree

        errors = []
        notes_slide_references = {}  # Track which slides reference each notesSlide

        # Find all slide relationship files
        slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels"))

        if not slide_rels_files:
            if self.verbose:
                print("PASSED - No slide relationship files found")
            return True

        for rels_file in slide_rels_files:
            try:
                # Parse the relationships file
                root = lxml.etree.parse(str(rels_file)).getroot()

                # Find all notesSlide relationships
                for rel in root.findall(
                    f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                ):
                    rel_type = rel.get("Type", "")
                    if "notesSlide" in rel_type:
                        target = rel.get("Target", "")
                        if target:
                            # Normalize the target path to handle relative paths
                            normalized_target = target.replace("../", "")

                            # Track which slide references this notesSlide
                            slide_name = rels_file.stem.replace(
                                ".xml", ""
                            )  # e.g., "slide1"

                            if normalized_target not in notes_slide_references:
                                notes_slide_references[normalized_target] = []
                            notes_slide_references[normalized_target].append(
                                (slide_name, rels_file)
                            )

            except Exception as e:
                errors.append(
                    f"  {rels_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        # Check for duplicate references
        for target, references in notes_slide_references.items():
            if len(references) > 1:
                slide_names = [ref[0] for ref in references]
                errors.append(
                    f"  Notes slide '{target}' is referenced by multiple slides: {', '.join(slide_names)}"
                )
                for slide_name, rels_file in references:
                    errors.append(f"    - {rels_file.relative_to(self.unpacked_dir)}")

        if errors:
            print(
                f"FAILED - Found {len([e for e in errors if not e.startswith('    ')])} notes slide reference validation errors:"
            )
            for error in errors:
                print(error)
            print("Each slide may optionally have its own notes slide file, but a notes slide must not be shared between slides.")
            return False
        else:
            if self.verbose:
                print("PASSED - All notes slide references are unique")
            return True
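    # A failing case would be two slide .rels files whose notesSlide
    # relationships point at the same target (illustrative):
    #
    #     <Relationship Id="rId2" Type=".../relationships/notesSlide"
    #                   Target="../notesSlides/notesSlide1.xml"/>
    #
    # appearing in both slide1.xml.rels and slide2.xml.rels.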


if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")
279
skills/pptx/ooxml/scripts/validation/redlining.py
Normal file
@@ -0,0 +1,279 @@
"""
Validator for tracked changes in Word documents.
"""

import subprocess
import tempfile
import zipfile
from pathlib import Path


class RedliningValidator:
    """Validator for tracked changes in Word documents."""

    def __init__(self, unpacked_dir, original_docx, verbose=False):
        self.unpacked_dir = Path(unpacked_dir)
        self.original_docx = Path(original_docx)
        self.verbose = verbose
        self.namespaces = {
            "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
        }
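    # Usage sketch (hypothetical paths): compare an edited, unpacked document
    # against the pristine .docx it was extracted from.
    #
    #     validator = RedliningValidator("unpacked_docx", "original.docx", verbose=True)
    #     ok = validator.validate()  # False if any untracked edits slipped in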
    def validate(self):
        """Main validation method that returns True if valid, False otherwise."""
        import xml.etree.ElementTree as ET

        # Verify unpacked directory exists and has correct structure
        modified_file = self.unpacked_dir / "word" / "document.xml"
        if not modified_file.exists():
            print(f"FAILED - Modified document.xml not found at {modified_file}")
            return False

        # First, check if there are any tracked changes by Claude to validate
        try:
            tree = ET.parse(modified_file)
            root = tree.getroot()

            # Check for w:del or w:ins tags authored by Claude
            del_elements = root.findall(".//w:del", self.namespaces)
            ins_elements = root.findall(".//w:ins", self.namespaces)

            # Filter to only include changes by Claude
            claude_del_elements = [
                elem
                for elem in del_elements
                if elem.get(f"{{{self.namespaces['w']}}}author") == "Claude"
            ]
            claude_ins_elements = [
                elem
                for elem in ins_elements
                if elem.get(f"{{{self.namespaces['w']}}}author") == "Claude"
            ]

            # Redlining validation is only needed if tracked changes by Claude have been used.
            if not claude_del_elements and not claude_ins_elements:
                if self.verbose:
                    print("PASSED - No tracked changes by Claude found.")
                return True

        except Exception:
            # If we can't parse the XML, continue with full validation
            pass

        # Create temporary directory for unpacking original docx
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Unpack original docx
            try:
                with zipfile.ZipFile(self.original_docx, "r") as zip_ref:
                    zip_ref.extractall(temp_path)
            except Exception as e:
                print(f"FAILED - Error unpacking original docx: {e}")
                return False

            original_file = temp_path / "word" / "document.xml"
            if not original_file.exists():
                print(
                    f"FAILED - Original document.xml not found in {self.original_docx}"
                )
                return False

            # Parse both XML files for redlining validation
            try:
                modified_tree = ET.parse(modified_file)
                modified_root = modified_tree.getroot()
                original_tree = ET.parse(original_file)
                original_root = original_tree.getroot()
            except ET.ParseError as e:
                print(f"FAILED - Error parsing XML files: {e}")
                return False

            # Remove Claude's tracked changes from both documents
            self._remove_claude_tracked_changes(original_root)
            self._remove_claude_tracked_changes(modified_root)

            # Extract and compare text content
            modified_text = self._extract_text_content(modified_root)
            original_text = self._extract_text_content(original_root)

            if modified_text != original_text:
                # Show detailed character-level differences for each paragraph
                error_message = self._generate_detailed_diff(
                    original_text, modified_text
                )
                print(error_message)
                return False

        if self.verbose:
            print("PASSED - All changes by Claude are properly tracked")
        return True
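    # The invariant checked above, on a minimal illustrative fragment: given
    #
    #     <w:ins w:author="Claude"><w:r><w:t>new</w:t></w:r></w:ins>
    #     <w:del w:author="Claude"><w:r><w:delText>old</w:delText></w:r></w:del>
    #
    # removing the w:ins and unwrapping the w:del (w:delText becomes w:t)
    # must reproduce the original text "old" exactly; any difference means
    # an edit was made outside tracked changes.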
    def _generate_detailed_diff(self, original_text, modified_text):
        """Generate detailed word-level differences using git word diff."""
        error_parts = [
            "FAILED - Document text doesn't match after removing Claude's tracked changes",
            "",
            "Likely causes:",
            "  1. Modified text inside another author's <w:ins> or <w:del> tags",
            "  2. Made edits without proper tracked changes",
            "  3. Didn't nest <w:del> inside <w:ins> when deleting another's insertion",
            "",
            "For pre-redlined documents, use correct patterns:",
            "  - To reject another's INSERTION: Nest <w:del> inside their <w:ins>",
            "  - To restore another's DELETION: Add new <w:ins> AFTER their <w:del>",
            "",
        ]

        # Show git word diff
        git_diff = self._get_git_word_diff(original_text, modified_text)
        if git_diff:
            error_parts.extend(["Differences:", "============", git_diff])
        else:
            error_parts.append("Unable to generate word diff (git not available)")

        return "\n".join(error_parts)
    def _get_git_word_diff(self, original_text, modified_text):
        """Generate word diff using git with character-level precision."""

        def strip_diff_headers(output):
            # Keep only the content after the first @@ hunk marker, skipping
            # git's "diff --git", "index", "---", and "+++" header lines.
            content_lines = []
            in_content = False
            for line in output.split("\n"):
                if line.startswith("@@"):
                    in_content = True
                    continue
                if in_content and line.strip():
                    content_lines.append(line)
            return content_lines

        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = Path(temp_dir)

                # Create two files to diff
                original_file = temp_path / "original.txt"
                modified_file = temp_path / "modified.txt"
                original_file.write_text(original_text, encoding="utf-8")
                modified_file.write_text(modified_text, encoding="utf-8")

                # Try character-level diff first for precise differences
                result = subprocess.run(
                    [
                        "git",
                        "diff",
                        "--word-diff=plain",
                        "--word-diff-regex=.",  # Character-by-character diff
                        "-U0",  # Zero lines of context - show only changed lines
                        "--no-index",
                        str(original_file),
                        str(modified_file),
                    ],
                    capture_output=True,
                    text=True,
                )

                if result.stdout.strip():
                    content_lines = strip_diff_headers(result.stdout)
                    if content_lines:
                        return "\n".join(content_lines)

                # Fall back to word-level diff if the character-level pass
                # produced no usable content
                result = subprocess.run(
                    [
                        "git",
                        "diff",
                        "--word-diff=plain",
                        "-U0",  # Zero lines of context
                        "--no-index",
                        str(original_file),
                        str(modified_file),
                    ],
                    capture_output=True,
                    text=True,
                )

                if result.stdout.strip():
                    return "\n".join(strip_diff_headers(result.stdout))

        except Exception:
            # Git not available or other error; return None so the caller
            # falls back to a plain message
            pass

        return None
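    # Equivalent command, runnable by hand when debugging (paths illustrative):
    #
    #     git diff --word-diff=plain --word-diff-regex=. -U0 --no-index \
    #         original.txt modified.txt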
    def _remove_claude_tracked_changes(self, root):
        """Remove tracked changes authored by Claude from the XML root."""
        ins_tag = f"{{{self.namespaces['w']}}}ins"
        del_tag = f"{{{self.namespaces['w']}}}del"
        author_attr = f"{{{self.namespaces['w']}}}author"

        # Remove w:ins elements authored by Claude
        for parent in root.iter():
            to_remove = []
            for child in parent:
                if child.tag == ins_tag and child.get(author_attr) == "Claude":
                    to_remove.append(child)
            for elem in to_remove:
                parent.remove(elem)

        # Unwrap content in w:del elements where author is "Claude"
        deltext_tag = f"{{{self.namespaces['w']}}}delText"
        t_tag = f"{{{self.namespaces['w']}}}t"

        for parent in root.iter():
            to_process = []
            for child in parent:
                if child.tag == del_tag and child.get(author_attr) == "Claude":
                    to_process.append((child, list(parent).index(child)))

            # Process in reverse order to maintain indices
            for del_elem, del_index in reversed(to_process):
                # Convert w:delText to w:t before moving
                for elem in del_elem.iter():
                    if elem.tag == deltext_tag:
                        elem.tag = t_tag

                # Move all children of w:del to its parent before removing w:del
                for child in reversed(list(del_elem)):
                    parent.insert(del_index, child)
                parent.remove(del_elem)
    def _extract_text_content(self, root):
        """Extract text content from Word XML, preserving paragraph structure.

        Empty paragraphs are skipped to avoid false positives when tracked
        insertions add only structural elements without text content.
        """
        p_tag = f"{{{self.namespaces['w']}}}p"
        t_tag = f"{{{self.namespaces['w']}}}t"

        paragraphs = []
        for p_elem in root.findall(f".//{p_tag}"):
            # Get all text elements within this paragraph
            text_parts = []
            for t_elem in p_elem.findall(f".//{t_tag}"):
                if t_elem.text:
                    text_parts.append(t_elem.text)
            paragraph_text = "".join(text_parts)
            # Skip empty paragraphs - they don't affect content validation
            if paragraph_text:
                paragraphs.append(paragraph_text)

        return "\n".join(paragraphs)


if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")