Initial commit
This commit is contained in:
315
skills/docx/ooxml/scripts/validation/pptx.py
Normal file
315
skills/docx/ooxml/scripts/validation/pptx.py
Normal file
@@ -0,0 +1,315 @@
|
||||
"""
|
||||
Validator for PowerPoint presentation XML files against XSD schemas.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from .base import BaseSchemaValidator
|
||||
|
||||
|
||||
class PPTXSchemaValidator(BaseSchemaValidator):
    """Validator for PowerPoint presentation XML files against XSD schemas.

    Adds PowerPoint-specific checks (UUID-shaped IDs, slide layout
    references, notes slide references) to the checks it calls on ``self``
    that are not defined in this class (``validate_xml``,
    ``validate_namespaces``, ``validate_against_xsd``, ...).

    The attributes ``xml_files``, ``unpacked_dir``, ``verbose``,
    ``PACKAGE_RELATIONSHIPS_NAMESPACE`` and ``OFFICE_RELATIONSHIPS_NAMESPACE``
    are read here but not set here; presumably ``BaseSchemaValidator``
    provides them -- confirm against the base class.
    """

    # PowerPoint presentation namespace
    PRESENTATIONML_NAMESPACE = (
        "http://schemas.openxmlformats.org/presentationml/2006/main"
    )

    # PowerPoint-specific element to relationship type mappings
    ELEMENT_RELATIONSHIP_TYPES = {
        "sldid": "slide",
        "sldmasterid": "slidemaster",
        "notesmasterid": "notesmaster",
        "sldlayoutid": "slidelayout",
        "themeid": "theme",
        "tablestyleid": "tablestyles",
    }

    def validate(self):
        """Run all validation checks and return True if all pass.

        A failed well-formedness check aborts immediately.  Every other
        check is always executed, even after an earlier one has failed, so
        that a single run reports as many problems as possible.

        Returns:
            bool: True when every check passed, False otherwise.
        """
        # Test 0: XML well-formedness
        if not self.validate_xml():
            return False

        checks = (
            self.validate_namespaces,  # Test 1: Namespace declarations
            self.validate_unique_ids,  # Test 2: Unique IDs
            self.validate_uuid_ids,  # Test 3: UUID ID validation
            self.validate_file_references,  # Test 4: Relationship and file references
            self.validate_slide_layout_ids,  # Test 5: Slide layout ID validation
            self.validate_content_types,  # Test 6: Content type declarations
            self.validate_against_xsd,  # Test 7: XSD schema validation
            self.validate_notes_slide_references,  # Test 8: Notes slide references
            self.validate_all_relationship_ids,  # Test 9: Relationship ID references
            self.validate_no_duplicate_slide_layouts,  # Test 10: Duplicate slide layouts
        )

        # Materialise the results before combining them: feeding a generator
        # to all() would stop at the first failure and skip later checks.
        outcomes = [check() for check in checks]
        return all(outcomes)

    def validate_uuid_ids(self):
        """Validate that ID attributes that look like UUIDs contain only hex values.

        Every attribute whose local name ends in "id" (case-insensitive) is
        inspected in each file of ``self.xml_files``.  Values that have the
        overall shape of a UUID but do not match the strict hex pattern are
        reported, as is any exception raised while processing a file.

        Returns:
            bool: True when no problems were found, False otherwise.
        """
        import lxml.etree

        errors = []
        # UUID pattern: 8-4-4-4-12 hex digits with optional braces/hyphens
        uuid_pattern = re.compile(
            r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$"
        )

        for xml_file in self.xml_files:
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()

                # Check all elements for ID attributes
                for elem in root.iter():
                    for attr, value in elem.attrib.items():
                        # Drop a leading "{namespace}" so only the local
                        # attribute name is compared.
                        attr_name = attr.split("}")[-1].lower()
                        if not attr_name.endswith("id"):
                            continue

                        # Only values shaped like a UUID are held to the
                        # strict hex pattern; other IDs are left alone.
                        if self._looks_like_uuid(value) and not uuid_pattern.match(
                            value
                        ):
                            errors.append(
                                f" {xml_file.relative_to(self.unpacked_dir)}: "
                                f"Line {elem.sourceline}: ID '{value}' appears to be a UUID but contains invalid hex characters"
                            )

            except Exception as e:
                # Best effort: a file that cannot be processed becomes a
                # reported error instead of aborting the whole check.
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} UUID ID validation errors:")
            for error in errors:
                print(error)
            return False

        if self.verbose:
            print("PASSED - All UUID-like IDs contain valid hex values")
        return True

    def _looks_like_uuid(self, value):
        """Check if a value has the general structure of a UUID.

        Args:
            value: Attribute value to inspect (a string).

        Returns:
            bool: True when, after stripping surrounding braces/parentheses
            and removing hyphens, exactly 32 alphanumeric characters remain.
            The characters are deliberately not required to be hex digits,
            so malformed UUIDs are still recognised as UUID-shaped.
        """
        # Remove common UUID delimiters
        clean_value = value.strip("{}()").replace("-", "")
        # Check if it's 32 hex-like characters (could include invalid hex chars)
        return len(clean_value) == 32 and all(c.isalnum() for c in clean_value)

    def validate_slide_layout_ids(self):
        """Validate that sldLayoutId elements in slide masters reference valid slide layouts.

        For each ``ppt/slideMasters/*.xml`` file the matching
        ``_rels/<name>.rels`` file is read, and every ``sldLayoutId`` whose
        ``r:id`` is not the Id of a relationship with "slideLayout" in its
        Type is reported.  A missing ``.rels`` file is reported as well.

        Returns:
            bool: True when no problems were found (or there are no slide
            masters), False otherwise.
        """
        import lxml.etree

        errors = []

        # Find all slide master files
        slide_masters = list(self.unpacked_dir.glob("ppt/slideMasters/*.xml"))

        if not slide_masters:
            if self.verbose:
                print("PASSED - No slide masters found")
            return True

        for slide_master in slide_masters:
            try:
                # Parse the slide master file
                root = lxml.etree.parse(str(slide_master)).getroot()

                # Find the corresponding _rels file for this slide master
                rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels"

                if not rels_file.exists():
                    errors.append(
                        f" {slide_master.relative_to(self.unpacked_dir)}: "
                        f"Missing relationships file: {rels_file.relative_to(self.unpacked_dir)}"
                    )
                    continue

                # Parse the relationships file
                rels_root = lxml.etree.parse(str(rels_file)).getroot()

                # Build a set of valid relationship IDs that point to slide layouts
                valid_layout_rids = {
                    rel.get("Id")
                    for rel in rels_root.findall(
                        f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                    )
                    if "slideLayout" in rel.get("Type", "")
                }

                # Find all sldLayoutId elements in the slide master
                for sld_layout_id in root.findall(
                    f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId"
                ):
                    r_id = sld_layout_id.get(
                        f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id"
                    )
                    layout_id = sld_layout_id.get("id")

                    # Elements without an r:id are not judged by this check.
                    if r_id and r_id not in valid_layout_rids:
                        errors.append(
                            f" {slide_master.relative_to(self.unpacked_dir)}: "
                            f"Line {sld_layout_id.sourceline}: sldLayoutId with id='{layout_id}' "
                            f"references r:id='{r_id}' which is not found in slide layout relationships"
                        )

            except Exception as e:
                # Best effort: report the failure for this master and go on.
                errors.append(
                    f" {slide_master.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} slide layout ID validation errors:")
            for error in errors:
                print(error)
            print(
                "Remove invalid references or add missing slide layouts to the relationships file."
            )
            return False

        if self.verbose:
            print("PASSED - All slide layout IDs reference valid slide layouts")
        return True

    def validate_no_duplicate_slide_layouts(self):
        """Validate that no slide has more than one slideLayout reference.

        Each ``ppt/slides/_rels/*.xml.rels`` file is scanned for
        relationships with "slideLayout" in their Type; a file with more
        than one is reported.  A file with none is not flagged here.

        Returns:
            bool: True when no problems were found, False otherwise.
        """
        import lxml.etree

        errors = []
        slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels"))

        for rels_file in slide_rels_files:
            try:
                root = lxml.etree.parse(str(rels_file)).getroot()

                # Find all slideLayout relationships
                layout_rels = [
                    rel
                    for rel in root.findall(
                        f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                    )
                    if "slideLayout" in rel.get("Type", "")
                ]

                if len(layout_rels) > 1:
                    errors.append(
                        f" {rels_file.relative_to(self.unpacked_dir)}: has {len(layout_rels)} slideLayout references"
                    )

            except Exception as e:
                # Best effort: report the failure for this file and go on.
                errors.append(
                    f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print("FAILED - Found slides with duplicate slideLayout references:")
            for error in errors:
                print(error)
            return False

        if self.verbose:
            print("PASSED - All slides have exactly one slideLayout reference")
        return True

    def validate_notes_slide_references(self):
        """Validate that each notesSlide file is referenced by only one slide.

        Every ``ppt/slides/_rels/*.xml.rels`` file is scanned for
        relationships with "notesSlide" in their Type.  Targets are compared
        after removing ``../`` segments; a target referenced from more than
        one slide is reported together with the referencing files.

        Returns:
            bool: True when no problems were found (or there are no slide
            relationship files), False otherwise.
        """
        import lxml.etree

        errors = []
        # Number of distinct problems.  Tracked explicitly because ``errors``
        # also holds per-file detail lines that must not be counted, and the
        # message prefixes are not a reliable way to tell the two apart.
        problem_count = 0
        notes_slide_references = {}  # Track which slides reference each notesSlide

        # Find all slide relationship files
        slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels"))

        if not slide_rels_files:
            if self.verbose:
                print("PASSED - No slide relationship files found")
            return True

        for rels_file in slide_rels_files:
            try:
                # Parse the relationships file
                root = lxml.etree.parse(str(rels_file)).getroot()

                # Find all notesSlide relationships
                for rel in root.findall(
                    f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                ):
                    if "notesSlide" not in rel.get("Type", ""):
                        continue

                    target = rel.get("Target", "")
                    if not target:
                        continue

                    # Normalize the target path to handle relative paths
                    normalized_target = target.replace("../", "")

                    # Track which slide references this notesSlide
                    slide_name = rels_file.stem.replace(".xml", "")  # e.g., "slide1"
                    notes_slide_references.setdefault(normalized_target, []).append(
                        (slide_name, rels_file)
                    )

            except Exception as e:
                # Best effort: report the failure for this file and go on.
                errors.append(
                    f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )
                problem_count += 1

        # Check for duplicate references
        for target, references in notes_slide_references.items():
            if len(references) > 1:
                slide_names = [ref[0] for ref in references]
                errors.append(
                    f" Notes slide '{target}' is referenced by multiple slides: {', '.join(slide_names)}"
                )
                problem_count += 1
                # Detail lines: listed under the problem, not counted.
                for _slide_name, rels_file in references:
                    errors.append(f" - {rels_file.relative_to(self.unpacked_dir)}")

        if errors:
            print(
                f"FAILED - Found {problem_count} notes slide reference validation errors:"
            )
            for error in errors:
                print(error)
            print("Each slide may optionally have its own notes slide file.")
            return False

        if self.verbose:
            print("PASSED - All notes slide references are unique")
        return True
||||
|
||||
|
||||
if __name__ == "__main__":
    # Executing this file as a script does nothing except fail with an
    # explicit error.
    message = "This module should not be run directly."
    raise RuntimeError(message)
|
||||
Reference in New Issue
Block a user