#!/usr/bin/env python3 """ Extract structured text content from PowerPoint presentations. This module provides functionality to: - Extract all text content from PowerPoint shapes - Preserve paragraph formatting (alignment, bullets, fonts, spacing) - Handle nested GroupShapes recursively with correct absolute positions - Sort shapes by visual position on slides - Filter out slide numbers and non-content placeholders - Export to JSON with clean, structured data Classes: ParagraphData: Represents a text paragraph with formatting ShapeData: Represents a shape with position and text content Main Functions: extract_text_inventory: Extract all text from a presentation save_inventory: Save extracted data to JSON Usage: python inventory.py input.pptx output.json """ import argparse import json import platform import sys from dataclasses import dataclass from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union from PIL import Image, ImageDraw, ImageFont from pptx import Presentation from pptx.enum.text import PP_ALIGN from pptx.shapes.base import BaseShape # Type aliases for cleaner signatures JsonValue = Union[str, int, float, bool, None] ParagraphDict = Dict[str, JsonValue] ShapeDict = Dict[ str, Union[str, float, bool, List[ParagraphDict], List[str], Dict[str, Any], None] ] InventoryData = Dict[ str, Dict[str, "ShapeData"] ] # Dict of slide_id -> {shape_id -> ShapeData} InventoryDict = Dict[str, Dict[str, ShapeDict]] # JSON-serializable inventory def main(): """Main entry point for command-line usage.""" parser = argparse.ArgumentParser( description="Extract text inventory from PowerPoint with proper GroupShape support.", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: python inventory.py presentation.pptx inventory.json Extracts text inventory with correct absolute positions for grouped shapes python inventory.py presentation.pptx inventory.json --issues-only Extracts only text shapes that have overflow or overlap issues The output JSON includes: - All text content organized by slide and shape - Correct absolute positions for shapes in groups - Visual position and size in inches - Paragraph properties and formatting - Issue detection: text overflow and shape overlaps """, ) parser.add_argument("input", help="Input PowerPoint file (.pptx)") parser.add_argument("output", help="Output JSON file for inventory") parser.add_argument( "--issues-only", action="store_true", help="Include only text shapes that have overflow or overlap issues", ) args = parser.parse_args() input_path = Path(args.input) if not input_path.exists(): print(f"Error: Input file not found: {args.input}") sys.exit(1) if not input_path.suffix.lower() == ".pptx": print("Error: Input must be a PowerPoint file (.pptx)") sys.exit(1) try: print(f"Extracting text inventory from: {args.input}") if args.issues_only: print( "Filtering to include only text shapes with issues (overflow/overlap)" ) inventory = extract_text_inventory(input_path, issues_only=args.issues_only) output_path = Path(args.output) output_path.parent.mkdir(parents=True, exist_ok=True) save_inventory(inventory, output_path) print(f"Output saved to: {args.output}") # Report statistics total_slides = len(inventory) total_shapes = sum(len(shapes) for shapes in inventory.values()) if args.issues_only: if total_shapes > 0: print( f"Found {total_shapes} text elements with issues in {total_slides} slides" ) else: print("No issues discovered") else: print( f"Found text in {total_slides} slides with {total_shapes} text elements" ) except Exception as e: print(f"Error processing presentation: {e}") import traceback traceback.print_exc() sys.exit(1) @dataclass class ShapeWithPosition: """A shape with its absolute position on the slide.""" shape: BaseShape absolute_left: int # in EMUs absolute_top: int # in EMUs class ParagraphData: """Data structure for paragraph properties extracted from a PowerPoint paragraph.""" def __init__(self, paragraph: Any): """Initialize from a PowerPoint paragraph object. Args: paragraph: The PowerPoint paragraph object """ self.text: str = paragraph.text.strip() self.bullet: bool = False self.level: Optional[int] = None self.alignment: Optional[str] = None self.space_before: Optional[float] = None self.space_after: Optional[float] = None self.font_name: Optional[str] = None self.font_size: Optional[float] = None self.bold: Optional[bool] = None self.italic: Optional[bool] = None self.underline: Optional[bool] = None self.color: Optional[str] = None self.theme_color: Optional[str] = None self.line_spacing: Optional[float] = None # Check for bullet formatting if ( hasattr(paragraph, "_p") and paragraph._p is not None and paragraph._p.pPr is not None ): pPr = paragraph._p.pPr ns = "{http://schemas.openxmlformats.org/drawingml/2006/main}" if ( pPr.find(f"{ns}buChar") is not None or pPr.find(f"{ns}buAutoNum") is not None ): self.bullet = True if hasattr(paragraph, "level"): self.level = paragraph.level # Add alignment if not LEFT (default) if hasattr(paragraph, "alignment") and paragraph.alignment is not None: alignment_map = { PP_ALIGN.CENTER: "CENTER", PP_ALIGN.RIGHT: "RIGHT", PP_ALIGN.JUSTIFY: "JUSTIFY", } if paragraph.alignment in alignment_map: self.alignment = alignment_map[paragraph.alignment] # Add spacing properties if set if hasattr(paragraph, "space_before") and paragraph.space_before: self.space_before = paragraph.space_before.pt if hasattr(paragraph, "space_after") and paragraph.space_after: self.space_after = paragraph.space_after.pt # Extract font properties from first run if paragraph.runs: first_run = paragraph.runs[0] if hasattr(first_run, "font"): font = first_run.font if font.name: self.font_name = font.name if font.size: self.font_size = font.size.pt if font.bold is not None: self.bold = font.bold if font.italic is not None: self.italic = font.italic if font.underline is not None: self.underline = font.underline # Handle color - both RGB and theme colors try: # Try RGB color first if font.color.rgb: self.color = str(font.color.rgb) except (AttributeError, TypeError): # Fall back to theme color try: if font.color.theme_color: self.theme_color = font.color.theme_color.name except (AttributeError, TypeError): pass # Add line spacing if set if hasattr(paragraph, "line_spacing") and paragraph.line_spacing is not None: if hasattr(paragraph.line_spacing, "pt"): self.line_spacing = round(paragraph.line_spacing.pt, 2) else: # Multiplier - convert to points font_size = self.font_size if self.font_size else 12.0 self.line_spacing = round(paragraph.line_spacing * font_size, 2) def to_dict(self) -> ParagraphDict: """Convert to dictionary for JSON serialization, excluding None values.""" result: ParagraphDict = {"text": self.text} # Add optional fields only if they have values if self.bullet: result["bullet"] = self.bullet if self.level is not None: result["level"] = self.level if self.alignment: result["alignment"] = self.alignment if self.space_before is not None: result["space_before"] = self.space_before if self.space_after is not None: result["space_after"] = self.space_after if self.font_name: result["font_name"] = self.font_name if self.font_size is not None: result["font_size"] = self.font_size if self.bold is not None: result["bold"] = self.bold if self.italic is not None: result["italic"] = self.italic if self.underline is not None: result["underline"] = self.underline if self.color: result["color"] = self.color if self.theme_color: result["theme_color"] = self.theme_color if self.line_spacing is not None: result["line_spacing"] = self.line_spacing return result class ShapeData: """Data structure for shape properties extracted from a PowerPoint shape.""" @staticmethod def emu_to_inches(emu: int) -> float: """Convert EMUs (English Metric Units) to inches.""" return emu / 914400.0 @staticmethod def inches_to_pixels(inches: float, dpi: int = 96) -> int: """Convert inches to pixels at given DPI.""" return int(inches * dpi) @staticmethod def get_font_path(font_name: str) -> Optional[str]: """Get the font file path for a given font name. Args: font_name: Name of the font (e.g., 'Arial', 'Calibri') Returns: Path to the font file, or None if not found """ system = platform.system() # Common font file variations to try font_variations = [ font_name, font_name.lower(), font_name.replace(" ", ""), font_name.replace(" ", "-"), ] # Define font directories and extensions by platform if system == "Darwin": # macOS font_dirs = [ "/System/Library/Fonts/", "/Library/Fonts/", "~/Library/Fonts/", ] extensions = [".ttf", ".otf", ".ttc", ".dfont"] else: # Linux font_dirs = [ "/usr/share/fonts/truetype/", "/usr/local/share/fonts/", "~/.fonts/", ] extensions = [".ttf", ".otf"] # Try to find the font file from pathlib import Path for font_dir in font_dirs: font_dir_path = Path(font_dir).expanduser() if not font_dir_path.exists(): continue # First try exact matches for variant in font_variations: for ext in extensions: font_path = font_dir_path / f"{variant}{ext}" if font_path.exists(): return str(font_path) # Then try fuzzy matching - find files containing the font name try: for file_path in font_dir_path.iterdir(): if file_path.is_file(): file_name_lower = file_path.name.lower() font_name_lower = font_name.lower().replace(" ", "") if font_name_lower in file_name_lower and any( file_name_lower.endswith(ext) for ext in extensions ): return str(file_path) except (OSError, PermissionError): continue return None @staticmethod def get_slide_dimensions(slide: Any) -> tuple[Optional[int], Optional[int]]: """Get slide dimensions from slide object. Args: slide: Slide object Returns: Tuple of (width_emu, height_emu) or (None, None) if not found """ try: prs = slide.part.package.presentation_part.presentation return prs.slide_width, prs.slide_height except (AttributeError, TypeError): return None, None @staticmethod def get_default_font_size(shape: BaseShape, slide_layout: Any) -> Optional[float]: """Extract default font size from slide layout for a placeholder shape. Args: shape: Placeholder shape slide_layout: Slide layout containing the placeholder definition Returns: Default font size in points, or None if not found """ try: if not hasattr(shape, "placeholder_format"): return None shape_type = shape.placeholder_format.type # type: ignore for layout_placeholder in slide_layout.placeholders: if layout_placeholder.placeholder_format.type == shape_type: # Find first defRPr element with sz (size) attribute for elem in layout_placeholder.element.iter(): if "defRPr" in elem.tag and (sz := elem.get("sz")): return float(sz) / 100.0 # Convert EMUs to points break except Exception: pass return None def __init__( self, shape: BaseShape, absolute_left: Optional[int] = None, absolute_top: Optional[int] = None, slide: Optional[Any] = None, ): """Initialize from a PowerPoint shape object. Args: shape: The PowerPoint shape object (should be pre-validated) absolute_left: Absolute left position in EMUs (for shapes in groups) absolute_top: Absolute top position in EMUs (for shapes in groups) slide: Optional slide object to get dimensions and layout information """ self.shape = shape # Store reference to original shape self.shape_id: str = "" # Will be set after sorting # Get slide dimensions from slide object self.slide_width_emu, self.slide_height_emu = ( self.get_slide_dimensions(slide) if slide else (None, None) ) # Get placeholder type if applicable self.placeholder_type: Optional[str] = None self.default_font_size: Optional[float] = None if hasattr(shape, "is_placeholder") and shape.is_placeholder: # type: ignore if shape.placeholder_format and shape.placeholder_format.type: # type: ignore self.placeholder_type = ( str(shape.placeholder_format.type).split(".")[-1].split(" ")[0] # type: ignore ) # Get default font size from layout if slide and hasattr(slide, "slide_layout"): self.default_font_size = self.get_default_font_size( shape, slide.slide_layout ) # Get position information # Use absolute positions if provided (for shapes in groups), otherwise use shape's position left_emu = ( absolute_left if absolute_left is not None else (shape.left if hasattr(shape, "left") else 0) ) top_emu = ( absolute_top if absolute_top is not None else (shape.top if hasattr(shape, "top") else 0) ) self.left: float = round(self.emu_to_inches(left_emu), 2) # type: ignore self.top: float = round(self.emu_to_inches(top_emu), 2) # type: ignore self.width: float = round( self.emu_to_inches(shape.width if hasattr(shape, "width") else 0), 2, # type: ignore ) self.height: float = round( self.emu_to_inches(shape.height if hasattr(shape, "height") else 0), 2, # type: ignore ) # Store EMU positions for overflow calculations self.left_emu = left_emu self.top_emu = top_emu self.width_emu = shape.width if hasattr(shape, "width") else 0 self.height_emu = shape.height if hasattr(shape, "height") else 0 # Calculate overflow status self.frame_overflow_bottom: Optional[float] = None self.slide_overflow_right: Optional[float] = None self.slide_overflow_bottom: Optional[float] = None self.overlapping_shapes: Dict[ str, float ] = {} # Dict of shape_id -> overlap area in sq inches self.warnings: List[str] = [] self._estimate_frame_overflow() self._calculate_slide_overflow() self._detect_bullet_issues() @property def paragraphs(self) -> List[ParagraphData]: """Calculate paragraphs from the shape's text frame.""" if not self.shape or not hasattr(self.shape, "text_frame"): return [] paragraphs = [] for paragraph in self.shape.text_frame.paragraphs: # type: ignore if paragraph.text.strip(): paragraphs.append(ParagraphData(paragraph)) return paragraphs def _get_default_font_size(self) -> int: """Get default font size from theme text styles or use conservative default.""" try: if not ( hasattr(self.shape, "part") and hasattr(self.shape.part, "slide_layout") ): return 14 slide_master = self.shape.part.slide_layout.slide_master # type: ignore if not hasattr(slide_master, "element"): return 14 # Determine theme style based on placeholder type style_name = "bodyStyle" # Default if self.placeholder_type and "TITLE" in self.placeholder_type: style_name = "titleStyle" # Find font size in theme styles for child in slide_master.element.iter(): tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag if tag == style_name: for elem in child.iter(): if "sz" in elem.attrib: return int(elem.attrib["sz"]) // 100 except Exception: pass return 14 # Conservative default for body text def _get_usable_dimensions(self, text_frame) -> Tuple[int, int]: """Get usable width and height in pixels after accounting for margins.""" # Default PowerPoint margins in inches margins = {"top": 0.05, "bottom": 0.05, "left": 0.1, "right": 0.1} # Override with actual margins if set if hasattr(text_frame, "margin_top") and text_frame.margin_top: margins["top"] = self.emu_to_inches(text_frame.margin_top) if hasattr(text_frame, "margin_bottom") and text_frame.margin_bottom: margins["bottom"] = self.emu_to_inches(text_frame.margin_bottom) if hasattr(text_frame, "margin_left") and text_frame.margin_left: margins["left"] = self.emu_to_inches(text_frame.margin_left) if hasattr(text_frame, "margin_right") and text_frame.margin_right: margins["right"] = self.emu_to_inches(text_frame.margin_right) # Calculate usable area usable_width = self.width - margins["left"] - margins["right"] usable_height = self.height - margins["top"] - margins["bottom"] # Convert to pixels return ( self.inches_to_pixels(usable_width), self.inches_to_pixels(usable_height), ) def _wrap_text_line(self, line: str, max_width_px: int, draw, font) -> List[str]: """Wrap a single line of text to fit within max_width_px.""" if not line: return [""] # Use textlength for efficient width calculation if draw.textlength(line, font=font) <= max_width_px: return [line] # Need to wrap - split into words wrapped = [] words = line.split(" ") current_line = "" for word in words: test_line = current_line + (" " if current_line else "") + word if draw.textlength(test_line, font=font) <= max_width_px: current_line = test_line else: if current_line: wrapped.append(current_line) current_line = word if current_line: wrapped.append(current_line) return wrapped def _estimate_frame_overflow(self) -> None: """Estimate if text overflows the shape bounds using PIL text measurement.""" if not self.shape or not hasattr(self.shape, "text_frame"): return text_frame = self.shape.text_frame # type: ignore if not text_frame or not text_frame.paragraphs: return # Get usable dimensions after accounting for margins usable_width_px, usable_height_px = self._get_usable_dimensions(text_frame) if usable_width_px <= 0 or usable_height_px <= 0: return # Set up PIL for text measurement dummy_img = Image.new("RGB", (1, 1)) draw = ImageDraw.Draw(dummy_img) # Get default font size from placeholder or use conservative estimate default_font_size = self._get_default_font_size() # Calculate total height of all paragraphs total_height_px = 0 for para_idx, paragraph in enumerate(text_frame.paragraphs): if not paragraph.text.strip(): continue para_data = ParagraphData(paragraph) # Load font for this paragraph font_name = para_data.font_name or "Arial" font_size = int(para_data.font_size or default_font_size) font = None font_path = self.get_font_path(font_name) if font_path: try: font = ImageFont.truetype(font_path, size=font_size) except Exception: font = ImageFont.load_default() else: font = ImageFont.load_default() # Wrap all lines in this paragraph all_wrapped_lines = [] for line in paragraph.text.split("\n"): wrapped = self._wrap_text_line(line, usable_width_px, draw, font) all_wrapped_lines.extend(wrapped) if all_wrapped_lines: # Calculate line height if para_data.line_spacing: # Custom line spacing explicitly set line_height_px = para_data.line_spacing * 96 / 72 else: # PowerPoint default single spacing (1.0x font size) line_height_px = font_size * 96 / 72 # Add space_before (except first paragraph) if para_idx > 0 and para_data.space_before: total_height_px += para_data.space_before * 96 / 72 # Add paragraph text height total_height_px += len(all_wrapped_lines) * line_height_px # Add space_after if para_data.space_after: total_height_px += para_data.space_after * 96 / 72 # Check for overflow (ignore negligible overflows <= 0.05") if total_height_px > usable_height_px: overflow_px = total_height_px - usable_height_px overflow_inches = round(overflow_px / 96.0, 2) if overflow_inches > 0.05: # Only report significant overflows self.frame_overflow_bottom = overflow_inches def _calculate_slide_overflow(self) -> None: """Calculate if shape overflows the slide boundaries.""" if self.slide_width_emu is None or self.slide_height_emu is None: return # Check right overflow (ignore negligible overflows <= 0.01") right_edge_emu = self.left_emu + self.width_emu if right_edge_emu > self.slide_width_emu: overflow_emu = right_edge_emu - self.slide_width_emu overflow_inches = round(self.emu_to_inches(overflow_emu), 2) if overflow_inches > 0.01: # Only report significant overflows self.slide_overflow_right = overflow_inches # Check bottom overflow (ignore negligible overflows <= 0.01") bottom_edge_emu = self.top_emu + self.height_emu if bottom_edge_emu > self.slide_height_emu: overflow_emu = bottom_edge_emu - self.slide_height_emu overflow_inches = round(self.emu_to_inches(overflow_emu), 2) if overflow_inches > 0.01: # Only report significant overflows self.slide_overflow_bottom = overflow_inches def _detect_bullet_issues(self) -> None: """Detect bullet point formatting issues in paragraphs.""" if not self.shape or not hasattr(self.shape, "text_frame"): return text_frame = self.shape.text_frame # type: ignore if not text_frame or not text_frame.paragraphs: return # Common bullet symbols that indicate manual bullets bullet_symbols = ["•", "●", "○"] for paragraph in text_frame.paragraphs: text = paragraph.text.strip() # Check for manual bullet symbols if text and any(text.startswith(symbol + " ") for symbol in bullet_symbols): self.warnings.append( "manual_bullet_symbol: use proper bullet formatting" ) break @property def has_any_issues(self) -> bool: """Check if shape has any issues (overflow, overlap, or warnings).""" return ( self.frame_overflow_bottom is not None or self.slide_overflow_right is not None or self.slide_overflow_bottom is not None or len(self.overlapping_shapes) > 0 or len(self.warnings) > 0 ) def to_dict(self) -> ShapeDict: """Convert to dictionary for JSON serialization.""" result: ShapeDict = { "left": self.left, "top": self.top, "width": self.width, "height": self.height, } # Add optional fields if present if self.placeholder_type: result["placeholder_type"] = self.placeholder_type if self.default_font_size: result["default_font_size"] = self.default_font_size # Add overflow information only if there is overflow overflow_data = {} # Add frame overflow if present if self.frame_overflow_bottom is not None: overflow_data["frame"] = {"overflow_bottom": self.frame_overflow_bottom} # Add slide overflow if present slide_overflow = {} if self.slide_overflow_right is not None: slide_overflow["overflow_right"] = self.slide_overflow_right if self.slide_overflow_bottom is not None: slide_overflow["overflow_bottom"] = self.slide_overflow_bottom if slide_overflow: overflow_data["slide"] = slide_overflow # Only add overflow field if there is overflow if overflow_data: result["overflow"] = overflow_data # Add overlap field if there are overlapping shapes if self.overlapping_shapes: result["overlap"] = {"overlapping_shapes": self.overlapping_shapes} # Add warnings field if there are warnings if self.warnings: result["warnings"] = self.warnings # Add paragraphs after placeholder_type result["paragraphs"] = [para.to_dict() for para in self.paragraphs] return result def is_valid_shape(shape: BaseShape) -> bool: """Check if a shape contains meaningful text content.""" # Must have a text frame with content if not hasattr(shape, "text_frame") or not shape.text_frame: # type: ignore return False text = shape.text_frame.text.strip() # type: ignore if not text: return False # Skip slide numbers and numeric footers if hasattr(shape, "is_placeholder") and shape.is_placeholder: # type: ignore if shape.placeholder_format and shape.placeholder_format.type: # type: ignore placeholder_type = ( str(shape.placeholder_format.type).split(".")[-1].split(" ")[0] # type: ignore ) if placeholder_type == "SLIDE_NUMBER": return False if placeholder_type == "FOOTER" and text.isdigit(): return False return True def collect_shapes_with_absolute_positions( shape: BaseShape, parent_left: int = 0, parent_top: int = 0 ) -> List[ShapeWithPosition]: """Recursively collect all shapes with valid text, calculating absolute positions. For shapes within groups, their positions are relative to the group. This function calculates the absolute position on the slide by accumulating parent group offsets. Args: shape: The shape to process parent_left: Accumulated left offset from parent groups (in EMUs) parent_top: Accumulated top offset from parent groups (in EMUs) Returns: List of ShapeWithPosition objects with absolute positions """ if hasattr(shape, "shapes"): # GroupShape result = [] # Get this group's position group_left = shape.left if hasattr(shape, "left") else 0 group_top = shape.top if hasattr(shape, "top") else 0 # Calculate absolute position for this group abs_group_left = parent_left + group_left abs_group_top = parent_top + group_top # Process children with accumulated offsets for child in shape.shapes: # type: ignore result.extend( collect_shapes_with_absolute_positions( child, abs_group_left, abs_group_top ) ) return result # Regular shape - check if it has valid text if is_valid_shape(shape): # Calculate absolute position shape_left = shape.left if hasattr(shape, "left") else 0 shape_top = shape.top if hasattr(shape, "top") else 0 return [ ShapeWithPosition( shape=shape, absolute_left=parent_left + shape_left, absolute_top=parent_top + shape_top, ) ] return [] def sort_shapes_by_position(shapes: List[ShapeData]) -> List[ShapeData]: """Sort shapes by visual position (top-to-bottom, left-to-right). Shapes within 0.5 inches vertically are considered on the same row. """ if not shapes: return shapes # Sort by top position first shapes = sorted(shapes, key=lambda s: (s.top, s.left)) # Group shapes by row (within 0.5 inches vertically) result = [] row = [shapes[0]] row_top = shapes[0].top for shape in shapes[1:]: if abs(shape.top - row_top) <= 0.5: row.append(shape) else: # Sort current row by left position and add to result result.extend(sorted(row, key=lambda s: s.left)) row = [shape] row_top = shape.top # Don't forget the last row result.extend(sorted(row, key=lambda s: s.left)) return result def calculate_overlap( rect1: Tuple[float, float, float, float], rect2: Tuple[float, float, float, float], tolerance: float = 0.05, ) -> Tuple[bool, float]: """Calculate if and how much two rectangles overlap. Args: rect1: (left, top, width, height) of first rectangle in inches rect2: (left, top, width, height) of second rectangle in inches tolerance: Minimum overlap in inches to consider as overlapping (default: 0.05") Returns: Tuple of (overlaps, overlap_area) where: - overlaps: True if rectangles overlap by more than tolerance - overlap_area: Area of overlap in square inches """ left1, top1, w1, h1 = rect1 left2, top2, w2, h2 = rect2 # Calculate overlap dimensions overlap_width = min(left1 + w1, left2 + w2) - max(left1, left2) overlap_height = min(top1 + h1, top2 + h2) - max(top1, top2) # Check if there's meaningful overlap (more than tolerance) if overlap_width > tolerance and overlap_height > tolerance: # Calculate overlap area in square inches overlap_area = overlap_width * overlap_height return True, round(overlap_area, 2) return False, 0 def detect_overlaps(shapes: List[ShapeData]) -> None: """Detect overlapping shapes and update their overlapping_shapes dictionaries. This function requires each ShapeData to have its shape_id already set. It modifies the shapes in-place, adding shape IDs with overlap areas in square inches. Args: shapes: List of ShapeData objects with shape_id attributes set """ n = len(shapes) # Compare each pair of shapes for i in range(n): for j in range(i + 1, n): shape1 = shapes[i] shape2 = shapes[j] # Ensure shape IDs are set assert shape1.shape_id, f"Shape at index {i} has no shape_id" assert shape2.shape_id, f"Shape at index {j} has no shape_id" rect1 = (shape1.left, shape1.top, shape1.width, shape1.height) rect2 = (shape2.left, shape2.top, shape2.width, shape2.height) overlaps, overlap_area = calculate_overlap(rect1, rect2) if overlaps: # Add shape IDs with overlap area in square inches shape1.overlapping_shapes[shape2.shape_id] = overlap_area shape2.overlapping_shapes[shape1.shape_id] = overlap_area def extract_text_inventory( pptx_path: Path, prs: Optional[Any] = None, issues_only: bool = False ) -> InventoryData: """Extract text content from all slides in a PowerPoint presentation. Args: pptx_path: Path to the PowerPoint file prs: Optional Presentation object to use. If not provided, will load from pptx_path. issues_only: If True, only include shapes that have overflow or overlap issues Returns a nested dictionary: {slide-N: {shape-N: ShapeData}} Shapes are sorted by visual position (top-to-bottom, left-to-right). The ShapeData objects contain the full shape information and can be converted to dictionaries for JSON serialization using to_dict(). """ if prs is None: prs = Presentation(str(pptx_path)) inventory: InventoryData = {} for slide_idx, slide in enumerate(prs.slides): # Collect all valid shapes from this slide with absolute positions shapes_with_positions = [] for shape in slide.shapes: # type: ignore shapes_with_positions.extend(collect_shapes_with_absolute_positions(shape)) if not shapes_with_positions: continue # Convert to ShapeData with absolute positions and slide reference shape_data_list = [ ShapeData( swp.shape, swp.absolute_left, swp.absolute_top, slide, ) for swp in shapes_with_positions ] # Sort by visual position and assign stable IDs in one step sorted_shapes = sort_shapes_by_position(shape_data_list) for idx, shape_data in enumerate(sorted_shapes): shape_data.shape_id = f"shape-{idx}" # Detect overlaps using the stable shape IDs if len(sorted_shapes) > 1: detect_overlaps(sorted_shapes) # Filter for issues only if requested (after overlap detection) if issues_only: sorted_shapes = [sd for sd in sorted_shapes if sd.has_any_issues] if not sorted_shapes: continue # Create slide inventory using the stable shape IDs inventory[f"slide-{slide_idx}"] = { shape_data.shape_id: shape_data for shape_data in sorted_shapes } return inventory def get_inventory_as_dict(pptx_path: Path, issues_only: bool = False) -> InventoryDict: """Extract text inventory and return as JSON-serializable dictionaries. This is a convenience wrapper around extract_text_inventory that returns dictionaries instead of ShapeData objects, useful for testing and direct JSON serialization. Args: pptx_path: Path to the PowerPoint file issues_only: If True, only include shapes that have overflow or overlap issues Returns: Nested dictionary with all data serialized for JSON """ inventory = extract_text_inventory(pptx_path, issues_only=issues_only) # Convert ShapeData objects to dictionaries dict_inventory: InventoryDict = {} for slide_key, shapes in inventory.items(): dict_inventory[slide_key] = { shape_key: shape_data.to_dict() for shape_key, shape_data in shapes.items() } return dict_inventory def save_inventory(inventory: InventoryData, output_path: Path) -> None: """Save inventory to JSON file with proper formatting. Converts ShapeData objects to dictionaries for JSON serialization. """ # Convert ShapeData objects to dictionaries json_inventory: InventoryDict = {} for slide_key, shapes in inventory.items(): json_inventory[slide_key] = { shape_key: shape_data.to_dict() for shape_key, shape_data in shapes.items() } with open(output_path, "w", encoding="utf-8") as f: json.dump(json_inventory, f, indent=2, ensure_ascii=False) if __name__ == "__main__": main()