commit 8b332b4007d52efdb630dacc9ebaff7a7422fa71 Author: Zhongwei Li Date: Sun Nov 30 09:06:10 2025 +0800 Initial commit diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..e4ad1d1 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,18 @@ +{ + "name": "presenter", + "description": "Scientific presentation agents and skills specializing in professional, insightful and engaging research presentations.", + "version": "0.1.0", + "author": { + "name": "Wieland Brendel", + "email": "wieland.brendel@tue.ellis.eu" + }, + "skills": [ + "./skills" + ], + "agents": [ + "./agents" + ], + "commands": [ + "./commands" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..63ce3c1 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# presenter + +Scientific presentation agents and skills specializing in professional, insightful and engaging research presentations. diff --git a/agents/presentation-outline-architect.md b/agents/presentation-outline-architect.md new file mode 100644 index 0000000..93897ff --- /dev/null +++ b/agents/presentation-outline-architect.md @@ -0,0 +1,196 @@ +--- +name: presentation-outline-architect +description: Use this agent when you need to create a structured outline for a research presentation. +model: sonnet +color: green +--- + +You are an elite research presentation architect with deep expertise in science communication, visual storytelling, and audience engagement. Your specialty is transforming complex research projects into compelling, well-structured presentations that resonate with specific audiences. ULTRATHINK. + +## Your Core Mission + +You will receive three key parameters: +1. **Goal**: The purpose and context of the presentation (e.g., conference talk, funding pitch, lab meeting, thesis defense) +2. **Audience**: The background, expertise level, and interests of the intended viewers +3. **Length**: The time allocation for the presentation (including or excluding Q&A) + +Your task is to create a comprehensive, slide-by-slide outline saved to `research-os/presentations/{DATE}_name_of_presentation/outline.md` where DATE is YYYYMMDD format. + +## Your Methodology + +### Phase 1: Deep Project Analysis + +Before creating any outline, you must conduct a thorough investigation of the project repository: + +1. **Core Documents Analysis**: + - `ideas.md`: Understand the genesis and evolution of ideas + - `mission.md`: Grasp the overarching vision and objectives + - `roadmap.md`: Identify planned milestones and current project stage + - `related_work.md`: Comprehend the research context and positioning + +2. **Results and Artifacts Examination**: + - Search for results directories, data files, figures, and visualizations + - Identify any manuscripts, papers, or technical reports + - Locate experimental outputs, model results, or validation data + - Find any existing visualizations or plots that could be incorporated + +3. **Project Stage Assessment**: + Determine where the project stands: + - **Early stage**: Vision and ideas established, but limited/no results yet + - **Mid stage**: Some preliminary results, ongoing experiments, partial validation + - **Mature stage**: Substantial results, figures available, possibly manuscript-ready + - **Complete stage**: Published or submission-ready manuscript with full figure set + +4. **Content Inventory**: + Create a mental map of: + - Available results and their significance + - Existing figures and their quality/relevance + - Key findings and their narrative potential + - Gaps that need to be filled or de-emphasized + +### Phase 2: Strategic Presentation Design + +Based on your analysis, architect a presentation that: + +1. **Matches Audience Sophistication**: + - For experts: Dive deep into technical details, assume domain knowledge + - For mixed audiences: Build foundational understanding before complexity + - For non-experts: Emphasize impact and intuition over technical mechanics + +2. **Respects Time Constraints**: + - Calculate slides based on ~1-2 minutes per slide average + - Reserve 20-30% of time for Q&A if not explicitly separated + - Build in buffer for complex slides that need more explanation + +3. **Serves the Goal**: + - **Conference talks**: Emphasize novelty, results, and community contribution + - **Funding pitches**: Highlight vision, impact, feasibility, and team capability + - **Lab meetings**: Focus on progress, challenges, and next steps + - **Thesis defense**: Demonstrate mastery, methodology, and contribution + +4. **Follows Narrative Arc**: + - **Hook**: Open with compelling motivation or problem statement + - **Context**: Establish necessary background and related work + - **Approach**: Explain methodology and innovation + - **Results**: Present findings with appropriate depth + - **Impact**: Conclude with significance and future directions + +### Phase 3: Outline Creation + +Your outline must include: + +1. **Slide-by-Slide Breakdown**: + For each slide, provide: + - **Slide number and title** + - **Content description**: Bullet points or paragraph describing what should appear + - **Visual guidance**: Specific recommendations for diagrams, plots, or figures + - **Speaker notes**: Key points to emphasize verbally + - **Timing estimate**: Approximate minutes to spend on this slide + +2. **Visual Strategy**: + - Identify existing figures that should be used + - Suggest new visualizations that need to be created + - Recommend diagram types (flowcharts, architecture diagrams, comparison tables) + - Specify when to use text-heavy vs. visual-heavy slides + +3. **Result Integration**: + - **If results exist**: Decide which results to present, in what order, and with what framing + - **If results are limited**: Focus on methodology, vision, and preliminary findings + - **If no results yet**: Emphasize problem importance, approach innovation, and expected impact + +4. **Adaptability Notes**: + - Mark optional slides that can be skipped if time is short + - Suggest backup slides for anticipated questions + - Indicate where interactive elements or demos might work + +## Output Format + +Your `outline.md` file must follow this structure: + +```markdown +# [Presentation Title] + +## Meta Information +- **Date**: [Presentation date] +- **Venue**: [Where it will be presented] +- **Duration**: [X minutes] +- **Audience**: [Description of audience] +- **Goal**: [Purpose of presentation] + +## Presentation Overview +[2-3 paragraph summary of the presentation strategy, key messages, and narrative approach] + +## Slide-by-Slide Outline + +### Slide 1: [Title] +**Timing**: ~X minutes + +**Content**: +- [Detailed description of slide content] +- [Key points to cover] + +**Visual Elements**: +- [Specific guidance on figures, diagrams, or layout] +- [Reference to existing figures if applicable: e.g., "Use figure from results/analysis/fig_performance.png"] + +**Speaker Notes**: +- [Key talking points] +- [Transitions or emphasis areas] + +--- + +[Repeat for each slide] + +## Visual Asset Requirements + +### Existing Assets to Use +- [List of existing figures/diagrams with file paths] + +### New Assets Needed +- [Description of visualizations that need to be created] +- [Suggested tools or approaches for creation] + +## Backup Slides +[Optional slides for Q&A or extended versions] + +## Presentation Notes +- **Estimated total slides**: [Number] +- **Pacing strategy**: [Notes on timing] +- **Key transitions**: [Important narrative bridges] +- **Anticipated questions**: [Likely audience questions and where to address them] +``` + +## Quality Assurance Checklist + +Before finalizing your outline, verify: + +- [ ] Every slide serves the presentation goal and audience +- [ ] The narrative flows logically from motivation to conclusion +- [ ] Time allocation is realistic and includes buffer +- [ ] Visual guidance is specific and actionable +- [ ] Available results are appropriately showcased (or absence is handled gracefully) +- [ ] Technical depth matches audience sophistication +- [ ] Opening is engaging and conclusion is memorable +- [ ] All referenced figures/files actually exist or are clearly marked as "to be created" +- [ ] The outline could be handed to another person to build the presentation + +## Special Considerations + +**For early-stage projects**: Emphasize the problem significance, approach novelty, and expected contributions. Use conceptual diagrams and related work comparisons. Frame "preliminary results" or "planned experiments" appropriately. + +**For mature projects**: Leverage the full manuscript and figures. Create a distilled narrative that highlights the most impactful results. Consider what to leave out as much as what to include. + +**For technical audiences**: Don't shy away from equations, algorithms, or implementation details when they add value. Include methodological rigor demonstrations. + +**For general audiences**: Use analogies, real-world examples, and visual metaphors. Minimize jargon and provide intuitive explanations of technical concepts. + +## Your Communication Style + +When interacting with users: +- Ask clarifying questions if goal, audience, or length are ambiguous +- Provide a brief summary of your analysis findings before sharing the outline path +- Highlight any concerns or recommendations (e.g., "Given the 10-minute constraint and abundance of results, I recommend focusing on X and Y while saving Z for backup slides") +- Offer to iterate on the outline if the user wants adjustments +- Suggest next steps after outline creation (e.g., "Now you might want to create the visual assets" or "Consider having a colleague review the outline for flow") + +You are proactive, detail-oriented, and strategically minded. Your outlines are not just slide lists—they are blueprints for compelling presentations that achieve their goals. diff --git a/agents/presentation-outline-reviewer.md b/agents/presentation-outline-reviewer.md new file mode 100644 index 0000000..20ad710 --- /dev/null +++ b/agents/presentation-outline-reviewer.md @@ -0,0 +1,69 @@ +--- +name: presentation-outline-reviewer +description: Use this agent when you need to review and improve a research presentation outline. Call this agent after drafting an initial outline in research-os/presentations/{DATE}_name_of_presentation/outline.md, or when you want to enhance the narrative flow, clarity, and visual impact of an existing presentation outline. +model: sonnet +color: orange +--- + +You are an expert research communication specialist with deep expertise in scientific storytelling, presentation design, and visual communication. You combine the narrative insight of a TED talk curator with the technical precision of an academic reviewer and the visual thinking of an information designer. ULTRATHINK. + +Your mission is to transform research presentation outlines into compelling, clear, and visually-rich narratives that maximize audience engagement and comprehension. + +## Your Responsibilities + +1. **Locate and Read the Outline**: Find and thoroughly read the presentation outline at research-os/presentations/{DATE}_name_of_presentation/outline.md. If the path is ambiguous, search for the most recent presentation outline. + +2. **Analyze Project Context**: Review relevant materials in research-os/project/ to deeply understand: + - The core research problem and motivation + - The technical approach and methodology + - Key results, contributions, and implications + - The target audience and their likely knowledge level + - Related work and how this research positions itself + +3. **Evaluate the Current Outline**: Assess the outline across multiple dimensions: + - **Clarity**: Is each section's purpose obvious? Are technical concepts explained accessibly? + - **Storyline**: Does the narrative flow logically from problem → approach → results → impact? + - **Climax**: Is there a clear peak moment where the key insight or result is revealed? + - **Visual Potential**: Are there opportunities to replace text-heavy slides with diagrams, visualizations, or demonstrations? + - **Audience Engagement**: Does the outline hook attention early and maintain momentum? + +4. **Improve the Outline**: Rewrite and enhance the outline with specific improvements: + - **Opening Hook**: Ensure the presentation starts with a compelling problem statement or motivating example + - **Narrative Arc**: Structure the presentation as a story with clear beginning (problem), middle (approach), and end (results/impact) + - **Strategic Climax**: Place the most impressive result or key insight at a natural climax point (typically 2/3 through) + - **Visual Annotations**: For each section, suggest specific visual elements (diagrams, charts, animations, demos, comparisons) + - **Pacing Notes**: Indicate where to slow down for complex topics or accelerate through background + - **Transition Quality**: Craft smooth transitions between sections that maintain narrative coherence + - **Conclusion Impact**: End with clear takeaways and future vision that resonates + +## Quality Standards + +- **Every slide should have a purpose**: If you can't articulate why a slide exists in the narrative, suggest removing or merging it +- **Show, don't tell**: Wherever possible, suggest visual representations over bullet points +- **Technical precision with accessibility**: Maintain rigor while ensuring explanations are graspable +- **Emotional resonance**: Research presentations should inspire; identify moments to connect emotionally with the work's importance +- **Respect time constraints**: Typical research presentations are 20-30 minutes; ensure the outline is feasible + +## Output Format + +Follow the same structure as outline.md and improve it according to your analysis. + +## Decision-Making Framework + +When uncertain about a change: +- Prioritize clarity over comprehensiveness +- Choose concrete examples over abstract explanations +- Favor visual communication over textual when the concept is spatial, temporal, or comparative +- Maintain the researcher's voice and technical accuracy +- If the current approach is already excellent, say so and provide only minor refinements + +## Self-Verification + +Before finalizing your review: +- Read through the improved outline as if you were the audience—does it compel you? +- Verify that the climax is positioned effectively and the narrative builds toward it +- Check that visual suggestions are specific and actionable +- Ensure every section connects to the overarching research vision +- Confirm that the outline respects typical presentation time constraints + +If you cannot locate the outline or project context, clearly state what you need and ask for clarification before proceeding. diff --git a/commands/make-slides.md b/commands/make-slides.md new file mode 100644 index 0000000..a234180 --- /dev/null +++ b/commands/make-slides.md @@ -0,0 +1,301 @@ +## Make Research Presentation Slides + +You are orchestrating a comprehensive workflow to create a professional research presentation. This command guides the user through information gathering, outline creation, refinement, review, and final slide generation. + +--- + +## PHASE 1: Information Gathering & Setup + +### Step 1.1: Check for Existing Presentations + +First, check if any presentations already exist in `research-os/presentations/`: + +```bash +ls -la research-os/presentations/ 2>/dev/null || echo "No presentations directory exists yet" +``` + +If presentations exist: +- List them clearly to the user +- Use the **AskUserQuestion** tool to ask: "I found existing presentations. Would you like to:" + - Option 1: "Edit an existing presentation" (user will then specify which one) + - Option 2: "Copy an existing presentation as starting point" (user will then specify which one) + - Option 3: "Start from scratch" + +If the user chooses Option 1 or 2, they should provide the folder name in their "Other" response. + +### Step 1.2: Gather Presentation Parameters + +Use the **AskUserQuestion** tool to collect essential information. Ask ALL of these questions unless the user has already provided this information in their initial argument ($ARGUMENTS). Do not ask these questions all at once, but go through each question one-by-one and use the **AskUserQuestion** tool to collect the answer. + +**Question 1 - Audience Background**: +- Header: "Audience" +- Question: "Who will be attending this presentation?" +- Options: + - "General public" → "Non-experts with no domain knowledge; focus on impact and intuition" + - "Domain researchers" → "Conference or workshop attendees from diverse backgrounds within the field" + - "Specialists" → "Experts already proficient in this specific research area" + - "Mixed audience" → "Combination of experts and non-experts" +- multiSelect: false + +**Question 2 - Talk Length**: +- Header: "Duration" +- Question: "How long is the presentation (in minutes)?" +- Options: + - "4-5 minutes" → "Lightning talk" + - "5-10 minutes" → "Pitch talk or brief update" + - "15-20 minutes" → "Standard conference talk" + - "30-45 minutes" → "Invited talk or detailed seminar" + - "60+ minutes" → "Lecture, tutorial, or thesis defense" +- multiSelect: false + +**Question 3 - Talk Format**: +- Header: "Format" +- Question: "What is the main format / direction of the talk?" +- Options: + - "Informal talk" → "More informal talks in research labs" + - "Conference talk" → "Standard conference talk" + - "Pitch" → "Short pitch event for scientific audiences" + - "Science Slam" → "Science slam talk for general audiences" + - "Public Lecture" → "Longer public presentations for general audiences" + - "Children's Lecture" → "A presentation for younger children" + - "60+ minutes" → "Lecture or tutorial" +- multiSelect: false + +**Question 4 - Presentation Focus**: +- Header: "Focus" +- Question: "What should the presentation emphasize?" +- Options: + - "Wider impact" → "Emphasize motivation, applications, and real-world significance" + - "Technical details" → "Focus on methodology, implementation, and technical contributions" + - "Overall results" → "Highlight findings, experiments, and quantitative outcomes" + - "Vision and motivation" → "Stress the big picture, problem importance, and future potential" +- multiSelect: true + +**Question 5 - Presentation Name** (optional): +- Header: "Name" +- Question: "What should we name this presentation folder? (Leave blank to auto-generate from project name)" +- Options: + - "Auto-generate" → "Use general format (pitch, keynote, conference oral, informal talk)" + - "Custom name" → "User will provide name in Other field" +- multiSelect: false + +### Step 1.3: Create Presentation Directory + +Based on the user's responses: + +1. Determine the presentation folder name: + - If user provided a custom name: use it (sanitize: lowercase, replace spaces with hyphens) + - If user chose "Edit existing": use the existing folder path they provided + - If user chose "Copy existing": create new folder with today's date + name variant + - If auto-generate: use the general format (pitch, keynote, conference oral, informal talk) for naming the folder + +2. Create the directory structure: +``` +research-os/presentations/{YYYYMMDD}_{presentation_name}/ +``` + +Where `{YYYYMMDD}` is today's date and `{presentation_name}` is the determined name. + +3. If the user chose "Copy existing", copy the contents of the existing presentation folder into the new folder. + +4. Confirm to the user: +``` +Created presentation folder: research-os/presentations/{YYYYMMDD}_{presentation_name}/ + +Proceeding with: +- Audience: {audience} +- Duration: {length} minutes +- Focus: {focus_areas} +``` + +--- + +## PHASE 2: Generate Initial Outline + +Use the **Task** tool to launch the `presentation-outline-architect` agent with the following prompt: + +``` +Create a comprehensive presentation outline for research-os/presentations/{YYYYMMDD}_{presentation_name}/outline.md. + +Context: +- Goal: {infer from focus areas, e.g., "conference talk emphasizing technical details"} +- Audience: {audience from Phase 1} +- Length: {length from Phase 1} minutes + +Please analyze the entire research-os/project/ directory (idea.md, mission.md, roadmap.md, related_work.md) and any available artifacts in research-os/artifacts/ to understand the project stage and available results. + +Create a slide-by-slide outline that: +1. Matches the audience sophistication level +2. Fits within the time constraint +3. Emphasizes: {focus_areas from Phase 1} +4. Automatically detects project maturity (early/mid/late stage) and adapts the narrative accordingly + +Save the output to research-os/presentations/{YYYYMMDD}_{presentation_name}/outline.md. +``` + +**Important**: Pass the gathered parameters (goal, audience, length) clearly to the agent. + +Wait for the agent to complete before proceeding. + +--- + +## PHASE 3: Refine the Outline + +Use the **Task** tool to launch the `presentation-outline-reviewer` agent with the following prompt: + +``` +Review and improve the presentation outline at research-os/presentations/{YYYYMMDD}_{presentation_name}/outline.md. + +Please: +1. Analyze the outline for clarity, storyline, and climax positioning +2. Review the project context in research-os/project/ to ensure accurate representation +3. Enhance visual elements and narrative flow +4. Maximize the presentation's impact for a {audience} audience with focus on {focus_areas} + +Rewrite research-os/presentations/{YYYYMMDD}_{presentation_name}/outline.md with your improvements. + +Focus on: +- Strong opening hook +- Clear narrative arc with strategic climax +- Specific visual recommendations (diagrams, charts, images) +- Smooth transitions between sections +- Memorable conclusion +``` + +Wait for the agent to complete before proceeding. + +--- + +## PHASE 4: User Review Checkpoint + +After the outline has been refined, present a summary to the user: + +```markdown +## Presentation Outline Complete + +I've created and refined a presentation outline for you: + +**Location**: `research-os/presentations/{YYYYMMDD}_{presentation_name}/outline.md` + +**Summary**: +- Total slides: {estimated_slide_count} +- Structure: {brief_narrative_description} +- Visual elements: {count_of_diagrams/charts/images} +- Key climax: {main_result_or_insight_highlighted} + +**Next Step**: + +Please review the outline at the path above. Once you're satisfied with the structure and content, I'll generate the actual presentation slides using the pptx skill. + +Would you like to: +1. Review the outline and let me know if any changes are needed +2. Proceed directly to slide generation +3. Make specific edits before generation +``` + +Use the **AskUserQuestion** tool: +- Header: "Next Step" +- Question: "How would you like to proceed?" +- Options: + - "Generate slides now" → "Proceed to Phase 5 immediately" + - "I need to review first" → "Wait for user to review and provide feedback" + - "Make specific changes" → "User will describe changes in Other field" +- multiSelect: false + +**If user requests changes**: +- Make the requested edits to `outline.md` +- Ask again if they're ready to generate slides + +**If user chooses "Generate slides now"**: +- Proceed to Phase 5 + +--- + +## PHASE 5: Generate Presentation Slides + +Once the user approves the outline, use the **Skill** tool to invoke the pptx skill and ULTRATHINK: + +``` +Use the pptx skill to generate a professional presentation from research-os/presentations/{YYYYMMDD}_{presentation_name}/outline.md. + +Instructions for the pptx skill: +1. Read the outline.md file carefully +2. Create slides following the structure exactly +3. For each slide: + - Pull referenced images from artifacts/ directories + - Generate charts and diagrams as specified + - Create graphics for visual elements (no placeholders!) +4. Apply professional formatting appropriate for research presentations +5. Save the final presentation as research-os/presentations/{YYYYMMDD}_{presentation_name}/presentation.pptx + +Context: This is a {length}-minute {audience} presentation focusing on {focus_areas}. +``` + +**Important**: +- NO PLACEHOLDERS for visuals - all diagrams and charts must be generated +- Pull all images directly from artifact paths specified in outline.md +- Ensure visual quality is publication-ready + +--- + +## PHASE 6: Completion Message + +Once slides are generated, display: + +```markdown +## Presentation Complete! + +Your research presentation has been created: + +**Files created**: +✓ research-os/presentations/{YYYYMMDD}_{presentation_name}/outline.md +✓ research-os/presentations/{YYYYMMDD}_{presentation_name}/presentation.pptx + +**Presentation details**: +- Duration: {length} minutes +- Audience: {audience} +- Focus: {focus_areas} +- Total slides: {count} + +**Next steps**: +1. Open the .pptx file and review the slides +2. Practice your delivery timing +3. Customize any visuals or animations as needed +4. Run /make-slides again if you want to create a variant (e.g., shorter version, different audience) + +The presentation is ready for your review! +``` + +--- + +## Important Notes + +### Directory Checking +Before any phase that references `research-os/project/`, verify these files exist: +- If `research-os/project/idea.md` or `research-os/project/mission.md` don't exist, inform the user: + ``` + No research project found. Please run /plan-research first to establish your project, + or create research-os/project/idea.md with your research concept. + ``` + +### Error Handling +- If agents fail, provide clear error messages and suggest troubleshooting steps +- If the pptx skill encounters missing assets, list them and ask user how to proceed +- If existing presentation folders are corrupted or incomplete, offer to start fresh + +### Flexibility +- Users can exit at any phase and resume later +- The outline.md file can be manually edited between Phase 3 and Phase 5 +- Multiple presentations can be created from the same project + +--- + +## Design Philosophy + +This command follows a **human-in-the-loop** approach: +1. Gather requirements explicitly (no defaults) +2. Let specialized agents do deep work autonomously +3. Provide a review checkpoint before expensive operations +4. Deliver production-ready output + +The workflow balances automation with user control, ensuring the final presentation aligns with the researcher's vision while leveraging AI for structure, refinement, and generation. diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..5a28a91 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,285 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:wielandbrendel/research-os:plugins/presenter", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "2b606e24741b78892a2f4af02a81615554df9c6b", + "treeHash": "8cb43925c7a6f234f04870d7e8c9891a1c82f8dc35f6c15480d1e2615e74f34b", + "generatedAt": "2025-11-28T10:29:01.744740Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "presenter", + "description": "Scientific presentation agents and skills specializing in professional, insightful and engaging research presentations.", + "version": "0.1.0" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "763adb36c825d64a7ae86d835b206d6617303d5c5ceb012f419b4294bc71f585" + }, + { + "path": "agents/presentation-outline-reviewer.md", + "sha256": "0bec7eeea67a62a7ab9c21abbd8014ee4a0650707a0060b4b30bc0cc9c3790a6" + }, + { + "path": "agents/presentation-outline-architect.md", + "sha256": "1e86863c962bc7e5358a4f4d31ce10dcb77b5e7fffa32ae7920b28f04c964ae2" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "a99455dd9e58745db0e407dd9b3453e0b4e4806da31c377d199b9fd7c0a019ba" + }, + { + "path": "commands/make-slides.md", + "sha256": "3b30f1348ee63b4272c62329d199b4cc91005bf6539a63f5651c46f5970738cd" + }, + { + "path": "skills/pptx/template.pptx", + "sha256": "4fb9f978cc030f4944f9dce2578399316404c8ec31128824e67d83be6865681a" + }, + { + "path": "skills/pptx/css.md", + "sha256": "fd6551c21cad23c3b580a7032422f437ee3c2cf115808df6ebd37ea58c0d1353" + }, + { + "path": "skills/pptx/ooxml.md", + "sha256": "09868e9f1786765421ecf3f0f49c77006738efda82a76df43ed87f7a9bfe2467" + }, + { + "path": "skills/pptx/html2pptx.tgz", + "sha256": "542e19f94c89a888a4708fa1728dff9b082a9fb2d65c5114a730a6f9060c6e6a" + }, + { + "path": "skills/pptx/SKILL.md", + "sha256": "69d0babd6f8d129f5d1aa8130963ff0fe66dffaeb684794abb99bcd381dfb99c" + }, + { + "path": "skills/pptx/html2pptx.md", + "sha256": "cadbe677d069867f96ce1dc6e24c84fe3169b735bef12f113999a0d07c032457" + }, + { + "path": "skills/pptx/LICENSE.txt", + "sha256": "79f6d8f5b427252fa3b1c11ecdbdb6bf610b944f7530b4de78f770f38741cfaa" + }, + { + "path": "skills/pptx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd", + "sha256": "842e7163409c8d74f4d7088a8bc99500d80bc75332681a0980055b08f374a604" + }, + { + "path": "skills/pptx/ooxml/schemas/microsoft/wml-2012.xsd", + "sha256": "0fa75578a000439a7988ba0c59fdc69f774bbd416cbacc14d07125b3f686cb74" + }, + { + "path": "skills/pptx/ooxml/schemas/microsoft/wml-2010.xsd", + "sha256": "568b26ee156cb9549aa439ca2158965f77b7c1602b7e0316f40ac6cf586e35f2" + }, + { + "path": "skills/pptx/ooxml/schemas/microsoft/wml-cid-2016.xsd", + "sha256": "127ca209fa73d7cb708449cb355c871867948a96e4a74f7bf5811ef62d17991d" + }, + { + "path": "skills/pptx/ooxml/schemas/microsoft/wml-symex-2015.xsd", + "sha256": "16f6f8072249f431370723c2cd8974672e0d9c897e00e97dd918079df934871b" + }, + { + "path": "skills/pptx/ooxml/schemas/microsoft/wml-cex-2018.xsd", + "sha256": "fddc2b880cabb9005aebbc7e783e53c19fec1c03df7d0e2f2076a33a0fdfd081" + }, + { + "path": "skills/pptx/ooxml/schemas/microsoft/wml-2018.xsd", + "sha256": "be0ff793a22dd31384650c3a4da14c2fa8062751c2e97b0e5ee852bda13c60ad" + }, + { + "path": "skills/pptx/ooxml/schemas/mce/mc.xsd", + "sha256": "3a37e461ecf5a8670fdec34029703401f8728ab9c96ec1739a6ae58d55212413" + }, + { + "path": "skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd", + "sha256": "69688ea15f82a870053b9a60c79359063dbcfeec9cdabafabad51a53934bc35b" + }, + { + "path": "skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd", + "sha256": "607ffb6879cf5d5d765bfdfb2548f3f427ea964ddee6d08a15f0dc781749866d" + }, + { + "path": "skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd", + "sha256": "49ec1c03440a9b5590e69acdf92b200fe133697de0b05f43a129fb94d1885026" + }, + { + "path": "skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd", + "sha256": "ee7e7e1b9a1340536e25e87c8650a2b90a43b0e67d02d6746cd430048b3c1439" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd", + "sha256": "133c9f64a5c5d573b78d0a474122b22506d8eadb5e063f67cdbbb8fa2f161d0e" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd", + "sha256": "585bedc1313b40888dcc544cb74cd939a105ee674f3b1d3aa1cc6d34f70ff155" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd", + "sha256": "0d103b99a4a8652f8871552a69d42d2a3760ac6a5e3ef02d979c4273257ff6a4" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd", + "sha256": "39c6396b7b1f096885c3ceba5dab30e3ee14291c45a1d6fa6038ec4db69d88d0" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd", + "sha256": "5cb76dabd8b97d1e9308a1700b90c20139be4d50792d21a7f09789f5cccd6026" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd", + "sha256": "4de4390a26e7e44e682ed98b39872cce2620f807f5b0de421567a43f74526388" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd", + "sha256": "3fd0586f2637b98bb9886f0e0b67d89e1cc987c2d158cc7deb5f5b9890ced412" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd", + "sha256": "495debc8fa967b77ed37799747b049832f2c95b2ecdb9f19ddcd68f8c9f96ab9" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd", + "sha256": "6bdeb169c3717eb01108853bd9fc5a3750fb1fa5b82abbdd854d49855a40f519" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd", + "sha256": "c2dd9f61f892deae6acd8d20771ea79b12018af25f3bf8d06639c8542d218cfd" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd", + "sha256": "5d389d42befbebd91945d620242347caecd3367f9a3a7cf8d97949507ae1f53c" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd", + "sha256": "809f77f658f71e10f5cfbf99e25c895a23a4e62c81210b59a2fe8c7fb24849d2" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd", + "sha256": "6978ba7e889070b0c3cb5b546b23e5a6c3516134afc53b87a21f482ca33f3858" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd", + "sha256": "9c085407751b9061c1f996f6c39ce58451be22a8d334f09175f0e89e42736285" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd", + "sha256": "b4532b6d258832953fbb3ee4c711f4fe25d3faf46a10644b2505f17010d01e88" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd", + "sha256": "d90722af8f1439ae6b82d57858ee0b0720e320eb309cf30371666338aa66d106" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd", + "sha256": "2dbfd4719505bf39b568005994a433997cbeac59cebc9d4867d564c94e9ba03a" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd", + "sha256": "475dcae1e7d1ea46232db6f8481040c15e53a52a3c256831d3df204212b0e831" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd", + "sha256": "0b364451dc36a48dd6dae0f3b6ada05fd9b71e5208211f8ee5537d7e51a587e2" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd", + "sha256": "bc92e36ccd233722d4c5869bec71ddc7b12e2df56059942cce5a39065cc9c368" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd", + "sha256": "335e9ecd40bb594ec4b916ac8c7b9eead4e16fe1b79f9d4105fdb1c43b66bb1c" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd", + "sha256": "70a3c67959f9ad2333016328716101bf8b476e682f29830cc9ee8da63dca7cc2" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd", + "sha256": "12264f3c03d738311cd9237d212f1c07479e70f0cbe1ae725d29b36539aef637" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd", + "sha256": "0ef4bb354ff44b923564c4ddbdda5987919d220225129ec94614a618ceafc281" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd", + "sha256": "7b5b7413e2c895b1e148e82e292a117d53c7ec65b0696c992edca57b61b4a74b" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd", + "sha256": "f812dffca0bb66db6e2ec2e01dd19258636b2a5a3a0bb949a2e7295665e0ce61" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd", + "sha256": "3c6709101c6aaa82888df5d8795c33f9e857196790eb320d9194e64be2b6bdd8" + }, + { + "path": "skills/pptx/ooxml/scripts/pack.py", + "sha256": "6fe762f45aff8c63fd95b9fcb1337b28921d6fa454e18a0e8158d4c8708d6d00" + }, + { + "path": "skills/pptx/ooxml/scripts/validate.py", + "sha256": "1ec252de8b14b07d16966c48906ccb1c45c68bcd23557ad31d8c50a27f5f8c0f" + }, + { + "path": "skills/pptx/ooxml/scripts/unpack.py", + "sha256": "0bd17f76a1a4c388aba42c6d1d39015fa84e405c3e0692397fe12762bd632b58" + }, + { + "path": "skills/pptx/ooxml/scripts/validation/docx.py", + "sha256": "e65d6cda0525866a24cc847b2e883bd2416ae6f87b3f5b9e2784dfbb0ec13093" + }, + { + "path": "skills/pptx/ooxml/scripts/validation/__init__.py", + "sha256": "83e0f035c5abea238d3f2c3968afbd511ed022b527b7c9cb60a9434cc34ff987" + }, + { + "path": "skills/pptx/ooxml/scripts/validation/redlining.py", + "sha256": "97abfdff4f08f43f9a4bb5c8a2f8fd483398b5b339592724e8635153b5507967" + }, + { + "path": "skills/pptx/ooxml/scripts/validation/pptx.py", + "sha256": "00bf2623da1177b3948143a4ade2f1cda7cb389dee31960861913fa42ef1b00f" + }, + { + "path": "skills/pptx/ooxml/scripts/validation/base.py", + "sha256": "f2c70d481613456e32b43869d1604b05c236c8da34b5b3967677a661cac7ba63" + }, + { + "path": "skills/pptx/scripts/thumbnail.py", + "sha256": "c21fd950b6ada7bd2f029885d3e56bc66b7ff061cc8404c492eb301664aa9e5d" + }, + { + "path": "skills/pptx/scripts/rearrange.py", + "sha256": "c04ac37916f398ba621b2d9e1e4c1a69225eaad6d7fb0ad116c237ddeb1b2b68" + }, + { + "path": "skills/pptx/scripts/inventory.py", + "sha256": "adead8fe6270e520c397cec9fbee4d606ab10bb80f749e018b42ec894c60d2e5" + }, + { + "path": "skills/pptx/scripts/replace.py", + "sha256": "8a590747551be847a904e3296fb2f35aa4e7feeb4970a61596c2375306462820" + } + ], + "dirSha256": "8cb43925c7a6f234f04870d7e8c9891a1c82f8dc35f6c15480d1e2615e74f34b" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/pptx/LICENSE.txt b/skills/pptx/LICENSE.txt new file mode 100644 index 0000000..c55ab42 --- /dev/null +++ b/skills/pptx/LICENSE.txt @@ -0,0 +1,30 @@ +© 2025 Anthropic, PBC. All rights reserved. + +LICENSE: Use of these materials (including all code, prompts, assets, files, +and other components of this Skill) is governed by your agreement with +Anthropic regarding use of Anthropic's services. If no separate agreement +exists, use is governed by Anthropic's Consumer Terms of Service or +Commercial Terms of Service, as applicable: +https://www.anthropic.com/legal/consumer-terms +https://www.anthropic.com/legal/commercial-terms +Your applicable agreement is referred to as the "Agreement." "Services" are +as defined in the Agreement. + +ADDITIONAL RESTRICTIONS: Notwithstanding anything in the Agreement to the +contrary, users may not: + +- Extract these materials from the Services or retain copies of these + materials outside the Services +- Reproduce or copy these materials, except for temporary copies created + automatically during authorized use of the Services +- Create derivative works based on these materials +- Distribute, sublicense, or transfer these materials to any third party +- Make, offer to sell, sell, or import any inventions embodied in these + materials +- Reverse engineer, decompile, or disassemble these materials + +The receipt, viewing, or possession of these materials does not convey or +imply any license or right beyond those expressly granted above. + +Anthropic retains all right, title, and interest in these materials, +including all copyrights, patents, and other intellectual property rights. diff --git a/skills/pptx/SKILL.md b/skills/pptx/SKILL.md new file mode 100644 index 0000000..2b62544 --- /dev/null +++ b/skills/pptx/SKILL.md @@ -0,0 +1,394 @@ +--- +name: pptx +description: "Presentation creation, editing, and analysis. When Claude needs to work with presentations (.pptx files) for: (1) Creating new presentations, (2) Modifying or editing content, (3) Working with layouts, (4) Adding comments or speaker notes, or any other presentation tasks" +license: Proprietary. LICENSE.txt has complete terms +--- + +# PPTX creation, editing, and analysis + +## Overview + +Create, edit, or analyze the contents of .pptx files when requested. A .pptx file is essentially a ZIP archive containing XML files and other resources. Different tools and workflows are available for different tasks. + +## Reading and analyzing content + +### Text extraction + +To read just the text content of a presentation, convert the document to markdown: + +```bash +# Convert document to markdown +python -m markitdown path-to-file.pptx +``` + +### Raw XML access + +Use raw XML access for: comments, speaker notes, slide layouts, animations, design elements, and complex formatting. To access these features, unpack a presentation and read its raw XML contents. + +#### Unpacking a file + +`python ooxml/scripts/unpack.py ` + +**Note**: The unpack.py script is located at `skills/pptx/ooxml/scripts/unpack.py` relative to the project root. If the script doesn't exist at this path, use `find . -name "unpack.py"` to locate it. + +#### Key file structures + +- `ppt/presentation.xml` - Main presentation metadata and slide references +- `ppt/slides/slide{N}.xml` - Individual slide contents (slide1.xml, slide2.xml, etc.) +- `ppt/notesSlides/notesSlide{N}.xml` - Speaker notes for each slide +- `ppt/comments/modernComment_*.xml` - Comments for specific slides +- `ppt/slideLayouts/` - Layout templates for slides +- `ppt/slideMasters/` - Master slide templates +- `ppt/theme/` - Theme and styling information +- `ppt/media/` - Images and other media files + +#### Typography and color extraction + +**To emulate example designs**, analyze the presentation's typography and colors first using the methods below: + +1. **Read theme file**: Check `ppt/theme/theme1.xml` for colors (``) and fonts (``) +2. **Sample slide content**: Examine `ppt/slides/slide1.xml` for actual font usage (``) and colors +3. **Search for patterns**: Use grep to find color (``, ``) and font references across all XML files + +## Editing an existing PowerPoint presentation + +To edit slides in an existing PowerPoint presentation, work with the raw Office Open XML (OOXML) format. This involves unpacking the .pptx file, editing the XML content, and repacking it. + +### Workflow + +1. **MANDATORY - READ ENTIRE FILE**: Read [`ooxml.md`](ooxml.md) (~500 lines) completely from start to finish. **NEVER set any range limits when reading this file.** Read the full file content for detailed guidance on OOXML structure and editing workflows before any presentation editing. +2. Unpack the presentation: `python ooxml/scripts/unpack.py ` +3. Edit the XML files (primarily `ppt/slides/slide{N}.xml` and related files) +4. **CRITICAL**: Validate immediately after each edit and fix any validation errors before proceeding: `python ooxml/scripts/validate.py --original ` +5. Pack the final presentation: `python ooxml/scripts/pack.py ` + +## Creating a new PowerPoint presentation + +To create a presentation that follows an existing template's design, duplicate and re-arrange template slides before replacing placeholder content. + +Unless a template is given to you by the user, use `template.pptx` as the template file for the presentation. + +### Workflow + +1. **Extract template text AND create visual thumbnail grid**: + + - Extract text: `python -m markitdown template.pptx > template-content.md` + - Read `template-content.md`: Read the entire file to understand the contents of the template presentation. **NEVER set any range limits when reading this file.** + - Create thumbnail grids: `python scripts/thumbnail.py template.pptx` + - See [Creating Thumbnail Grids](#creating-thumbnail-grids) section for more details + +2. **Analyze template and save inventory to a file**: + + - **Visual Analysis**: Review thumbnail grid(s) to understand slide layouts, design patterns, and visual structure + - Create and save a template inventory file at `template-inventory.md` containing: + + ```markdown + # Template Inventory Analysis + + **Total Slides: [count]** + **IMPORTANT: Slides are 0-indexed (first slide = 0, last slide = count-1)** + + ## [Category Name] + + - Slide 0: [Layout code if available] - Description/purpose + - Slide 1: [Layout code] - Description/purpose + - Slide 2: [Layout code] - Description/purpose + [... EVERY slide must be listed individually with its index ...] + ``` + + - **Using the thumbnail grid**: Reference the visual thumbnails to identify: + - Layout patterns (title slides, content layouts, section dividers) + - Image placeholder locations and counts + - Design consistency across slide groups + - Visual hierarchy and structure + - This inventory file is REQUIRED for selecting appropriate templates in the next step + +3. **Create presentation outline based on template inventory**: + + - Review available templates from step 2. + - Choose an intro or title template for the first slide. This should be one of the first templates. + - Choose safe, text-based layouts for the other slides. + - **CRITICAL: Match layout structure to actual content**: + - Single-column layouts: Use for unified narrative or single topic + - Two-column layouts: Use ONLY when there are exactly 2 distinct items/concepts + - Three-column layouts: Use ONLY when there are exactly 3 distinct items/concepts + - Image + text layouts: Use ONLY when there are actual images to insert + - Quote layouts: Use ONLY for actual quotes from people (with attribution), never for emphasis + - Never use layouts with more placeholders than available content + - With 2 items, avoid forcing them into a 3-column layout + - With 4+ items, consider breaking into multiple slides or using a list format + - Count actual content pieces BEFORE selecting the layout + - Verify each placeholder in the chosen layout will be filled with meaningful content + - Select one option representing the **best** layout for each content section. + - Save `outline.md` with content AND template mapping that leverages available designs + - Example template mapping: + ``` + # Template slides to use (0-based indexing) + # WARNING: Verify indices are within range! Template with 73 slides has indices 0-72 + # Mapping: slide numbers from outline -> template slide indices + template_mapping = [ + 0, # Use slide 0 (Title/Cover) + 34, # Use slide 34 (B1: Title and body) + 34, # Use slide 34 again (duplicate for second B1) + 50, # Use slide 50 (E1: Quote) + 54, # Use slide 54 (F2: Closing + Text) + ] + ``` + +4. **Duplicate, reorder, and delete slides using `rearrange.py`**: + + - Use the `scripts/rearrange.py` script to create a new presentation with slides in the desired order: + ```bash + python scripts/rearrange.py template.pptx working.pptx 0,34,34,50,52 + ``` + - The script handles duplicating repeated slides, deleting unused slides, and reordering automatically + - Slide indices are 0-based (first slide is 0, second is 1, etc.) + - The same slide index can appear multiple times to duplicate that slide + +5. **Extract ALL text using the `inventory.py` script**: + + - **Run inventory extraction**: + ```bash + python scripts/inventory.py working.pptx text-inventory.json + ``` + - **Read text-inventory.json**: Read the entire text-inventory.json file to understand all shapes and their properties. **NEVER set any range limits when reading this file.** + + - The inventory JSON structure: + + ```json + { + "slide-0": { + "shape-0": { + "placeholder_type": "TITLE", // or null for non-placeholders + "left": 1.5, // position in inches + "top": 2.0, + "width": 7.5, + "height": 1.2, + "paragraphs": [ + { + "text": "Paragraph text", + // Optional properties (only included when non-default): + "bullet": true, // explicit bullet detected + "level": 0, // only included when bullet is true + "alignment": "CENTER", // CENTER, RIGHT (not LEFT) + "space_before": 10.0, // space before paragraph in points + "space_after": 6.0, // space after paragraph in points + "line_spacing": 22.4, // line spacing in points + "font_name": "Arial", // from first run + "font_size": 14.0, // in points + "bold": true, + "italic": false, + "underline": false, + "color": "FF0000" // RGB color + } + ] + } + } + } + ``` + + - Key features: + - **Slides**: Named as "slide-0", "slide-1", etc. + - **Shapes**: Ordered by visual position (top-to-bottom, left-to-right) as "shape-0", "shape-1", etc. + - **Placeholder types**: TITLE, CENTER_TITLE, SUBTITLE, BODY, OBJECT, or null + - **Default font size**: `default_font_size` in points extracted from layout placeholders (when available) + - **Slide numbers are filtered**: Shapes with SLIDE_NUMBER placeholder type are automatically excluded from inventory + - **Bullets**: When `bullet: true`, `level` is always included (even if 0) + - **Spacing**: `space_before`, `space_after`, and `line_spacing` in points (only included when set) + - **Colors**: `color` for RGB (e.g., "FF0000"), `theme_color` for theme colors (e.g., "DARK_1") + - **Properties**: Only non-default values are included in the output + +6. **Generate replacement text and save the data to a JSON file** + Based on the text inventory from the previous step: + + - **CRITICAL**: First verify which shapes exist in the inventory - only reference shapes that are actually present + - **VALIDATION**: The replace.py script validates that all shapes in the replacement JSON exist in the inventory + - Referencing a non-existent shape produces an error showing available shapes + - Referencing a non-existent slide produces an error indicating the slide doesn't exist + - All validation errors are shown at once before the script exits + - **IMPORTANT**: The replace.py script uses inventory.py internally to identify ALL text shapes + - **AUTOMATIC CLEARING**: ALL text shapes from the inventory are cleared unless "paragraphs" are provided for them + - Add a "paragraphs" field to shapes that need content (not "replacement_paragraphs") + - Shapes without "paragraphs" in the replacement JSON have their text cleared automatically + - Paragraphs with bullets are automatically left aligned. Avoid setting the `alignment` property when `"bullet": true` + - Generate appropriate replacement content for placeholder text + - Use shape size to determine appropriate content length + - **CRITICAL**: Include paragraph properties from the original inventory - don't just provide text + - **IMPORTANT**: When bullet: true, do NOT include bullet symbols (•, -, \*) in text - they're added automatically + - **ESSENTIAL FORMATTING RULES**: + - Headers/titles should typically have `"bold": true` + - List items should have `"bullet": true, "level": 0` (level is required when bullet is true) + - Preserve any alignment properties (e.g., `"alignment": "CENTER"` for centered text) + - Include font properties when different from default (e.g., `"font_size": 14.0`, `"font_name": "Lora"`) + - Colors: Use `"color": "FF0000"` for RGB or `"theme_color": "DARK_1"` for theme colors + - The replacement script expects **properly formatted paragraphs**, not just text strings + - **Overlapping shapes**: Prefer shapes with larger default_font_size or more appropriate placeholder_type + - Save the updated inventory with replacements to `replacement-text.json` + - **WARNING**: Different template layouts have different shape counts - always check the actual inventory before creating replacements + + Example paragraphs field showing proper formatting: + + ```json + "paragraphs": [ + { + "text": "New presentation title text", + "alignment": "CENTER", + "bold": true + }, + { + "text": "Section Header", + "bold": true + }, + { + "text": "First bullet point without bullet symbol", + "bullet": true, + "level": 0 + }, + { + "text": "Red colored text", + "color": "FF0000" + }, + { + "text": "Theme colored text", + "theme_color": "DARK_1" + }, + { + "text": "Regular paragraph text without special formatting" + } + ] + ``` + + **Shapes not listed in the replacement JSON are automatically cleared**: + + ```json + { + "slide-0": { + "shape-0": { + "paragraphs": [...] // This shape gets new text + } + // shape-1 and shape-2 from inventory will be cleared automatically + } + } + ``` + + **Common formatting patterns for presentations**: + + - Title slides: Bold text, sometimes centered + - Section headers within slides: Bold text + - Bullet lists: Each item needs `"bullet": true, "level": 0` + - Body text: Usually no special properties needed + - Quotes: May have special alignment or font properties + +7. **Apply replacements using the `replace.py` script** + + ```bash + python scripts/replace.py working.pptx replacement-text.json output.pptx + ``` + + The script will: + + - First extract the inventory of ALL text shapes using functions from inventory.py + - Validate that all shapes in the replacement JSON exist in the inventory + - Clear text from ALL shapes identified in the inventory + - Apply new text only to shapes with "paragraphs" defined in the replacement JSON + - Preserve formatting by applying paragraph properties from the JSON + - Handle bullets, alignment, font properties, and colors automatically + - Save the updated presentation + + Example validation errors: + + ``` + ERROR: Invalid shapes in replacement JSON: + - Shape 'shape-99' not found on 'slide-0'. Available shapes: shape-0, shape-1, shape-4 + - Slide 'slide-999' not found in inventory + ``` + + ``` + ERROR: Replacement text made overflow worse in these shapes: + - slide-0/shape-2: overflow worsened by 1.25" (was 0.00", now 1.25") + ``` + +## Creating Thumbnail Grids + +To create visual thumbnail grids of PowerPoint slides for quick analysis and reference: + +```bash +python scripts/thumbnail.py template.pptx [output_prefix] +``` + +**Features**: + +- Creates: `thumbnails.jpg` (or `thumbnails-1.jpg`, `thumbnails-2.jpg`, etc. for large decks) +- Default: 5 columns, max 30 slides per grid (5×6) +- Custom prefix: `python scripts/thumbnail.py template.pptx my-grid` + - Note: The output prefix should include the path if you want output in a specific directory (e.g., `workspace/my-grid`) +- Adjust columns: `--cols 4` (range: 3-6, affects slides per grid) +- Grid limits: 3 cols = 12 slides/grid, 4 cols = 20, 5 cols = 30, 6 cols = 42 +- Slides are zero-indexed (Slide 0, Slide 1, etc.) + +**Use cases**: + +- Template analysis: Quickly understand slide layouts and design patterns +- Content review: Visual overview of entire presentation +- Navigation reference: Find specific slides by their visual appearance +- Quality check: Verify all slides are properly formatted + +**Examples**: + +```bash +# Basic usage +python scripts/thumbnail.py presentation.pptx + +# Combine options: custom name, columns +python scripts/thumbnail.py template.pptx analysis --cols 4 +``` + +## Converting Slides to Images + +To visually analyze PowerPoint slides, convert them to images using a two-step process: + +1. **Convert PPTX to PDF**: + + ```bash + soffice --headless --convert-to pdf template.pptx + ``` + +2. **Convert PDF pages to JPEG images**: + ```bash + pdftoppm -jpeg -r 150 template.pdf slide + ``` + This creates files like `slide-1.jpg`, `slide-2.jpg`, etc. + +Options: + +- `-r 150`: Sets resolution to 150 DPI (adjust for quality/size balance) +- `-jpeg`: Output JPEG format (use `-png` for PNG if preferred) +- `-f N`: First page to convert (e.g., `-f 2` starts from page 2) +- `-l N`: Last page to convert (e.g., `-l 5` stops at page 5) +- `slide`: Prefix for output files + +Example for specific range: + +```bash +pdftoppm -jpeg -r 150 -f 2 -l 5 template.pdf slide # Converts only pages 2-5 +``` + +## Code Style Guidelines + +**IMPORTANT**: When generating code for PPTX operations: + +- Write concise code +- Avoid verbose variable names and redundant operations +- Avoid unnecessary print statements + +## Dependencies + +Required dependencies (should already be installed): + +- **markitdown**: `pip install "markitdown[pptx]"` (for text extraction from presentations) +- **pptxgenjs**: `npm install -g pptxgenjs` (for creating presentations via html2pptx) +- **playwright**: `npm install -g playwright` (for HTML rendering in html2pptx) +- **react-icons**: `npm install -g react-icons react react-dom` (for icons in SVG format) +- **LibreOffice**: `sudo apt-get install libreoffice` (for PDF conversion) +- **Poppler**: `sudo apt-get install poppler-utils` (for pdftoppm to convert PDF to images) +- **defusedxml**: `pip install defusedxml` (for secure XML parsing) diff --git a/skills/pptx/css.md b/skills/pptx/css.md new file mode 100644 index 0000000..95be365 --- /dev/null +++ b/skills/pptx/css.md @@ -0,0 +1,335 @@ +# Global CSS Framework Reference + +This document provides a comprehensive reference for the global.css framework used in HTML slide creation for PowerPoint conversion. + +--- + +## ⚠️ No Import Necessary + +The global.css framework is automatically added to every slide. Do NOT try to include it in a slide with ` + + +
+

Slide title

+

Subtitle or context

+
+
+ + +``` + +## Using the @ant/html2pptx Library + +### Installation & Setup + +**Important**: Install the @ant/html2pptx package globally before using this library. See the **Prerequisites Check** section at the top of this document. + +**When running scripts, always set NODE_PATH:** + +```sh +NODE_PATH="$(npm root -g)" node your-script.js 2>&1 +``` + +**If you get "Cannot find module" errors**, see the Prerequisites Check section or verify that NODE_PATH is correctly pointing to the global node_modules directory. + +### Dependencies + +These libraries have been globally installed and are available to use: + +- `pptxgenjs` +- `playwright` + +### ⚠️ IMPORTANT: How To Use html2pptx + +Common errors: + +- DO NOT call `pptx.addSlide()` directly, `html2pptx` creates a slide for you +- `html2pptx` accepts an `htmlFilePath` and a `pptx` presentation object + - If you pass the wrong arguments, your script will throw errors or time out + +**Your script MUST follow the following example.** + +```javascript +const pptxgen = require("pptxgenjs"); +const { html2pptx } = require("@ant/html2pptx"); + +// Create a new pptx presentation +const pptx = new pptxgen(); +pptx.layout = "LAYOUT_16x9"; // Must match HTML body dimensions + +// Add an HTML-only slide +await html2pptx("slide1.html", pptx); + +// Add a slide with a chart placeholder +const { slide, placeholders } = await html2pptx("slide2.html", pptx); +slide.addChart(pptx.charts.LINE, chartData, placeholders[0]); + +// Save the presentation +await pptx.writeFile("output.pptx"); +``` + +### API Reference + +#### Function Signature + +```javascript +await html2pptx(htmlFilePath, pptxPresentation, options); +``` + +#### Parameters + +- `htmlFilePath` (string): Path to HTML file (absolute or relative) +- `pptxPresentation` (pptxgen): PptxGenJS presentation instance with layout already set +- `options` (object, optional): + - `tmpDir` (string): Temporary directory for generated files (default: `process.env.TMPDIR || '/tmp'`) + +#### Returns + +```javascript +{ + slide: pptxgenSlide, // The created/updated slide + placeholders: [ // Array of placeholder positions + { id: string, x: number, y: number, w: number, h: number }, + ... + ] +} +``` + +### Validation + +The library automatically validates and collects all errors before throwing: + +1. **HTML dimensions must match presentation layout** - Reports dimension mismatches +2. **Content must not overflow body** - Reports overflow with exact measurements +3. **Text element styling** - Reports backgrounds/borders/shadows on text elements (only allowed on block elements) + +**All validation errors are collected and reported together** in a single error message, allowing you to fix all issues at once instead of one at a time. + +### Working with Placeholders + +```javascript +const { slide, placeholders } = await html2pptx("slide.html", pptx); + +// Use first placeholder +slide.addChart(pptx.charts.BAR, data, placeholders[0]); + +// Find by ID +const chartArea = placeholders.find((p) => p.id === "chart-area"); +slide.addChart(pptx.charts.LINE, data, chartArea); +``` + +### Complete Example + +```javascript +const pptxgen = require("pptxgenjs"); +const { html2pptx } = require("@ant/html2pptx"); + +async function createPresentation() { + const pptx = new pptxgen(); + pptx.layout = "LAYOUT_16x9"; + pptx.author = "Your Name"; + pptx.title = "My Presentation"; + + // Slide 1: Title + const { slide: slide1 } = await html2pptx("slides/title.html", pptx); + + // Slide 2: Content with chart + const { slide: slide2, placeholders } = await html2pptx( + "slides/data.html", + pptx + ); + + const chartData = [ + { + name: "Sales", + labels: ["Q1", "Q2", "Q3", "Q4"], + values: [4500, 5500, 6200, 7100], + }, + ]; + + slide2.addChart(pptx.charts.BAR, chartData, { + ...placeholders[0], + showTitle: true, + title: "Quarterly Sales", + showCatAxisTitle: true, + catAxisTitle: "Quarter", + showValAxisTitle: true, + valAxisTitle: "Sales ($000s)", + }); + + // Save + await pptx.writeFile({ fileName: "presentation.pptx" }); + console.log("Presentation created successfully!"); +} + +createPresentation().catch(console.error); +``` + +**Run with:** + +```sh +NODE_PATH="$(npm root -g)" node create-presentation.js +``` + +## Using PptxGenJS + +After converting HTML to slides with `html2pptx`, you'll use PptxGenJS to add dynamic content like charts, images, and additional elements. + +### ⚠️ Critical Rules + +#### Colors + +- **NEVER use `#` prefix** with hex colors in PptxGenJS - causes file corruption +- ✅ Correct: `color: "FF0000"`, `fill: { color: "0066CC" }` +- ❌ Wrong: `color: "#FF0000"` (breaks document) + +### Adding Images + +Always calculate aspect ratios from actual image dimensions: + +```javascript +// Get image dimensions: identify image.png | grep -o '[0-9]* x [0-9]*' +const imgWidth = 1860, + imgHeight = 1519; // From actual file +const aspectRatio = imgWidth / imgHeight; + +const h = 3; // Max height +const w = h * aspectRatio; +const x = (10 - w) / 2; // Center on 16:9 slide + +slide.addImage({ path: "chart.png", x, y: 1.5, w, h }); +``` + +### Adding Text + +```javascript +// Rich text with formatting +slide.addText( + [ + { text: "Bold ", options: { bold: true } }, + { text: "Italic ", options: { italic: true } }, + { text: "Normal" }, + ], + { + x: 1, + y: 2, + w: 8, + h: 1, + } +); +``` + +### Adding Shapes + +```javascript +// Rectangle +slide.addShape(pptx.shapes.RECTANGLE, { + x: 1, + y: 1, + w: 3, + h: 2, + fill: { color: "4472C4" }, + line: { color: "000000", width: 2 }, +}); + +// Circle +slide.addShape(pptx.shapes.OVAL, { + x: 5, + y: 1, + w: 2, + h: 2, + fill: { color: "ED7D31" }, +}); + +// Rounded rectangle +slide.addShape(pptx.shapes.ROUNDED_RECTANGLE, { + x: 1, + y: 4, + w: 3, + h: 1.5, + fill: { color: "70AD47" }, + rectRadius: 0.2, +}); +``` + +### Adding Charts + +**Required for most charts:** Axis labels using `catAxisTitle` (category) and `valAxisTitle` (value). + +**Chart Data Format:** + +- Use **single series with all labels** for simple bar/line charts +- Each series creates a separate legend entry +- Labels array defines X-axis values + +**Time Series Data - Choose Correct Granularity:** + +- **< 30 days**: Use daily grouping (e.g., "10-01", "10-02") - avoid monthly aggregation that creates single-point charts +- **30-365 days**: Use monthly grouping (e.g., "2024-01", "2024-02") +- **> 365 days**: Use yearly grouping (e.g., "2023", "2024") +- **Validate**: Charts with only 1 data point likely indicate incorrect aggregation for the time period + +```javascript +const { slide, placeholders } = await html2pptx("slide.html", pptx); + +// CORRECT: Single series with all labels +slide.addChart( + pptx.charts.BAR, + [ + { + name: "Sales 2024", + labels: ["Q1", "Q2", "Q3", "Q4"], + values: [4500, 5500, 6200, 7100], + }, + ], + { + ...placeholders[0], // Use placeholder position + barDir: "col", // 'col' = vertical bars, 'bar' = horizontal + showTitle: true, + title: "Quarterly Sales", + showLegend: false, // No legend needed for single series + // Required axis labels + showCatAxisTitle: true, + catAxisTitle: "Quarter", + showValAxisTitle: true, + valAxisTitle: "Sales ($000s)", + // Optional: Control scaling (adjust min based on data range for better visualization) + valAxisMaxVal: 8000, + valAxisMinVal: 0, // Use 0 for counts/amounts; for clustered data (e.g., 4500-7100), consider starting closer to min value + valAxisMajorUnit: 2000, // Control y-axis label spacing to prevent crowding + catAxisLabelRotate: 45, // Rotate labels if crowded + dataLabelPosition: "outEnd", + dataLabelColor: "000000", + // Use single color for single-series charts + chartColors: ["4472C4"], // All bars same color + } +); +``` + +#### Scatter Chart + +**IMPORTANT**: Scatter chart data format is unusual - first series contains X-axis values, subsequent series contain Y-values: + +```javascript +// Prepare data +const data1 = [ + { x: 10, y: 20 }, + { x: 15, y: 25 }, + { x: 20, y: 30 }, +]; +const data2 = [ + { x: 12, y: 18 }, + { x: 18, y: 22 }, +]; + +const allXValues = [...data1.map((d) => d.x), ...data2.map((d) => d.x)]; + +slide.addChart( + pptx.charts.SCATTER, + [ + { name: "X-Axis", values: allXValues }, // First series = X values + { name: "Series 1", values: data1.map((d) => d.y) }, // Y values only + { name: "Series 2", values: data2.map((d) => d.y) }, // Y values only + ], + { + x: 1, + y: 1, + w: 8, + h: 4, + lineSize: 0, // 0 = no connecting lines + lineDataSymbol: "circle", + lineDataSymbolSize: 6, + showCatAxisTitle: true, + catAxisTitle: "X Axis", + showValAxisTitle: true, + valAxisTitle: "Y Axis", + chartColors: ["4472C4", "ED7D31"], + } +); +``` + +#### Line Chart + +```javascript +slide.addChart( + pptx.charts.LINE, + [ + { + name: "Temperature", + labels: ["Jan", "Feb", "Mar", "Apr"], + values: [32, 35, 42, 55], + }, + ], + { + x: 1, + y: 1, + w: 8, + h: 4, + lineSize: 4, + lineSmooth: true, + // Required axis labels + showCatAxisTitle: true, + catAxisTitle: "Month", + showValAxisTitle: true, + valAxisTitle: "Temperature (°F)", + // Optional: Y-axis range (set min based on data range for better visualization) + valAxisMinVal: 0, // For ranges starting at 0 (counts, percentages, etc.) + valAxisMaxVal: 60, + valAxisMajorUnit: 20, // Control y-axis label spacing to prevent crowding (e.g., 10, 20, 25) + // valAxisMinVal: 30, // PREFERRED: For data clustered in a range (e.g., 32-55 or ratings 3-5), start axis closer to min value to show variation + // Optional: Chart colors + chartColors: ["4472C4", "ED7D31", "A5A5A5"], + } +); +``` + +#### Pie Chart (No Axis Labels Required) + +**CRITICAL**: Pie charts require a **single data series** with all categories in the `labels` array and corresponding values in the `values` array. + +```javascript +slide.addChart( + pptx.charts.PIE, + [ + { + name: "Market Share", + labels: ["Product A", "Product B", "Other"], // All categories in one array + values: [35, 45, 20], // All values in one array + }, + ], + { + x: 2, + y: 1, + w: 6, + h: 4, + showPercent: true, + showLegend: true, + legendPos: "r", // right + chartColors: ["4472C4", "ED7D31", "A5A5A5"], + } +); +``` + +#### Multiple Data Series + +```javascript +slide.addChart( + pptx.charts.LINE, + [ + { + name: "Product A", + labels: ["Q1", "Q2", "Q3", "Q4"], + values: [10, 20, 30, 40], + }, + { + name: "Product B", + labels: ["Q1", "Q2", "Q3", "Q4"], + values: [15, 25, 20, 35], + }, + ], + { + x: 1, + y: 1, + w: 8, + h: 4, + showCatAxisTitle: true, + catAxisTitle: "Quarter", + showValAxisTitle: true, + valAxisTitle: "Revenue ($M)", + } +); +``` + +### Chart Colors + +**CRITICAL**: Use hex colors **without** the `#` prefix - including `#` causes file corruption. + +**Align chart colors with your chosen design palette**, ensuring sufficient contrast and distinctiveness for data visualization. Adjust colors for: + +- Strong contrast between adjacent series +- Readability against slide backgrounds +- Accessibility (avoid red-green only combinations) + +```javascript +// Example: Ocean palette-inspired chart colors (adjusted for contrast) +const chartColors = ["16A085", "FF6B9D", "2C3E50", "F39C12", "9B59B6"]; + +// Single-series chart: Use one color for all bars/points +slide.addChart( + pptx.charts.BAR, + [ + { + name: "Sales", + labels: ["Q1", "Q2", "Q3", "Q4"], + values: [4500, 5500, 6200, 7100], + }, + ], + { + ...placeholders[0], + chartColors: ["16A085"], // All bars same color + showLegend: false, + } +); + +// Multi-series chart: Each series gets a different color +slide.addChart( + pptx.charts.LINE, + [ + { name: "Product A", labels: ["Q1", "Q2", "Q3"], values: [10, 20, 30] }, + { name: "Product B", labels: ["Q1", "Q2", "Q3"], values: [15, 25, 20] }, + ], + { + ...placeholders[0], + chartColors: ["16A085", "FF6B9D"], // One color per series + } +); +``` + +### Adding Tables + +Tables can be added with basic or advanced formatting: + +#### Basic Table + +```javascript +slide.addTable( + [ + ["Header 1", "Header 2", "Header 3"], + ["Row 1, Col 1", "Row 1, Col 2", "Row 1, Col 3"], + ["Row 2, Col 1", "Row 2, Col 2", "Row 2, Col 3"], + ], + { + x: 0.5, + y: 1, + w: 9, + h: 3, + border: { pt: 1, color: "999999" }, + fill: { color: "F1F1F1" }, + } +); +``` + +#### Table with Custom Formatting + +```javascript +const tableData = [ + // Header row with custom styling + [ + { + text: "Product", + options: { fill: { color: "4472C4" }, color: "FFFFFF", bold: true }, + }, + { + text: "Revenue", + options: { fill: { color: "4472C4" }, color: "FFFFFF", bold: true }, + }, + { + text: "Growth", + options: { fill: { color: "4472C4" }, color: "FFFFFF", bold: true }, + }, + ], + // Data rows + ["Product A", "$50M", "+15%"], + ["Product B", "$35M", "+22%"], + ["Product C", "$28M", "+8%"], +]; + +slide.addTable(tableData, { + x: 1, + y: 1.5, + w: 8, + h: 3, + colW: [3, 2.5, 2.5], // Column widths + rowH: [0.5, 0.6, 0.6, 0.6], // Row heights + border: { pt: 1, color: "CCCCCC" }, + align: "center", + valign: "middle", + fontSize: 14, +}); +``` + +#### Table with Merged Cells + +```javascript +const mergedTableData = [ + [ + { + text: "Q1 Results", + options: { + colspan: 3, + fill: { color: "4472C4" }, + color: "FFFFFF", + bold: true, + }, + }, + ], + ["Product", "Sales", "Market Share"], + ["Product A", "$25M", "35%"], + ["Product B", "$18M", "25%"], +]; + +slide.addTable(mergedTableData, { + x: 1, + y: 1, + w: 8, + h: 2.5, + colW: [3, 2.5, 2.5], + border: { pt: 1, color: "DDDDDD" }, +}); +``` + +### Table Options + +Common table options: + +- `x, y, w, h` - Position and size +- `colW` - Array of column widths (in inches) +- `rowH` - Array of row heights (in inches) +- `border` - Border style: `{ pt: 1, color: "999999" }` +- `fill` - Background color (no # prefix) +- `align` - Text alignment: "left", "center", "right" +- `valign` - Vertical alignment: "top", "middle", "bottom" +- `fontSize` - Text size +- `autoPage` - Auto-create new slides if content overflows diff --git a/skills/pptx/html2pptx.tgz b/skills/pptx/html2pptx.tgz new file mode 100644 index 0000000..099e080 Binary files /dev/null and b/skills/pptx/html2pptx.tgz differ diff --git a/skills/pptx/ooxml.md b/skills/pptx/ooxml.md new file mode 100644 index 0000000..951b3cf --- /dev/null +++ b/skills/pptx/ooxml.md @@ -0,0 +1,427 @@ +# Office Open XML Technical Reference for PowerPoint + +**Important: Read this entire document before starting.** Critical XML schema rules and formatting requirements are covered throughout. Incorrect implementation can create invalid PPTX files that PowerPoint cannot open. + +## Technical Guidelines + +### Schema Compliance +- **Element ordering in ``**: ``, ``, `` +- **Whitespace**: Add `xml:space='preserve'` to `` elements with leading/trailing spaces +- **Unicode**: Escape characters in ASCII content: `"` becomes `“` +- **Images**: Add to `ppt/media/`, reference in slide XML, set dimensions to fit slide bounds +- **Relationships**: Update `ppt/slides/_rels/slideN.xml.rels` for each slide's resources +- **Dirty attribute**: Add `dirty="0"` to `` and `` elements to indicate clean state + +## Presentation Structure + +### Basic Slide Structure +```xml + + + + + ... + ... + + + + +``` + +### Text Box / Shape with Text +```xml + + + + + + + + + + + + + + + + + + + + + + Slide Title + + + + +``` + +### Text Formatting +```xml + + + + Bold Text + + + + + + Italic Text + + + + + + Underlined + + + + + + + + + + Highlighted Text + + + + + + + + + + Colored Arial 24pt + + + + + + + + + + Formatted text + +``` + +### Lists +```xml + + + + + + + First bullet point + + + + + + + + + + First numbered item + + + + + + + + + + Indented bullet + + +``` + +### Shapes +```xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +``` + +### Images +```xml + + + + + + + + + + + + + + + + + + + + + + + + + + +``` + +### Tables +```xml + + + + + + + + + + + + + + + + + + + + + + + + + + + Cell 1 + + + + + + + + + + + Cell 2 + + + + + + + + + +``` + +### Slide Layouts + +```xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +``` + +## File Updates + +When adding content, update these files: + +**`ppt/_rels/presentation.xml.rels`:** +```xml + + +``` + +**`ppt/slides/_rels/slide1.xml.rels`:** +```xml + + +``` + +**`[Content_Types].xml`:** +```xml + + + +``` + +**`ppt/presentation.xml`:** +```xml + + + + +``` + +**`docProps/app.xml`:** Update slide count and statistics +```xml +2 +10 +50 +``` + +## Slide Operations + +### Adding a New Slide +When adding a slide to the end of the presentation: + +1. **Create the slide file** (`ppt/slides/slideN.xml`) +2. **Update `[Content_Types].xml`**: Add Override for the new slide +3. **Update `ppt/_rels/presentation.xml.rels`**: Add relationship for the new slide +4. **Update `ppt/presentation.xml`**: Add slide ID to `` +5. **Create slide relationships** (`ppt/slides/_rels/slideN.xml.rels`) if needed +6. **Update `docProps/app.xml`**: Increment slide count and update statistics (if present) + +### Duplicating a Slide +1. Copy the source slide XML file with a new name +2. Update all IDs in the new slide to be unique +3. Follow the "Adding a New Slide" steps above +4. **CRITICAL**: Remove or update any notes slide references in `_rels` files +5. Remove references to unused media files + +### Reordering Slides +1. **Update `ppt/presentation.xml`**: Reorder `` elements in `` +2. The order of `` elements determines slide order +3. Keep slide IDs and relationship IDs unchanged + +Example: +```xml + + + + + + + + + + + + + +``` + +### Deleting a Slide +1. **Remove from `ppt/presentation.xml`**: Delete the `` entry +2. **Remove from `ppt/_rels/presentation.xml.rels`**: Delete the relationship +3. **Remove from `[Content_Types].xml`**: Delete the Override entry +4. **Delete files**: Remove `ppt/slides/slideN.xml` and `ppt/slides/_rels/slideN.xml.rels` +5. **Update `docProps/app.xml`**: Decrement slide count and update statistics +6. **Clean up unused media**: Remove orphaned images from `ppt/media/` + +Note: Don't renumber remaining slides - keep their original IDs and filenames. + + +## Common Errors to Avoid + +- **Encodings**: Escape unicode characters in ASCII content: `"` becomes `“` +- **Images**: Add to `ppt/media/` and update relationship files +- **Lists**: Omit bullets from list headers +- **IDs**: Use valid hexadecimal values for UUIDs +- **Themes**: Check all themes in `theme` directory for colors + +## Validation Checklist for Template-Based Presentations + +### Before Packing, Always: +- **Clean unused resources**: Remove unreferenced media, fonts, and notes directories +- **Fix Content_Types.xml**: Declare ALL slides, layouts, and themes present in the package +- **Fix relationship IDs**: + - Remove font embed references if not using embedded fonts +- **Remove broken references**: Check all `_rels` files for references to deleted resources + +### Common Template Duplication Pitfalls: +- Multiple slides referencing the same notes slide after duplication +- Image/media references from template slides that no longer exist +- Font embedding references when fonts aren't included +- Missing slideLayout declarations for layouts 12-25 +- docProps directory may not unpack - this is optional \ No newline at end of file diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd new file mode 100644 index 0000000..bc325f9 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd @@ -0,0 +1,1499 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd new file mode 100644 index 0000000..afa4f46 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd @@ -0,0 +1,146 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd new file mode 100644 index 0000000..40e4b12 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd @@ -0,0 +1,1085 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd new file mode 100644 index 0000000..687eea8 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd @@ -0,0 +1,11 @@ + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd new file mode 100644 index 0000000..94644b3 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd @@ -0,0 +1,3081 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd new file mode 100644 index 0000000..1dbf051 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd new file mode 100644 index 0000000..f1af17d --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd @@ -0,0 +1,185 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd new file mode 100644 index 0000000..5c00a6f --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd @@ -0,0 +1,287 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd new file mode 100644 index 0000000..25564eb --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd @@ -0,0 +1,1676 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd new file mode 100644 index 0000000..c20f3bf --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd new file mode 100644 index 0000000..ac60252 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd @@ -0,0 +1,144 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd new file mode 100644 index 0000000..52deec7 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd @@ -0,0 +1,174 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd new file mode 100644 index 0000000..2bddce2 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd new file mode 100644 index 0000000..8a8c18b --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd new file mode 100644 index 0000000..5c42706 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd @@ -0,0 +1,59 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd new file mode 100644 index 0000000..853c341 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd @@ -0,0 +1,56 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd new file mode 100644 index 0000000..da835ee --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd @@ -0,0 +1,195 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd new file mode 100644 index 0000000..4f37d30 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd @@ -0,0 +1,582 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd new file mode 100644 index 0000000..9e86f1b --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd new file mode 100644 index 0000000..237dd65 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd @@ -0,0 +1,4439 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd new file mode 100644 index 0000000..eeb4ef8 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd @@ -0,0 +1,570 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd new file mode 100644 index 0000000..ca2575c --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd @@ -0,0 +1,509 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd new file mode 100644 index 0000000..dd079e6 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd @@ -0,0 +1,12 @@ + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd new file mode 100644 index 0000000..3dd6cf6 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd @@ -0,0 +1,108 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd new file mode 100644 index 0000000..f1041e3 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd @@ -0,0 +1,96 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd new file mode 100644 index 0000000..9c5b7a6 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd @@ -0,0 +1,3646 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd new file mode 100644 index 0000000..fbd8876 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd @@ -0,0 +1,116 @@ + + + + + + See http://www.w3.org/XML/1998/namespace.html and + http://www.w3.org/TR/REC-xml for information about this namespace. + + This schema document describes the XML namespace, in a form + suitable for import by other schema documents. + + Note that local names in this namespace are intended to be defined + only by the World Wide Web Consortium or its subgroups. The + following names are currently defined in this namespace and should + not be used with conflicting semantics by any Working Group, + specification, or document instance: + + base (as an attribute name): denotes an attribute whose value + provides a URI to be used as the base for interpreting any + relative URIs in the scope of the element on which it + appears; its value is inherited. This name is reserved + by virtue of its definition in the XML Base specification. + + lang (as an attribute name): denotes an attribute whose value + is a language code for the natural language of the content of + any element; its value is inherited. This name is reserved + by virtue of its definition in the XML specification. + + space (as an attribute name): denotes an attribute whose + value is a keyword indicating what whitespace processing + discipline is intended for the content of the element; its + value is inherited. This name is reserved by virtue of its + definition in the XML specification. + + Father (in any context at all): denotes Jon Bosak, the chair of + the original XML Working Group. This name is reserved by + the following decision of the W3C XML Plenary and + XML Coordination groups: + + In appreciation for his vision, leadership and dedication + the W3C XML Plenary on this 10th day of February, 2000 + reserves for Jon Bosak in perpetuity the XML name + xml:Father + + + + + This schema defines attributes and an attribute group + suitable for use by + schemas wishing to allow xml:base, xml:lang or xml:space attributes + on elements they define. + + To enable this, such a schema must import this schema + for the XML namespace, e.g. as follows: + <schema . . .> + . . . + <import namespace="http://www.w3.org/XML/1998/namespace" + schemaLocation="http://www.w3.org/2001/03/xml.xsd"/> + + Subsequently, qualified reference to any of the attributes + or the group defined below will have the desired effect, e.g. + + <type . . .> + . . . + <attributeGroup ref="xml:specialAttrs"/> + + will define a type which will schema-validate an instance + element with any of those attributes + + + + In keeping with the XML Schema WG's standard versioning + policy, this schema document will persist at + http://www.w3.org/2001/03/xml.xsd. + At the date of issue it can also be found at + http://www.w3.org/2001/xml.xsd. + The schema document at that URI may however change in the future, + in order to remain compatible with the latest version of XML Schema + itself. In other words, if the XML Schema namespace changes, the version + of this document at + http://www.w3.org/2001/xml.xsd will change + accordingly; the version at + http://www.w3.org/2001/03/xml.xsd will not change. + + + + + + In due course, we should install the relevant ISO 2- and 3-letter + codes as the enumerated possible values . . . + + + + + + + + + + + + + + + See http://www.w3.org/TR/xmlbase/ for + information about this attribute. + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd b/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd new file mode 100644 index 0000000..e4c5160 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd @@ -0,0 +1,42 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd b/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd new file mode 100644 index 0000000..888c0fc --- /dev/null +++ b/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd @@ -0,0 +1,50 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd b/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd new file mode 100644 index 0000000..7378226 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd b/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd new file mode 100644 index 0000000..762dcbe --- /dev/null +++ b/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/mce/mc.xsd b/skills/pptx/ooxml/schemas/mce/mc.xsd new file mode 100644 index 0000000..ef72545 --- /dev/null +++ b/skills/pptx/ooxml/schemas/mce/mc.xsd @@ -0,0 +1,75 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/microsoft/wml-2010.xsd b/skills/pptx/ooxml/schemas/microsoft/wml-2010.xsd new file mode 100644 index 0000000..f65f777 --- /dev/null +++ b/skills/pptx/ooxml/schemas/microsoft/wml-2010.xsd @@ -0,0 +1,560 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/microsoft/wml-2012.xsd b/skills/pptx/ooxml/schemas/microsoft/wml-2012.xsd new file mode 100644 index 0000000..6b00755 --- /dev/null +++ b/skills/pptx/ooxml/schemas/microsoft/wml-2012.xsd @@ -0,0 +1,67 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/microsoft/wml-2018.xsd b/skills/pptx/ooxml/schemas/microsoft/wml-2018.xsd new file mode 100644 index 0000000..f321d33 --- /dev/null +++ b/skills/pptx/ooxml/schemas/microsoft/wml-2018.xsd @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/microsoft/wml-cex-2018.xsd b/skills/pptx/ooxml/schemas/microsoft/wml-cex-2018.xsd new file mode 100644 index 0000000..364c6a9 --- /dev/null +++ b/skills/pptx/ooxml/schemas/microsoft/wml-cex-2018.xsd @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/microsoft/wml-cid-2016.xsd b/skills/pptx/ooxml/schemas/microsoft/wml-cid-2016.xsd new file mode 100644 index 0000000..fed9d15 --- /dev/null +++ b/skills/pptx/ooxml/schemas/microsoft/wml-cid-2016.xsd @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd b/skills/pptx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd new file mode 100644 index 0000000..680cf15 --- /dev/null +++ b/skills/pptx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd @@ -0,0 +1,4 @@ + + + + diff --git a/skills/pptx/ooxml/schemas/microsoft/wml-symex-2015.xsd b/skills/pptx/ooxml/schemas/microsoft/wml-symex-2015.xsd new file mode 100644 index 0000000..89ada90 --- /dev/null +++ b/skills/pptx/ooxml/schemas/microsoft/wml-symex-2015.xsd @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/skills/pptx/ooxml/scripts/pack.py b/skills/pptx/ooxml/scripts/pack.py new file mode 100644 index 0000000..68bc088 --- /dev/null +++ b/skills/pptx/ooxml/scripts/pack.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +""" +Tool to pack a directory into a .docx, .pptx, or .xlsx file with XML formatting undone. + +Example usage: + python pack.py [--force] +""" + +import argparse +import shutil +import subprocess +import sys +import tempfile +import defusedxml.minidom +import zipfile +from pathlib import Path + + +def main(): + parser = argparse.ArgumentParser(description="Pack a directory into an Office file") + parser.add_argument("input_directory", help="Unpacked Office document directory") + parser.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)") + parser.add_argument("--force", action="store_true", help="Skip validation") + args = parser.parse_args() + + try: + success = pack_document( + args.input_directory, args.output_file, validate=not args.force + ) + + # Show warning if validation was skipped + if args.force: + print("Warning: Skipped validation, file may be corrupt", file=sys.stderr) + # Exit with error if validation failed + elif not success: + print("Contents would produce a corrupt file.", file=sys.stderr) + print("Please validate XML before repacking.", file=sys.stderr) + print("Use --force to skip validation and pack anyway.", file=sys.stderr) + sys.exit(1) + + except ValueError as e: + sys.exit(f"Error: {e}") + + +def pack_document(input_dir, output_file, validate=False): + """Pack a directory into an Office file (.docx/.pptx/.xlsx). + + Args: + input_dir: Path to unpacked Office document directory + output_file: Path to output Office file + validate: If True, validates with soffice (default: False) + + Returns: + bool: True if successful, False if validation failed + """ + input_dir = Path(input_dir) + output_file = Path(output_file) + + if not input_dir.is_dir(): + raise ValueError(f"{input_dir} is not a directory") + if output_file.suffix.lower() not in {".docx", ".pptx", ".xlsx"}: + raise ValueError(f"{output_file} must be a .docx, .pptx, or .xlsx file") + + # Work in temporary directory to avoid modifying original + with tempfile.TemporaryDirectory() as temp_dir: + temp_content_dir = Path(temp_dir) / "content" + shutil.copytree(input_dir, temp_content_dir) + + # Process XML files to remove pretty-printing whitespace + for pattern in ["*.xml", "*.rels"]: + for xml_file in temp_content_dir.rglob(pattern): + condense_xml(xml_file) + + # Create final Office file as zip archive + output_file.parent.mkdir(parents=True, exist_ok=True) + with zipfile.ZipFile(output_file, "w", zipfile.ZIP_DEFLATED) as zf: + for f in temp_content_dir.rglob("*"): + if f.is_file(): + zf.write(f, f.relative_to(temp_content_dir)) + + # Validate if requested + if validate: + if not validate_document(output_file): + output_file.unlink() # Delete the corrupt file + return False + + return True + + +def validate_document(doc_path): + """Validate document by converting to HTML with soffice.""" + # Determine the correct filter based on file extension + match doc_path.suffix.lower(): + case ".docx": + filter_name = "html:HTML" + case ".pptx": + filter_name = "html:impress_html_Export" + case ".xlsx": + filter_name = "html:HTML (StarCalc)" + + with tempfile.TemporaryDirectory() as temp_dir: + try: + result = subprocess.run( + [ + "soffice", + "--headless", + "--convert-to", + filter_name, + "--outdir", + temp_dir, + str(doc_path), + ], + capture_output=True, + timeout=10, + text=True, + ) + if not (Path(temp_dir) / f"{doc_path.stem}.html").exists(): + error_msg = result.stderr.strip() or "Document validation failed" + print(f"Validation error: {error_msg}", file=sys.stderr) + return False + return True + except FileNotFoundError: + print("Warning: soffice not found. Skipping validation.", file=sys.stderr) + return True + except subprocess.TimeoutExpired: + print("Validation error: Timeout during conversion", file=sys.stderr) + return False + except Exception as e: + print(f"Validation error: {e}", file=sys.stderr) + return False + + +def condense_xml(xml_file): + """Strip unnecessary whitespace and remove comments.""" + with open(xml_file, "r", encoding="utf-8") as f: + dom = defusedxml.minidom.parse(f) + + # Process each element to remove whitespace and comments + for element in dom.getElementsByTagName("*"): + # Skip w:t elements and their processing + if element.tagName.endswith(":t"): + continue + + # Remove whitespace-only text nodes and comment nodes + for child in list(element.childNodes): + if ( + child.nodeType == child.TEXT_NODE + and child.nodeValue + and child.nodeValue.strip() == "" + ) or child.nodeType == child.COMMENT_NODE: + element.removeChild(child) + + # Write back the condensed XML + with open(xml_file, "wb") as f: + f.write(dom.toxml(encoding="UTF-8")) + + +if __name__ == "__main__": + main() diff --git a/skills/pptx/ooxml/scripts/unpack.py b/skills/pptx/ooxml/scripts/unpack.py new file mode 100644 index 0000000..4938798 --- /dev/null +++ b/skills/pptx/ooxml/scripts/unpack.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +"""Unpack and format XML contents of Office files (.docx, .pptx, .xlsx)""" + +import random +import sys +import defusedxml.minidom +import zipfile +from pathlib import Path + +# Get command line arguments +assert len(sys.argv) == 3, "Usage: python unpack.py " +input_file, output_dir = sys.argv[1], sys.argv[2] + +# Extract and format +output_path = Path(output_dir) +output_path.mkdir(parents=True, exist_ok=True) +zipfile.ZipFile(input_file).extractall(output_path) + +# Pretty print all XML files +xml_files = list(output_path.rglob("*.xml")) + list(output_path.rglob("*.rels")) +for xml_file in xml_files: + content = xml_file.read_text(encoding="utf-8") + dom = defusedxml.minidom.parseString(content) + xml_file.write_bytes(dom.toprettyxml(indent=" ", encoding="ascii")) + +# For .docx files, suggest an RSID for tracked changes +if input_file.endswith(".docx"): + suggested_rsid = "".join(random.choices("0123456789ABCDEF", k=8)) + print(f"Suggested RSID for edit session: {suggested_rsid}") diff --git a/skills/pptx/ooxml/scripts/validate.py b/skills/pptx/ooxml/scripts/validate.py new file mode 100644 index 0000000..508c589 --- /dev/null +++ b/skills/pptx/ooxml/scripts/validate.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +""" +Command line tool to validate Office document XML files against XSD schemas and tracked changes. + +Usage: + python validate.py --original +""" + +import argparse +import sys +from pathlib import Path + +from validation import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator + + +def main(): + parser = argparse.ArgumentParser(description="Validate Office document XML files") + parser.add_argument( + "unpacked_dir", + help="Path to unpacked Office document directory", + ) + parser.add_argument( + "--original", + required=True, + help="Path to original file (.docx/.pptx/.xlsx)", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Enable verbose output", + ) + args = parser.parse_args() + + # Validate paths + unpacked_dir = Path(args.unpacked_dir) + original_file = Path(args.original) + file_extension = original_file.suffix.lower() + assert unpacked_dir.is_dir(), f"Error: {unpacked_dir} is not a directory" + assert original_file.is_file(), f"Error: {original_file} is not a file" + assert file_extension in [".docx", ".pptx", ".xlsx"], ( + f"Error: {original_file} must be a .docx, .pptx, or .xlsx file" + ) + + # Run validations + match file_extension: + case ".docx": + validators = [DOCXSchemaValidator, RedliningValidator] + case ".pptx": + validators = [PPTXSchemaValidator] + case _: + print(f"Error: Validation not supported for file type {file_extension}") + sys.exit(1) + + # Run validators + success = True + for V in validators: + validator = V(unpacked_dir, original_file, verbose=args.verbose) + if not validator.validate(): + success = False + + if success: + print("All validations PASSED!") + + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/skills/pptx/ooxml/scripts/validation/__init__.py b/skills/pptx/ooxml/scripts/validation/__init__.py new file mode 100644 index 0000000..db092ec --- /dev/null +++ b/skills/pptx/ooxml/scripts/validation/__init__.py @@ -0,0 +1,15 @@ +""" +Validation modules for Word document processing. +""" + +from .base import BaseSchemaValidator +from .docx import DOCXSchemaValidator +from .pptx import PPTXSchemaValidator +from .redlining import RedliningValidator + +__all__ = [ + "BaseSchemaValidator", + "DOCXSchemaValidator", + "PPTXSchemaValidator", + "RedliningValidator", +] diff --git a/skills/pptx/ooxml/scripts/validation/base.py b/skills/pptx/ooxml/scripts/validation/base.py new file mode 100644 index 0000000..0681b19 --- /dev/null +++ b/skills/pptx/ooxml/scripts/validation/base.py @@ -0,0 +1,951 @@ +""" +Base validator with common validation logic for document files. +""" + +import re +from pathlib import Path + +import lxml.etree + + +class BaseSchemaValidator: + """Base validator with common validation logic for document files.""" + + # Elements whose 'id' attributes must be unique within their file + # Format: element_name -> (attribute_name, scope) + # scope can be 'file' (unique within file) or 'global' (unique across all files) + UNIQUE_ID_REQUIREMENTS = { + # Word elements + "comment": ("id", "file"), # Comment IDs in comments.xml + "commentrangestart": ("id", "file"), # Must match comment IDs + "commentrangeend": ("id", "file"), # Must match comment IDs + "bookmarkstart": ("id", "file"), # Bookmark start IDs + "bookmarkend": ("id", "file"), # Bookmark end IDs + # Note: ins and del (track changes) can share IDs when part of same revision + # PowerPoint elements + "sldid": ("id", "file"), # Slide IDs in presentation.xml + "sldmasterid": ("id", "global"), # Slide master IDs must be globally unique + "sldlayoutid": ("id", "global"), # Slide layout IDs must be globally unique + "cm": ("authorid", "file"), # Comment author IDs + # Excel elements + "sheet": ("sheetid", "file"), # Sheet IDs in workbook.xml + "definedname": ("id", "file"), # Named range IDs + # Drawing/Shape elements (all formats) + "cxnsp": ("id", "file"), # Connection shape IDs + "sp": ("id", "file"), # Shape IDs + "pic": ("id", "file"), # Picture IDs + "grpsp": ("id", "file"), # Group shape IDs + } + + # Mapping of element names to expected relationship types + # Subclasses should override this with format-specific mappings + ELEMENT_RELATIONSHIP_TYPES = {} + + # Unified schema mappings for all Office document types + SCHEMA_MAPPINGS = { + # Document type specific schemas + "word": "ISO-IEC29500-4_2016/wml.xsd", # Word documents + "ppt": "ISO-IEC29500-4_2016/pml.xsd", # PowerPoint presentations + "xl": "ISO-IEC29500-4_2016/sml.xsd", # Excel spreadsheets + # Common file types + "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd", + "app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd", + "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd", + "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd", + ".rels": "ecma/fouth-edition/opc-relationships.xsd", + # Word-specific files + "people.xml": "microsoft/wml-2012.xsd", + "commentsIds.xml": "microsoft/wml-cid-2016.xsd", + "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd", + "commentsExtended.xml": "microsoft/wml-2012.xsd", + # Chart files (common across document types) + "chart": "ISO-IEC29500-4_2016/dml-chart.xsd", + # Theme files (common across document types) + "theme": "ISO-IEC29500-4_2016/dml-main.xsd", + # Drawing and media files + "drawing": "ISO-IEC29500-4_2016/dml-main.xsd", + } + + # Unified namespace constants + MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006" + XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace" + + # Common OOXML namespaces used across validators + PACKAGE_RELATIONSHIPS_NAMESPACE = ( + "http://schemas.openxmlformats.org/package/2006/relationships" + ) + OFFICE_RELATIONSHIPS_NAMESPACE = ( + "http://schemas.openxmlformats.org/officeDocument/2006/relationships" + ) + CONTENT_TYPES_NAMESPACE = ( + "http://schemas.openxmlformats.org/package/2006/content-types" + ) + + # Folders where we should clean ignorable namespaces + MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"} + + # All allowed OOXML namespaces (superset of all document types) + OOXML_NAMESPACES = { + "http://schemas.openxmlformats.org/officeDocument/2006/math", + "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + "http://schemas.openxmlformats.org/schemaLibrary/2006/main", + "http://schemas.openxmlformats.org/drawingml/2006/main", + "http://schemas.openxmlformats.org/drawingml/2006/chart", + "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/diagram", + "http://schemas.openxmlformats.org/drawingml/2006/picture", + "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", + "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + "http://schemas.openxmlformats.org/presentationml/2006/main", + "http://schemas.openxmlformats.org/spreadsheetml/2006/main", + "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes", + "http://www.w3.org/XML/1998/namespace", + } + + def __init__(self, unpacked_dir, original_file, verbose=False): + self.unpacked_dir = Path(unpacked_dir).resolve() + self.original_file = Path(original_file) + self.verbose = verbose + + # Set schemas directory + self.schemas_dir = Path(__file__).parent.parent.parent / "schemas" + + # Get all XML and .rels files + patterns = ["*.xml", "*.rels"] + self.xml_files = [ + f for pattern in patterns for f in self.unpacked_dir.rglob(pattern) + ] + + if not self.xml_files: + print(f"Warning: No XML files found in {self.unpacked_dir}") + + def validate(self): + """Run all validation checks and return True if all pass.""" + raise NotImplementedError("Subclasses must implement the validate method") + + def validate_xml(self): + """Validate that all XML files are well-formed.""" + errors = [] + + for xml_file in self.xml_files: + try: + # Try to parse the XML file + lxml.etree.parse(str(xml_file)) + except lxml.etree.XMLSyntaxError as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {e.lineno}: {e.msg}" + ) + except Exception as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Unexpected error: {str(e)}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} XML violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All XML files are well-formed") + return True + + def validate_namespaces(self): + """Validate that namespace prefixes in Ignorable attributes are declared.""" + errors = [] + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + declared = set(root.nsmap.keys()) - {None} # Exclude default namespace + + for attr_val in [ + v for k, v in root.attrib.items() if k.endswith("Ignorable") + ]: + undeclared = set(attr_val.split()) - declared + errors.extend( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Namespace '{ns}' in Ignorable but not declared" + for ns in undeclared + ) + except lxml.etree.XMLSyntaxError: + continue + + if errors: + print(f"FAILED - {len(errors)} namespace issues:") + for error in errors: + print(error) + return False + if self.verbose: + print("PASSED - All namespace prefixes properly declared") + return True + + def validate_unique_ids(self): + """Validate that specific IDs are unique according to OOXML requirements.""" + errors = [] + global_ids = {} # Track globally unique IDs across all files + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + file_ids = {} # Track IDs that must be unique within this file + + # Remove all mc:AlternateContent elements from the tree + mc_elements = root.xpath( + ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE} + ) + for elem in mc_elements: + elem.getparent().remove(elem) + + # Now check IDs in the cleaned tree + for elem in root.iter(): + # Get the element name without namespace + tag = ( + elem.tag.split("}")[-1].lower() + if "}" in elem.tag + else elem.tag.lower() + ) + + # Check if this element type has ID uniqueness requirements + if tag in self.UNIQUE_ID_REQUIREMENTS: + attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag] + + # Look for the specified attribute + id_value = None + for attr, value in elem.attrib.items(): + attr_local = ( + attr.split("}")[-1].lower() + if "}" in attr + else attr.lower() + ) + if attr_local == attr_name: + id_value = value + break + + if id_value is not None: + if scope == "global": + # Check global uniqueness + if id_value in global_ids: + prev_file, prev_line, prev_tag = global_ids[ + id_value + ] + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: Global ID '{id_value}' in <{tag}> " + f"already used in {prev_file} at line {prev_line} in <{prev_tag}>" + ) + else: + global_ids[id_value] = ( + xml_file.relative_to(self.unpacked_dir), + elem.sourceline, + tag, + ) + elif scope == "file": + # Check file-level uniqueness + key = (tag, attr_name) + if key not in file_ids: + file_ids[key] = {} + + if id_value in file_ids[key]: + prev_line = file_ids[key][id_value] + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> " + f"(first occurrence at line {prev_line})" + ) + else: + file_ids[key][id_value] = elem.sourceline + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} ID uniqueness violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All required IDs are unique") + return True + + def validate_file_references(self): + """ + Validate that all .rels files properly reference files and that all files are referenced. + """ + errors = [] + + # Find all .rels files + rels_files = list(self.unpacked_dir.rglob("*.rels")) + + if not rels_files: + if self.verbose: + print("PASSED - No .rels files found") + return True + + # Get all files in the unpacked directory (excluding reference files) + all_files = [] + for file_path in self.unpacked_dir.rglob("*"): + if ( + file_path.is_file() + and file_path.name != "[Content_Types].xml" + and not file_path.name.endswith(".rels") + ): # This file is not referenced by .rels + all_files.append(file_path.resolve()) + + # Track all files that are referenced by any .rels file + all_referenced_files = set() + + if self.verbose: + print( + f"Found {len(rels_files)} .rels files and {len(all_files)} target files" + ) + + # Check each .rels file + for rels_file in rels_files: + try: + # Parse relationships file + rels_root = lxml.etree.parse(str(rels_file)).getroot() + + # Get the directory where this .rels file is located + rels_dir = rels_file.parent + + # Find all relationships and their targets + referenced_files = set() + broken_refs = [] + + for rel in rels_root.findall( + ".//ns:Relationship", + namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE}, + ): + target = rel.get("Target") + if target and not target.startswith( + ("http", "mailto:") + ): # Skip external URLs + # Resolve the target path relative to the .rels file location + if rels_file.name == ".rels": + # Root .rels file - targets are relative to unpacked_dir + target_path = self.unpacked_dir / target + else: + # Other .rels files - targets are relative to their parent's parent + # e.g., word/_rels/document.xml.rels -> targets relative to word/ + base_dir = rels_dir.parent + target_path = base_dir / target + + # Normalize the path and check if it exists + try: + target_path = target_path.resolve() + if target_path.exists() and target_path.is_file(): + referenced_files.add(target_path) + all_referenced_files.add(target_path) + else: + broken_refs.append((target, rel.sourceline)) + except (OSError, ValueError): + broken_refs.append((target, rel.sourceline)) + + # Report broken references + if broken_refs: + rel_path = rels_file.relative_to(self.unpacked_dir) + for broken_ref, line_num in broken_refs: + errors.append( + f" {rel_path}: Line {line_num}: Broken reference to {broken_ref}" + ) + + except Exception as e: + rel_path = rels_file.relative_to(self.unpacked_dir) + errors.append(f" Error parsing {rel_path}: {e}") + + # Check for unreferenced files (files that exist but are not referenced anywhere) + unreferenced_files = set(all_files) - all_referenced_files + + if unreferenced_files: + for unref_file in sorted(unreferenced_files): + unref_rel_path = unref_file.relative_to(self.unpacked_dir) + errors.append(f" Unreferenced file: {unref_rel_path}") + + if errors: + print(f"FAILED - Found {len(errors)} relationship validation errors:") + for error in errors: + print(error) + print( + "CRITICAL: These errors will cause the document to appear corrupt. " + + "Broken references MUST be fixed, " + + "and unreferenced files MUST be referenced or removed." + ) + return False + else: + if self.verbose: + print( + "PASSED - All references are valid and all files are properly referenced" + ) + return True + + def validate_all_relationship_ids(self): + """ + Validate that all r:id attributes in XML files reference existing IDs + in their corresponding .rels files, and optionally validate relationship types. + """ + import lxml.etree + + errors = [] + + # Process each XML file that might contain r:id references + for xml_file in self.xml_files: + # Skip .rels files themselves + if xml_file.suffix == ".rels": + continue + + # Determine the corresponding .rels file + # For dir/file.xml, it's dir/_rels/file.xml.rels + rels_dir = xml_file.parent / "_rels" + rels_file = rels_dir / f"{xml_file.name}.rels" + + # Skip if there's no corresponding .rels file (that's okay) + if not rels_file.exists(): + continue + + try: + # Parse the .rels file to get valid relationship IDs and their types + rels_root = lxml.etree.parse(str(rels_file)).getroot() + rid_to_type = {} + + for rel in rels_root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rid = rel.get("Id") + rel_type = rel.get("Type", "") + if rid: + # Check for duplicate rIds + if rid in rid_to_type: + rels_rel_path = rels_file.relative_to(self.unpacked_dir) + errors.append( + f" {rels_rel_path}: Line {rel.sourceline}: " + f"Duplicate relationship ID '{rid}' (IDs must be unique)" + ) + # Extract just the type name from the full URL + type_name = ( + rel_type.split("/")[-1] if "/" in rel_type else rel_type + ) + rid_to_type[rid] = type_name + + # Parse the XML file to find all r:id references + xml_root = lxml.etree.parse(str(xml_file)).getroot() + + # Find all elements with r:id attributes + for elem in xml_root.iter(): + # Check for r:id attribute (relationship ID) + rid_attr = elem.get(f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id") + if rid_attr: + xml_rel_path = xml_file.relative_to(self.unpacked_dir) + elem_name = ( + elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag + ) + + # Check if the ID exists + if rid_attr not in rid_to_type: + errors.append( + f" {xml_rel_path}: Line {elem.sourceline}: " + f"<{elem_name}> references non-existent relationship '{rid_attr}' " + f"(valid IDs: {', '.join(sorted(rid_to_type.keys())[:5])}{'...' if len(rid_to_type) > 5 else ''})" + ) + # Check if we have type expectations for this element + elif self.ELEMENT_RELATIONSHIP_TYPES: + expected_type = self._get_expected_relationship_type( + elem_name + ) + if expected_type: + actual_type = rid_to_type[rid_attr] + # Check if the actual type matches or contains the expected type + if expected_type not in actual_type.lower(): + errors.append( + f" {xml_rel_path}: Line {elem.sourceline}: " + f"<{elem_name}> references '{rid_attr}' which points to '{actual_type}' " + f"but should point to a '{expected_type}' relationship" + ) + + except Exception as e: + xml_rel_path = xml_file.relative_to(self.unpacked_dir) + errors.append(f" Error processing {xml_rel_path}: {e}") + + if errors: + print(f"FAILED - Found {len(errors)} relationship ID reference errors:") + for error in errors: + print(error) + print("\nThese ID mismatches will cause the document to appear corrupt!") + return False + else: + if self.verbose: + print("PASSED - All relationship ID references are valid") + return True + + def _get_expected_relationship_type(self, element_name): + """ + Get the expected relationship type for an element. + First checks the explicit mapping, then tries pattern detection. + """ + # Normalize element name to lowercase + elem_lower = element_name.lower() + + # Check explicit mapping first + if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES: + return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower] + + # Try pattern detection for common patterns + # Pattern 1: Elements ending in "Id" often expect a relationship of the prefix type + if elem_lower.endswith("id") and len(elem_lower) > 2: + # e.g., "sldId" -> "sld", "sldMasterId" -> "sldMaster" + prefix = elem_lower[:-2] # Remove "id" + # Check if this might be a compound like "sldMasterId" + if prefix.endswith("master"): + return prefix.lower() + elif prefix.endswith("layout"): + return prefix.lower() + else: + # Simple case like "sldId" -> "slide" + # Common transformations + if prefix == "sld": + return "slide" + return prefix.lower() + + # Pattern 2: Elements ending in "Reference" expect a relationship of the prefix type + if elem_lower.endswith("reference") and len(elem_lower) > 9: + prefix = elem_lower[:-9] # Remove "reference" + return prefix.lower() + + return None + + def validate_content_types(self): + """Validate that all content files are properly declared in [Content_Types].xml.""" + errors = [] + + # Find [Content_Types].xml file + content_types_file = self.unpacked_dir / "[Content_Types].xml" + if not content_types_file.exists(): + print("FAILED - [Content_Types].xml file not found") + return False + + try: + # Parse and get all declared parts and extensions + root = lxml.etree.parse(str(content_types_file)).getroot() + declared_parts = set() + declared_extensions = set() + + # Get Override declarations (specific files) + for override in root.findall( + f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override" + ): + part_name = override.get("PartName") + if part_name is not None: + declared_parts.add(part_name.lstrip("/")) + + # Get Default declarations (by extension) + for default in root.findall( + f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default" + ): + extension = default.get("Extension") + if extension is not None: + declared_extensions.add(extension.lower()) + + # Root elements that require content type declaration + declarable_roots = { + "sld", + "sldLayout", + "sldMaster", + "presentation", # PowerPoint + "document", # Word + "workbook", + "worksheet", # Excel + "theme", # Common + } + + # Common media file extensions that should be declared + media_extensions = { + "png": "image/png", + "jpg": "image/jpeg", + "jpeg": "image/jpeg", + "gif": "image/gif", + "bmp": "image/bmp", + "tiff": "image/tiff", + "wmf": "image/x-wmf", + "emf": "image/x-emf", + } + + # Get all files in the unpacked directory + all_files = list(self.unpacked_dir.rglob("*")) + all_files = [f for f in all_files if f.is_file()] + + # Check all XML files for Override declarations + for xml_file in self.xml_files: + path_str = str(xml_file.relative_to(self.unpacked_dir)).replace( + "\\", "/" + ) + + # Skip non-content files + if any( + skip in path_str + for skip in [".rels", "[Content_Types]", "docProps/", "_rels/"] + ): + continue + + try: + root_tag = lxml.etree.parse(str(xml_file)).getroot().tag + root_name = root_tag.split("}")[-1] if "}" in root_tag else root_tag + + if root_name in declarable_roots and path_str not in declared_parts: + errors.append( + f" {path_str}: File with <{root_name}> root not declared in [Content_Types].xml" + ) + + except Exception: + continue # Skip unparseable files + + # Check all non-XML files for Default extension declarations + for file_path in all_files: + # Skip XML files and metadata files (already checked above) + if file_path.suffix.lower() in {".xml", ".rels"}: + continue + if file_path.name == "[Content_Types].xml": + continue + if "_rels" in file_path.parts or "docProps" in file_path.parts: + continue + + extension = file_path.suffix.lstrip(".").lower() + if extension and extension not in declared_extensions: + # Check if it's a known media extension that should be declared + if extension in media_extensions: + relative_path = file_path.relative_to(self.unpacked_dir) + errors.append( + f' {relative_path}: File with extension \'{extension}\' not declared in [Content_Types].xml - should add: ' + ) + + except Exception as e: + errors.append(f" Error parsing [Content_Types].xml: {e}") + + if errors: + print(f"FAILED - Found {len(errors)} content type declaration errors:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print( + "PASSED - All content files are properly declared in [Content_Types].xml" + ) + return True + + def validate_file_against_xsd(self, xml_file, verbose=False): + """Validate a single XML file against XSD schema, comparing with original. + + Args: + xml_file: Path to XML file to validate + verbose: Enable verbose output + + Returns: + tuple: (is_valid, new_errors_set) where is_valid is True/False/None (skipped) + """ + # Resolve both paths to handle symlinks + xml_file = Path(xml_file).resolve() + unpacked_dir = self.unpacked_dir.resolve() + + # Validate current file + is_valid, current_errors = self._validate_single_file_xsd( + xml_file, unpacked_dir + ) + + if is_valid is None: + return None, set() # Skipped + elif is_valid: + return True, set() # Valid, no errors + + # Get errors from original file for this specific file + original_errors = self._get_original_file_errors(xml_file) + + # Compare with original (both are guaranteed to be sets here) + assert current_errors is not None + new_errors = current_errors - original_errors + + if new_errors: + if verbose: + relative_path = xml_file.relative_to(unpacked_dir) + print(f"FAILED - {relative_path}: {len(new_errors)} new error(s)") + for error in list(new_errors)[:3]: + truncated = error[:250] + "..." if len(error) > 250 else error + print(f" - {truncated}") + return False, new_errors + else: + # All errors existed in original + if verbose: + print( + f"PASSED - No new errors (original had {len(current_errors)} errors)" + ) + return True, set() + + def validate_against_xsd(self): + """Validate XML files against XSD schemas, showing only new errors compared to original.""" + new_errors = [] + original_error_count = 0 + valid_count = 0 + skipped_count = 0 + + for xml_file in self.xml_files: + relative_path = str(xml_file.relative_to(self.unpacked_dir)) + is_valid, new_file_errors = self.validate_file_against_xsd( + xml_file, verbose=False + ) + + if is_valid is None: + skipped_count += 1 + continue + elif is_valid and not new_file_errors: + valid_count += 1 + continue + elif is_valid: + # Had errors but all existed in original + original_error_count += 1 + valid_count += 1 + continue + + # Has new errors + new_errors.append(f" {relative_path}: {len(new_file_errors)} new error(s)") + for error in list(new_file_errors)[:3]: # Show first 3 errors + new_errors.append( + f" - {error[:250]}..." if len(error) > 250 else f" - {error}" + ) + + # Print summary + if self.verbose: + print(f"Validated {len(self.xml_files)} files:") + print(f" - Valid: {valid_count}") + print(f" - Skipped (no schema): {skipped_count}") + if original_error_count: + print(f" - With original errors (ignored): {original_error_count}") + print( + f" - With NEW errors: {len(new_errors) > 0 and len([e for e in new_errors if not e.startswith(' ')]) or 0}" + ) + + if new_errors: + print("\nFAILED - Found NEW validation errors:") + for error in new_errors: + print(error) + return False + else: + if self.verbose: + print("\nPASSED - No new XSD validation errors introduced") + return True + + def _get_schema_path(self, xml_file): + """Determine the appropriate schema path for an XML file.""" + # Check exact filename match + if xml_file.name in self.SCHEMA_MAPPINGS: + return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name] + + # Check .rels files + if xml_file.suffix == ".rels": + return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"] + + # Check chart files + if "charts/" in str(xml_file) and xml_file.name.startswith("chart"): + return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"] + + # Check theme files + if "theme/" in str(xml_file) and xml_file.name.startswith("theme"): + return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"] + + # Check if file is in a main content folder and use appropriate schema + if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS: + return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name] + + return None + + def _clean_ignorable_namespaces(self, xml_doc): + """Remove attributes and elements not in allowed namespaces.""" + # Create a clean copy + xml_string = lxml.etree.tostring(xml_doc, encoding="unicode") + xml_copy = lxml.etree.fromstring(xml_string) + + # Remove attributes not in allowed namespaces + for elem in xml_copy.iter(): + attrs_to_remove = [] + + for attr in elem.attrib: + # Check if attribute is from a namespace other than allowed ones + if "{" in attr: + ns = attr.split("}")[0][1:] + if ns not in self.OOXML_NAMESPACES: + attrs_to_remove.append(attr) + + # Remove collected attributes + for attr in attrs_to_remove: + del elem.attrib[attr] + + # Remove elements not in allowed namespaces + self._remove_ignorable_elements(xml_copy) + + return lxml.etree.ElementTree(xml_copy) + + def _remove_ignorable_elements(self, root): + """Recursively remove all elements not in allowed namespaces.""" + elements_to_remove = [] + + # Find elements to remove + for elem in list(root): + # Skip non-element nodes (comments, processing instructions, etc.) + if not hasattr(elem, "tag") or callable(elem.tag): + continue + + tag_str = str(elem.tag) + if tag_str.startswith("{"): + ns = tag_str.split("}")[0][1:] + if ns not in self.OOXML_NAMESPACES: + elements_to_remove.append(elem) + continue + + # Recursively clean child elements + self._remove_ignorable_elements(elem) + + # Remove collected elements + for elem in elements_to_remove: + root.remove(elem) + + def _preprocess_for_mc_ignorable(self, xml_doc): + """Preprocess XML to handle mc:Ignorable attribute properly.""" + # Remove mc:Ignorable attributes before validation + root = xml_doc.getroot() + + # Remove mc:Ignorable attribute from root + if f"{{{self.MC_NAMESPACE}}}Ignorable" in root.attrib: + del root.attrib[f"{{{self.MC_NAMESPACE}}}Ignorable"] + + return xml_doc + + def _validate_single_file_xsd(self, xml_file, base_path): + """Validate a single XML file against XSD schema. Returns (is_valid, errors_set).""" + schema_path = self._get_schema_path(xml_file) + if not schema_path: + return None, None # Skip file + + try: + # Load schema + with open(schema_path, "rb") as xsd_file: + parser = lxml.etree.XMLParser() + xsd_doc = lxml.etree.parse( + xsd_file, parser=parser, base_url=str(schema_path) + ) + schema = lxml.etree.XMLSchema(xsd_doc) + + # Load and preprocess XML + with open(xml_file, "r") as f: + xml_doc = lxml.etree.parse(f) + + xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc) + xml_doc = self._preprocess_for_mc_ignorable(xml_doc) + + # Clean ignorable namespaces if needed + relative_path = xml_file.relative_to(base_path) + if ( + relative_path.parts + and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS + ): + xml_doc = self._clean_ignorable_namespaces(xml_doc) + + # Validate + if schema.validate(xml_doc): + return True, set() + else: + errors = set() + for error in schema.error_log: + # Store normalized error message (without line numbers for comparison) + errors.add(error.message) + return False, errors + + except Exception as e: + return False, {str(e)} + + def _get_original_file_errors(self, xml_file): + """Get XSD validation errors from a single file in the original document. + + Args: + xml_file: Path to the XML file in unpacked_dir to check + + Returns: + set: Set of error messages from the original file + """ + import tempfile + import zipfile + + # Resolve both paths to handle symlinks (e.g., /var vs /private/var on macOS) + xml_file = Path(xml_file).resolve() + unpacked_dir = self.unpacked_dir.resolve() + relative_path = xml_file.relative_to(unpacked_dir) + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Extract original file + with zipfile.ZipFile(self.original_file, "r") as zip_ref: + zip_ref.extractall(temp_path) + + # Find corresponding file in original + original_xml_file = temp_path / relative_path + + if not original_xml_file.exists(): + # File didn't exist in original, so no original errors + return set() + + # Validate the specific file in original + is_valid, errors = self._validate_single_file_xsd( + original_xml_file, temp_path + ) + return errors if errors else set() + + def _remove_template_tags_from_text_nodes(self, xml_doc): + """Remove template tags from XML text nodes and collect warnings. + + Template tags follow the pattern {{ ... }} and are used as placeholders + for content replacement. They should be removed from text content before + XSD validation while preserving XML structure. + + Returns: + tuple: (cleaned_xml_doc, warnings_list) + """ + warnings = [] + template_pattern = re.compile(r"\{\{[^}]*\}\}") + + # Create a copy of the document to avoid modifying the original + xml_string = lxml.etree.tostring(xml_doc, encoding="unicode") + xml_copy = lxml.etree.fromstring(xml_string) + + def process_text_content(text, content_type): + if not text: + return text + matches = list(template_pattern.finditer(text)) + if matches: + for match in matches: + warnings.append( + f"Found template tag in {content_type}: {match.group()}" + ) + return template_pattern.sub("", text) + return text + + # Process all text nodes in the document + for elem in xml_copy.iter(): + # Skip processing if this is a w:t element + if not hasattr(elem, "tag") or callable(elem.tag): + continue + tag_str = str(elem.tag) + if tag_str.endswith("}t") or tag_str == "t": + continue + + elem.text = process_text_content(elem.text, "text content") + elem.tail = process_text_content(elem.tail, "tail content") + + return lxml.etree.ElementTree(xml_copy), warnings + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/skills/pptx/ooxml/scripts/validation/docx.py b/skills/pptx/ooxml/scripts/validation/docx.py new file mode 100644 index 0000000..602c470 --- /dev/null +++ b/skills/pptx/ooxml/scripts/validation/docx.py @@ -0,0 +1,274 @@ +""" +Validator for Word document XML files against XSD schemas. +""" + +import re +import tempfile +import zipfile + +import lxml.etree + +from .base import BaseSchemaValidator + + +class DOCXSchemaValidator(BaseSchemaValidator): + """Validator for Word document XML files against XSD schemas.""" + + # Word-specific namespace + WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + + # Word-specific element to relationship type mappings + # Start with empty mapping - add specific cases as we discover them + ELEMENT_RELATIONSHIP_TYPES = {} + + def validate(self): + """Run all validation checks and return True if all pass.""" + # Test 0: XML well-formedness + if not self.validate_xml(): + return False + + # Test 1: Namespace declarations + all_valid = True + if not self.validate_namespaces(): + all_valid = False + + # Test 2: Unique IDs + if not self.validate_unique_ids(): + all_valid = False + + # Test 3: Relationship and file reference validation + if not self.validate_file_references(): + all_valid = False + + # Test 4: Content type declarations + if not self.validate_content_types(): + all_valid = False + + # Test 5: XSD schema validation + if not self.validate_against_xsd(): + all_valid = False + + # Test 6: Whitespace preservation + if not self.validate_whitespace_preservation(): + all_valid = False + + # Test 7: Deletion validation + if not self.validate_deletions(): + all_valid = False + + # Test 8: Insertion validation + if not self.validate_insertions(): + all_valid = False + + # Test 9: Relationship ID reference validation + if not self.validate_all_relationship_ids(): + all_valid = False + + # Count and compare paragraphs + self.compare_paragraph_counts() + + return all_valid + + def validate_whitespace_preservation(self): + """ + Validate that w:t elements with whitespace have xml:space='preserve'. + """ + errors = [] + + for xml_file in self.xml_files: + # Only check document.xml files + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + # Find all w:t elements + for elem in root.iter(f"{{{self.WORD_2006_NAMESPACE}}}t"): + if elem.text: + text = elem.text + # Check if text starts or ends with whitespace + if re.match(r"^\s.*", text) or re.match(r".*\s$", text): + # Check if xml:space="preserve" attribute exists + xml_space_attr = f"{{{self.XML_NAMESPACE}}}space" + if ( + xml_space_attr not in elem.attrib + or elem.attrib[xml_space_attr] != "preserve" + ): + # Show a preview of the text + text_preview = ( + repr(text)[:50] + "..." + if len(repr(text)) > 50 + else repr(text) + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: w:t element with whitespace missing xml:space='preserve': {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} whitespace preservation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All whitespace is properly preserved") + return True + + def validate_deletions(self): + """ + Validate that w:t elements are not within w:del elements. + For some reason, XSD validation does not catch this, so we do it manually. + """ + errors = [] + + for xml_file in self.xml_files: + # Only check document.xml files + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + # Find all w:t elements that are descendants of w:del elements + namespaces = {"w": self.WORD_2006_NAMESPACE} + xpath_expression = ".//w:del//w:t" + problematic_t_elements = root.xpath( + xpath_expression, namespaces=namespaces + ) + for t_elem in problematic_t_elements: + if t_elem.text: + # Show a preview of the text + text_preview = ( + repr(t_elem.text)[:50] + "..." + if len(repr(t_elem.text)) > 50 + else repr(t_elem.text) + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {t_elem.sourceline}: found within : {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} deletion validation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - No w:t elements found within w:del elements") + return True + + def count_paragraphs_in_unpacked(self): + """Count the number of paragraphs in the unpacked document.""" + count = 0 + + for xml_file in self.xml_files: + # Only check document.xml files + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + # Count all w:p elements + paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p") + count = len(paragraphs) + except Exception as e: + print(f"Error counting paragraphs in unpacked document: {e}") + + return count + + def count_paragraphs_in_original(self): + """Count the number of paragraphs in the original docx file.""" + count = 0 + + try: + # Create temporary directory to unpack original + with tempfile.TemporaryDirectory() as temp_dir: + # Unpack original docx + with zipfile.ZipFile(self.original_file, "r") as zip_ref: + zip_ref.extractall(temp_dir) + + # Parse document.xml + doc_xml_path = temp_dir + "/word/document.xml" + root = lxml.etree.parse(doc_xml_path).getroot() + + # Count all w:p elements + paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p") + count = len(paragraphs) + + except Exception as e: + print(f"Error counting paragraphs in original document: {e}") + + return count + + def validate_insertions(self): + """ + Validate that w:delText elements are not within w:ins elements. + w:delText is only allowed in w:ins if nested within a w:del. + """ + errors = [] + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + namespaces = {"w": self.WORD_2006_NAMESPACE} + + # Find w:delText in w:ins that are NOT within w:del + invalid_elements = root.xpath( + ".//w:ins//w:delText[not(ancestor::w:del)]", + namespaces=namespaces + ) + + for elem in invalid_elements: + text_preview = ( + repr(elem.text or "")[:50] + "..." + if len(repr(elem.text or "")) > 50 + else repr(elem.text or "") + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: within : {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} insertion validation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - No w:delText elements within w:ins elements") + return True + + def compare_paragraph_counts(self): + """Compare paragraph counts between original and new document.""" + original_count = self.count_paragraphs_in_original() + new_count = self.count_paragraphs_in_unpacked() + + diff = new_count - original_count + diff_str = f"+{diff}" if diff > 0 else str(diff) + print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})") + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/skills/pptx/ooxml/scripts/validation/pptx.py b/skills/pptx/ooxml/scripts/validation/pptx.py new file mode 100644 index 0000000..66d5b1e --- /dev/null +++ b/skills/pptx/ooxml/scripts/validation/pptx.py @@ -0,0 +1,315 @@ +""" +Validator for PowerPoint presentation XML files against XSD schemas. +""" + +import re + +from .base import BaseSchemaValidator + + +class PPTXSchemaValidator(BaseSchemaValidator): + """Validator for PowerPoint presentation XML files against XSD schemas.""" + + # PowerPoint presentation namespace + PRESENTATIONML_NAMESPACE = ( + "http://schemas.openxmlformats.org/presentationml/2006/main" + ) + + # PowerPoint-specific element to relationship type mappings + ELEMENT_RELATIONSHIP_TYPES = { + "sldid": "slide", + "sldmasterid": "slidemaster", + "notesmasterid": "notesmaster", + "sldlayoutid": "slidelayout", + "themeid": "theme", + "tablestyleid": "tablestyles", + } + + def validate(self): + """Run all validation checks and return True if all pass.""" + # Test 0: XML well-formedness + if not self.validate_xml(): + return False + + # Test 1: Namespace declarations + all_valid = True + if not self.validate_namespaces(): + all_valid = False + + # Test 2: Unique IDs + if not self.validate_unique_ids(): + all_valid = False + + # Test 3: UUID ID validation + if not self.validate_uuid_ids(): + all_valid = False + + # Test 4: Relationship and file reference validation + if not self.validate_file_references(): + all_valid = False + + # Test 5: Slide layout ID validation + if not self.validate_slide_layout_ids(): + all_valid = False + + # Test 6: Content type declarations + if not self.validate_content_types(): + all_valid = False + + # Test 7: XSD schema validation + if not self.validate_against_xsd(): + all_valid = False + + # Test 8: Notes slide reference validation + if not self.validate_notes_slide_references(): + all_valid = False + + # Test 9: Relationship ID reference validation + if not self.validate_all_relationship_ids(): + all_valid = False + + # Test 10: Duplicate slide layout references validation + if not self.validate_no_duplicate_slide_layouts(): + all_valid = False + + return all_valid + + def validate_uuid_ids(self): + """Validate that ID attributes that look like UUIDs contain only hex values.""" + import lxml.etree + + errors = [] + # UUID pattern: 8-4-4-4-12 hex digits with optional braces/hyphens + uuid_pattern = re.compile( + r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$" + ) + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + # Check all elements for ID attributes + for elem in root.iter(): + for attr, value in elem.attrib.items(): + # Check if this is an ID attribute + attr_name = attr.split("}")[-1].lower() + if attr_name == "id" or attr_name.endswith("id"): + # Check if value looks like a UUID (has the right length and pattern structure) + if self._looks_like_uuid(value): + # Validate that it contains only hex characters in the right positions + if not uuid_pattern.match(value): + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: ID '{value}' appears to be a UUID but contains invalid hex characters" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} UUID ID validation errors:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All UUID-like IDs contain valid hex values") + return True + + def _looks_like_uuid(self, value): + """Check if a value has the general structure of a UUID.""" + # Remove common UUID delimiters + clean_value = value.strip("{}()").replace("-", "") + # Check if it's 32 hex-like characters (could include invalid hex chars) + return len(clean_value) == 32 and all(c.isalnum() for c in clean_value) + + def validate_slide_layout_ids(self): + """Validate that sldLayoutId elements in slide masters reference valid slide layouts.""" + import lxml.etree + + errors = [] + + # Find all slide master files + slide_masters = list(self.unpacked_dir.glob("ppt/slideMasters/*.xml")) + + if not slide_masters: + if self.verbose: + print("PASSED - No slide masters found") + return True + + for slide_master in slide_masters: + try: + # Parse the slide master file + root = lxml.etree.parse(str(slide_master)).getroot() + + # Find the corresponding _rels file for this slide master + rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels" + + if not rels_file.exists(): + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: " + f"Missing relationships file: {rels_file.relative_to(self.unpacked_dir)}" + ) + continue + + # Parse the relationships file + rels_root = lxml.etree.parse(str(rels_file)).getroot() + + # Build a set of valid relationship IDs that point to slide layouts + valid_layout_rids = set() + for rel in rels_root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rel_type = rel.get("Type", "") + if "slideLayout" in rel_type: + valid_layout_rids.add(rel.get("Id")) + + # Find all sldLayoutId elements in the slide master + for sld_layout_id in root.findall( + f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId" + ): + r_id = sld_layout_id.get( + f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id" + ) + layout_id = sld_layout_id.get("id") + + if r_id and r_id not in valid_layout_rids: + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: " + f"Line {sld_layout_id.sourceline}: sldLayoutId with id='{layout_id}' " + f"references r:id='{r_id}' which is not found in slide layout relationships" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} slide layout ID validation errors:") + for error in errors: + print(error) + print( + "Remove invalid references or add missing slide layouts to the relationships file." + ) + return False + else: + if self.verbose: + print("PASSED - All slide layout IDs reference valid slide layouts") + return True + + def validate_no_duplicate_slide_layouts(self): + """Validate that each slide has exactly one slideLayout reference.""" + import lxml.etree + + errors = [] + slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels")) + + for rels_file in slide_rels_files: + try: + root = lxml.etree.parse(str(rels_file)).getroot() + + # Find all slideLayout relationships + layout_rels = [ + rel + for rel in root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ) + if "slideLayout" in rel.get("Type", "") + ] + + if len(layout_rels) > 1: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: has {len(layout_rels)} slideLayout references" + ) + + except Exception as e: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print("FAILED - Found slides with duplicate slideLayout references:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All slides have exactly one slideLayout reference") + return True + + def validate_notes_slide_references(self): + """Validate that each notesSlide file is referenced by only one slide.""" + import lxml.etree + + errors = [] + notes_slide_references = {} # Track which slides reference each notesSlide + + # Find all slide relationship files + slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels")) + + if not slide_rels_files: + if self.verbose: + print("PASSED - No slide relationship files found") + return True + + for rels_file in slide_rels_files: + try: + # Parse the relationships file + root = lxml.etree.parse(str(rels_file)).getroot() + + # Find all notesSlide relationships + for rel in root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rel_type = rel.get("Type", "") + if "notesSlide" in rel_type: + target = rel.get("Target", "") + if target: + # Normalize the target path to handle relative paths + normalized_target = target.replace("../", "") + + # Track which slide references this notesSlide + slide_name = rels_file.stem.replace( + ".xml", "" + ) # e.g., "slide1" + + if normalized_target not in notes_slide_references: + notes_slide_references[normalized_target] = [] + notes_slide_references[normalized_target].append( + (slide_name, rels_file) + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + # Check for duplicate references + for target, references in notes_slide_references.items(): + if len(references) > 1: + slide_names = [ref[0] for ref in references] + errors.append( + f" Notes slide '{target}' is referenced by multiple slides: {', '.join(slide_names)}" + ) + for slide_name, rels_file in references: + errors.append(f" - {rels_file.relative_to(self.unpacked_dir)}") + + if errors: + print( + f"FAILED - Found {len([e for e in errors if not e.startswith(' ')])} notes slide reference validation errors:" + ) + for error in errors: + print(error) + print("Each slide may optionally have its own slide file.") + return False + else: + if self.verbose: + print("PASSED - All notes slide references are unique") + return True + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/skills/pptx/ooxml/scripts/validation/redlining.py b/skills/pptx/ooxml/scripts/validation/redlining.py new file mode 100644 index 0000000..7ed425e --- /dev/null +++ b/skills/pptx/ooxml/scripts/validation/redlining.py @@ -0,0 +1,279 @@ +""" +Validator for tracked changes in Word documents. +""" + +import subprocess +import tempfile +import zipfile +from pathlib import Path + + +class RedliningValidator: + """Validator for tracked changes in Word documents.""" + + def __init__(self, unpacked_dir, original_docx, verbose=False): + self.unpacked_dir = Path(unpacked_dir) + self.original_docx = Path(original_docx) + self.verbose = verbose + self.namespaces = { + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + } + + def validate(self): + """Main validation method that returns True if valid, False otherwise.""" + # Verify unpacked directory exists and has correct structure + modified_file = self.unpacked_dir / "word" / "document.xml" + if not modified_file.exists(): + print(f"FAILED - Modified document.xml not found at {modified_file}") + return False + + # First, check if there are any tracked changes by Claude to validate + try: + import xml.etree.ElementTree as ET + + tree = ET.parse(modified_file) + root = tree.getroot() + + # Check for w:del or w:ins tags authored by Claude + del_elements = root.findall(".//w:del", self.namespaces) + ins_elements = root.findall(".//w:ins", self.namespaces) + + # Filter to only include changes by Claude + claude_del_elements = [ + elem + for elem in del_elements + if elem.get(f"{{{self.namespaces['w']}}}author") == "Claude" + ] + claude_ins_elements = [ + elem + for elem in ins_elements + if elem.get(f"{{{self.namespaces['w']}}}author") == "Claude" + ] + + # Redlining validation is only needed if tracked changes by Claude have been used. + if not claude_del_elements and not claude_ins_elements: + if self.verbose: + print("PASSED - No tracked changes by Claude found.") + return True + + except Exception: + # If we can't parse the XML, continue with full validation + pass + + # Create temporary directory for unpacking original docx + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Unpack original docx + try: + with zipfile.ZipFile(self.original_docx, "r") as zip_ref: + zip_ref.extractall(temp_path) + except Exception as e: + print(f"FAILED - Error unpacking original docx: {e}") + return False + + original_file = temp_path / "word" / "document.xml" + if not original_file.exists(): + print( + f"FAILED - Original document.xml not found in {self.original_docx}" + ) + return False + + # Parse both XML files using xml.etree.ElementTree for redlining validation + try: + import xml.etree.ElementTree as ET + + modified_tree = ET.parse(modified_file) + modified_root = modified_tree.getroot() + original_tree = ET.parse(original_file) + original_root = original_tree.getroot() + except ET.ParseError as e: + print(f"FAILED - Error parsing XML files: {e}") + return False + + # Remove Claude's tracked changes from both documents + self._remove_claude_tracked_changes(original_root) + self._remove_claude_tracked_changes(modified_root) + + # Extract and compare text content + modified_text = self._extract_text_content(modified_root) + original_text = self._extract_text_content(original_root) + + if modified_text != original_text: + # Show detailed character-level differences for each paragraph + error_message = self._generate_detailed_diff( + original_text, modified_text + ) + print(error_message) + return False + + if self.verbose: + print("PASSED - All changes by Claude are properly tracked") + return True + + def _generate_detailed_diff(self, original_text, modified_text): + """Generate detailed word-level differences using git word diff.""" + error_parts = [ + "FAILED - Document text doesn't match after removing Claude's tracked changes", + "", + "Likely causes:", + " 1. Modified text inside another author's or tags", + " 2. Made edits without proper tracked changes", + " 3. Didn't nest inside when deleting another's insertion", + "", + "For pre-redlined documents, use correct patterns:", + " - To reject another's INSERTION: Nest inside their ", + " - To restore another's DELETION: Add new AFTER their ", + "", + ] + + # Show git word diff + git_diff = self._get_git_word_diff(original_text, modified_text) + if git_diff: + error_parts.extend(["Differences:", "============", git_diff]) + else: + error_parts.append("Unable to generate word diff (git not available)") + + return "\n".join(error_parts) + + def _get_git_word_diff(self, original_text, modified_text): + """Generate word diff using git with character-level precision.""" + try: + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Create two files + original_file = temp_path / "original.txt" + modified_file = temp_path / "modified.txt" + + original_file.write_text(original_text, encoding="utf-8") + modified_file.write_text(modified_text, encoding="utf-8") + + # Try character-level diff first for precise differences + result = subprocess.run( + [ + "git", + "diff", + "--word-diff=plain", + "--word-diff-regex=.", # Character-by-character diff + "-U0", # Zero lines of context - show only changed lines + "--no-index", + str(original_file), + str(modified_file), + ], + capture_output=True, + text=True, + ) + + if result.stdout.strip(): + # Clean up the output - remove git diff header lines + lines = result.stdout.split("\n") + # Skip the header lines (diff --git, index, +++, ---, @@) + content_lines = [] + in_content = False + for line in lines: + if line.startswith("@@"): + in_content = True + continue + if in_content and line.strip(): + content_lines.append(line) + + if content_lines: + return "\n".join(content_lines) + + # Fallback to word-level diff if character-level is too verbose + result = subprocess.run( + [ + "git", + "diff", + "--word-diff=plain", + "-U0", # Zero lines of context + "--no-index", + str(original_file), + str(modified_file), + ], + capture_output=True, + text=True, + ) + + if result.stdout.strip(): + lines = result.stdout.split("\n") + content_lines = [] + in_content = False + for line in lines: + if line.startswith("@@"): + in_content = True + continue + if in_content and line.strip(): + content_lines.append(line) + return "\n".join(content_lines) + + except (subprocess.CalledProcessError, FileNotFoundError, Exception): + # Git not available or other error, return None to use fallback + pass + + return None + + def _remove_claude_tracked_changes(self, root): + """Remove tracked changes authored by Claude from the XML root.""" + ins_tag = f"{{{self.namespaces['w']}}}ins" + del_tag = f"{{{self.namespaces['w']}}}del" + author_attr = f"{{{self.namespaces['w']}}}author" + + # Remove w:ins elements + for parent in root.iter(): + to_remove = [] + for child in parent: + if child.tag == ins_tag and child.get(author_attr) == "Claude": + to_remove.append(child) + for elem in to_remove: + parent.remove(elem) + + # Unwrap content in w:del elements where author is "Claude" + deltext_tag = f"{{{self.namespaces['w']}}}delText" + t_tag = f"{{{self.namespaces['w']}}}t" + + for parent in root.iter(): + to_process = [] + for child in parent: + if child.tag == del_tag and child.get(author_attr) == "Claude": + to_process.append((child, list(parent).index(child))) + + # Process in reverse order to maintain indices + for del_elem, del_index in reversed(to_process): + # Convert w:delText to w:t before moving + for elem in del_elem.iter(): + if elem.tag == deltext_tag: + elem.tag = t_tag + + # Move all children of w:del to its parent before removing w:del + for child in reversed(list(del_elem)): + parent.insert(del_index, child) + parent.remove(del_elem) + + def _extract_text_content(self, root): + """Extract text content from Word XML, preserving paragraph structure. + + Empty paragraphs are skipped to avoid false positives when tracked + insertions add only structural elements without text content. + """ + p_tag = f"{{{self.namespaces['w']}}}p" + t_tag = f"{{{self.namespaces['w']}}}t" + + paragraphs = [] + for p_elem in root.findall(f".//{p_tag}"): + # Get all text elements within this paragraph + text_parts = [] + for t_elem in p_elem.findall(f".//{t_tag}"): + if t_elem.text: + text_parts.append(t_elem.text) + paragraph_text = "".join(text_parts) + # Skip empty paragraphs - they don't affect content validation + if paragraph_text: + paragraphs.append(paragraph_text) + + return "\n".join(paragraphs) + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/skills/pptx/scripts/inventory.py b/skills/pptx/scripts/inventory.py new file mode 100644 index 0000000..edda390 --- /dev/null +++ b/skills/pptx/scripts/inventory.py @@ -0,0 +1,1020 @@ +#!/usr/bin/env python3 +""" +Extract structured text content from PowerPoint presentations. + +This module provides functionality to: +- Extract all text content from PowerPoint shapes +- Preserve paragraph formatting (alignment, bullets, fonts, spacing) +- Handle nested GroupShapes recursively with correct absolute positions +- Sort shapes by visual position on slides +- Filter out slide numbers and non-content placeholders +- Export to JSON with clean, structured data + +Classes: + ParagraphData: Represents a text paragraph with formatting + ShapeData: Represents a shape with position and text content + +Main Functions: + extract_text_inventory: Extract all text from a presentation + save_inventory: Save extracted data to JSON + +Usage: + python inventory.py input.pptx output.json +""" + +import argparse +import json +import platform +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +from PIL import Image, ImageDraw, ImageFont +from pptx import Presentation +from pptx.enum.text import PP_ALIGN +from pptx.shapes.base import BaseShape + +# Type aliases for cleaner signatures +JsonValue = Union[str, int, float, bool, None] +ParagraphDict = Dict[str, JsonValue] +ShapeDict = Dict[ + str, Union[str, float, bool, List[ParagraphDict], List[str], Dict[str, Any], None] +] +InventoryData = Dict[ + str, Dict[str, "ShapeData"] +] # Dict of slide_id -> {shape_id -> ShapeData} +InventoryDict = Dict[str, Dict[str, ShapeDict]] # JSON-serializable inventory + + +def main(): + """Main entry point for command-line usage.""" + parser = argparse.ArgumentParser( + description="Extract text inventory from PowerPoint with proper GroupShape support.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python inventory.py presentation.pptx inventory.json + Extracts text inventory with correct absolute positions for grouped shapes + + python inventory.py presentation.pptx inventory.json --issues-only + Extracts only text shapes that have overflow or overlap issues + +The output JSON includes: + - All text content organized by slide and shape + - Correct absolute positions for shapes in groups + - Visual position and size in inches + - Paragraph properties and formatting + - Issue detection: text overflow and shape overlaps + """, + ) + + parser.add_argument("input", help="Input PowerPoint file (.pptx)") + parser.add_argument("output", help="Output JSON file for inventory") + parser.add_argument( + "--issues-only", + action="store_true", + help="Include only text shapes that have overflow or overlap issues", + ) + + args = parser.parse_args() + + input_path = Path(args.input) + if not input_path.exists(): + print(f"Error: Input file not found: {args.input}") + sys.exit(1) + + if not input_path.suffix.lower() == ".pptx": + print("Error: Input must be a PowerPoint file (.pptx)") + sys.exit(1) + + try: + print(f"Extracting text inventory from: {args.input}") + if args.issues_only: + print( + "Filtering to include only text shapes with issues (overflow/overlap)" + ) + inventory = extract_text_inventory(input_path, issues_only=args.issues_only) + + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + save_inventory(inventory, output_path) + + print(f"Output saved to: {args.output}") + + # Report statistics + total_slides = len(inventory) + total_shapes = sum(len(shapes) for shapes in inventory.values()) + if args.issues_only: + if total_shapes > 0: + print( + f"Found {total_shapes} text elements with issues in {total_slides} slides" + ) + else: + print("No issues discovered") + else: + print( + f"Found text in {total_slides} slides with {total_shapes} text elements" + ) + + except Exception as e: + print(f"Error processing presentation: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +@dataclass +class ShapeWithPosition: + """A shape with its absolute position on the slide.""" + + shape: BaseShape + absolute_left: int # in EMUs + absolute_top: int # in EMUs + + +class ParagraphData: + """Data structure for paragraph properties extracted from a PowerPoint paragraph.""" + + def __init__(self, paragraph: Any): + """Initialize from a PowerPoint paragraph object. + + Args: + paragraph: The PowerPoint paragraph object + """ + self.text: str = paragraph.text.strip() + self.bullet: bool = False + self.level: Optional[int] = None + self.alignment: Optional[str] = None + self.space_before: Optional[float] = None + self.space_after: Optional[float] = None + self.font_name: Optional[str] = None + self.font_size: Optional[float] = None + self.bold: Optional[bool] = None + self.italic: Optional[bool] = None + self.underline: Optional[bool] = None + self.color: Optional[str] = None + self.theme_color: Optional[str] = None + self.line_spacing: Optional[float] = None + + # Check for bullet formatting + if ( + hasattr(paragraph, "_p") + and paragraph._p is not None + and paragraph._p.pPr is not None + ): + pPr = paragraph._p.pPr + ns = "{http://schemas.openxmlformats.org/drawingml/2006/main}" + if ( + pPr.find(f"{ns}buChar") is not None + or pPr.find(f"{ns}buAutoNum") is not None + ): + self.bullet = True + if hasattr(paragraph, "level"): + self.level = paragraph.level + + # Add alignment if not LEFT (default) + if hasattr(paragraph, "alignment") and paragraph.alignment is not None: + alignment_map = { + PP_ALIGN.CENTER: "CENTER", + PP_ALIGN.RIGHT: "RIGHT", + PP_ALIGN.JUSTIFY: "JUSTIFY", + } + if paragraph.alignment in alignment_map: + self.alignment = alignment_map[paragraph.alignment] + + # Add spacing properties if set + if hasattr(paragraph, "space_before") and paragraph.space_before: + self.space_before = paragraph.space_before.pt + if hasattr(paragraph, "space_after") and paragraph.space_after: + self.space_after = paragraph.space_after.pt + + # Extract font properties from first run + if paragraph.runs: + first_run = paragraph.runs[0] + if hasattr(first_run, "font"): + font = first_run.font + if font.name: + self.font_name = font.name + if font.size: + self.font_size = font.size.pt + if font.bold is not None: + self.bold = font.bold + if font.italic is not None: + self.italic = font.italic + if font.underline is not None: + self.underline = font.underline + + # Handle color - both RGB and theme colors + try: + # Try RGB color first + if font.color.rgb: + self.color = str(font.color.rgb) + except (AttributeError, TypeError): + # Fall back to theme color + try: + if font.color.theme_color: + self.theme_color = font.color.theme_color.name + except (AttributeError, TypeError): + pass + + # Add line spacing if set + if hasattr(paragraph, "line_spacing") and paragraph.line_spacing is not None: + if hasattr(paragraph.line_spacing, "pt"): + self.line_spacing = round(paragraph.line_spacing.pt, 2) + else: + # Multiplier - convert to points + font_size = self.font_size if self.font_size else 12.0 + self.line_spacing = round(paragraph.line_spacing * font_size, 2) + + def to_dict(self) -> ParagraphDict: + """Convert to dictionary for JSON serialization, excluding None values.""" + result: ParagraphDict = {"text": self.text} + + # Add optional fields only if they have values + if self.bullet: + result["bullet"] = self.bullet + if self.level is not None: + result["level"] = self.level + if self.alignment: + result["alignment"] = self.alignment + if self.space_before is not None: + result["space_before"] = self.space_before + if self.space_after is not None: + result["space_after"] = self.space_after + if self.font_name: + result["font_name"] = self.font_name + if self.font_size is not None: + result["font_size"] = self.font_size + if self.bold is not None: + result["bold"] = self.bold + if self.italic is not None: + result["italic"] = self.italic + if self.underline is not None: + result["underline"] = self.underline + if self.color: + result["color"] = self.color + if self.theme_color: + result["theme_color"] = self.theme_color + if self.line_spacing is not None: + result["line_spacing"] = self.line_spacing + + return result + + +class ShapeData: + """Data structure for shape properties extracted from a PowerPoint shape.""" + + @staticmethod + def emu_to_inches(emu: int) -> float: + """Convert EMUs (English Metric Units) to inches.""" + return emu / 914400.0 + + @staticmethod + def inches_to_pixels(inches: float, dpi: int = 96) -> int: + """Convert inches to pixels at given DPI.""" + return int(inches * dpi) + + @staticmethod + def get_font_path(font_name: str) -> Optional[str]: + """Get the font file path for a given font name. + + Args: + font_name: Name of the font (e.g., 'Arial', 'Calibri') + + Returns: + Path to the font file, or None if not found + """ + system = platform.system() + + # Common font file variations to try + font_variations = [ + font_name, + font_name.lower(), + font_name.replace(" ", ""), + font_name.replace(" ", "-"), + ] + + # Define font directories and extensions by platform + if system == "Darwin": # macOS + font_dirs = [ + "/System/Library/Fonts/", + "/Library/Fonts/", + "~/Library/Fonts/", + ] + extensions = [".ttf", ".otf", ".ttc", ".dfont"] + else: # Linux + font_dirs = [ + "/usr/share/fonts/truetype/", + "/usr/local/share/fonts/", + "~/.fonts/", + ] + extensions = [".ttf", ".otf"] + + # Try to find the font file + from pathlib import Path + + for font_dir in font_dirs: + font_dir_path = Path(font_dir).expanduser() + if not font_dir_path.exists(): + continue + + # First try exact matches + for variant in font_variations: + for ext in extensions: + font_path = font_dir_path / f"{variant}{ext}" + if font_path.exists(): + return str(font_path) + + # Then try fuzzy matching - find files containing the font name + try: + for file_path in font_dir_path.iterdir(): + if file_path.is_file(): + file_name_lower = file_path.name.lower() + font_name_lower = font_name.lower().replace(" ", "") + if font_name_lower in file_name_lower and any( + file_name_lower.endswith(ext) for ext in extensions + ): + return str(file_path) + except (OSError, PermissionError): + continue + + return None + + @staticmethod + def get_slide_dimensions(slide: Any) -> tuple[Optional[int], Optional[int]]: + """Get slide dimensions from slide object. + + Args: + slide: Slide object + + Returns: + Tuple of (width_emu, height_emu) or (None, None) if not found + """ + try: + prs = slide.part.package.presentation_part.presentation + return prs.slide_width, prs.slide_height + except (AttributeError, TypeError): + return None, None + + @staticmethod + def get_default_font_size(shape: BaseShape, slide_layout: Any) -> Optional[float]: + """Extract default font size from slide layout for a placeholder shape. + + Args: + shape: Placeholder shape + slide_layout: Slide layout containing the placeholder definition + + Returns: + Default font size in points, or None if not found + """ + try: + if not hasattr(shape, "placeholder_format"): + return None + + shape_type = shape.placeholder_format.type # type: ignore + for layout_placeholder in slide_layout.placeholders: + if layout_placeholder.placeholder_format.type == shape_type: + # Find first defRPr element with sz (size) attribute + for elem in layout_placeholder.element.iter(): + if "defRPr" in elem.tag and (sz := elem.get("sz")): + return float(sz) / 100.0 # Convert EMUs to points + break + except Exception: + pass + return None + + def __init__( + self, + shape: BaseShape, + absolute_left: Optional[int] = None, + absolute_top: Optional[int] = None, + slide: Optional[Any] = None, + ): + """Initialize from a PowerPoint shape object. + + Args: + shape: The PowerPoint shape object (should be pre-validated) + absolute_left: Absolute left position in EMUs (for shapes in groups) + absolute_top: Absolute top position in EMUs (for shapes in groups) + slide: Optional slide object to get dimensions and layout information + """ + self.shape = shape # Store reference to original shape + self.shape_id: str = "" # Will be set after sorting + + # Get slide dimensions from slide object + self.slide_width_emu, self.slide_height_emu = ( + self.get_slide_dimensions(slide) if slide else (None, None) + ) + + # Get placeholder type if applicable + self.placeholder_type: Optional[str] = None + self.default_font_size: Optional[float] = None + if hasattr(shape, "is_placeholder") and shape.is_placeholder: # type: ignore + if shape.placeholder_format and shape.placeholder_format.type: # type: ignore + self.placeholder_type = ( + str(shape.placeholder_format.type).split(".")[-1].split(" ")[0] # type: ignore + ) + + # Get default font size from layout + if slide and hasattr(slide, "slide_layout"): + self.default_font_size = self.get_default_font_size( + shape, slide.slide_layout + ) + + # Get position information + # Use absolute positions if provided (for shapes in groups), otherwise use shape's position + left_emu = ( + absolute_left + if absolute_left is not None + else (shape.left if hasattr(shape, "left") else 0) + ) + top_emu = ( + absolute_top + if absolute_top is not None + else (shape.top if hasattr(shape, "top") else 0) + ) + + self.left: float = round(self.emu_to_inches(left_emu), 2) # type: ignore + self.top: float = round(self.emu_to_inches(top_emu), 2) # type: ignore + self.width: float = round( + self.emu_to_inches(shape.width if hasattr(shape, "width") else 0), + 2, # type: ignore + ) + self.height: float = round( + self.emu_to_inches(shape.height if hasattr(shape, "height") else 0), + 2, # type: ignore + ) + + # Store EMU positions for overflow calculations + self.left_emu = left_emu + self.top_emu = top_emu + self.width_emu = shape.width if hasattr(shape, "width") else 0 + self.height_emu = shape.height if hasattr(shape, "height") else 0 + + # Calculate overflow status + self.frame_overflow_bottom: Optional[float] = None + self.slide_overflow_right: Optional[float] = None + self.slide_overflow_bottom: Optional[float] = None + self.overlapping_shapes: Dict[ + str, float + ] = {} # Dict of shape_id -> overlap area in sq inches + self.warnings: List[str] = [] + self._estimate_frame_overflow() + self._calculate_slide_overflow() + self._detect_bullet_issues() + + @property + def paragraphs(self) -> List[ParagraphData]: + """Calculate paragraphs from the shape's text frame.""" + if not self.shape or not hasattr(self.shape, "text_frame"): + return [] + + paragraphs = [] + for paragraph in self.shape.text_frame.paragraphs: # type: ignore + if paragraph.text.strip(): + paragraphs.append(ParagraphData(paragraph)) + return paragraphs + + def _get_default_font_size(self) -> int: + """Get default font size from theme text styles or use conservative default.""" + try: + if not ( + hasattr(self.shape, "part") and hasattr(self.shape.part, "slide_layout") + ): + return 14 + + slide_master = self.shape.part.slide_layout.slide_master # type: ignore + if not hasattr(slide_master, "element"): + return 14 + + # Determine theme style based on placeholder type + style_name = "bodyStyle" # Default + if self.placeholder_type and "TITLE" in self.placeholder_type: + style_name = "titleStyle" + + # Find font size in theme styles + for child in slide_master.element.iter(): + tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag + if tag == style_name: + for elem in child.iter(): + if "sz" in elem.attrib: + return int(elem.attrib["sz"]) // 100 + except Exception: + pass + + return 14 # Conservative default for body text + + def _get_usable_dimensions(self, text_frame) -> Tuple[int, int]: + """Get usable width and height in pixels after accounting for margins.""" + # Default PowerPoint margins in inches + margins = {"top": 0.05, "bottom": 0.05, "left": 0.1, "right": 0.1} + + # Override with actual margins if set + if hasattr(text_frame, "margin_top") and text_frame.margin_top: + margins["top"] = self.emu_to_inches(text_frame.margin_top) + if hasattr(text_frame, "margin_bottom") and text_frame.margin_bottom: + margins["bottom"] = self.emu_to_inches(text_frame.margin_bottom) + if hasattr(text_frame, "margin_left") and text_frame.margin_left: + margins["left"] = self.emu_to_inches(text_frame.margin_left) + if hasattr(text_frame, "margin_right") and text_frame.margin_right: + margins["right"] = self.emu_to_inches(text_frame.margin_right) + + # Calculate usable area + usable_width = self.width - margins["left"] - margins["right"] + usable_height = self.height - margins["top"] - margins["bottom"] + + # Convert to pixels + return ( + self.inches_to_pixels(usable_width), + self.inches_to_pixels(usable_height), + ) + + def _wrap_text_line(self, line: str, max_width_px: int, draw, font) -> List[str]: + """Wrap a single line of text to fit within max_width_px.""" + if not line: + return [""] + + # Use textlength for efficient width calculation + if draw.textlength(line, font=font) <= max_width_px: + return [line] + + # Need to wrap - split into words + wrapped = [] + words = line.split(" ") + current_line = "" + + for word in words: + test_line = current_line + (" " if current_line else "") + word + if draw.textlength(test_line, font=font) <= max_width_px: + current_line = test_line + else: + if current_line: + wrapped.append(current_line) + current_line = word + + if current_line: + wrapped.append(current_line) + + return wrapped + + def _estimate_frame_overflow(self) -> None: + """Estimate if text overflows the shape bounds using PIL text measurement.""" + if not self.shape or not hasattr(self.shape, "text_frame"): + return + + text_frame = self.shape.text_frame # type: ignore + if not text_frame or not text_frame.paragraphs: + return + + # Get usable dimensions after accounting for margins + usable_width_px, usable_height_px = self._get_usable_dimensions(text_frame) + if usable_width_px <= 0 or usable_height_px <= 0: + return + + # Set up PIL for text measurement + dummy_img = Image.new("RGB", (1, 1)) + draw = ImageDraw.Draw(dummy_img) + + # Get default font size from placeholder or use conservative estimate + default_font_size = self._get_default_font_size() + + # Calculate total height of all paragraphs + total_height_px = 0 + + for para_idx, paragraph in enumerate(text_frame.paragraphs): + if not paragraph.text.strip(): + continue + + para_data = ParagraphData(paragraph) + + # Load font for this paragraph + font_name = para_data.font_name or "Arial" + font_size = int(para_data.font_size or default_font_size) + + font = None + font_path = self.get_font_path(font_name) + if font_path: + try: + font = ImageFont.truetype(font_path, size=font_size) + except Exception: + font = ImageFont.load_default() + else: + font = ImageFont.load_default() + + # Wrap all lines in this paragraph + all_wrapped_lines = [] + for line in paragraph.text.split("\n"): + wrapped = self._wrap_text_line(line, usable_width_px, draw, font) + all_wrapped_lines.extend(wrapped) + + if all_wrapped_lines: + # Calculate line height + if para_data.line_spacing: + # Custom line spacing explicitly set + line_height_px = para_data.line_spacing * 96 / 72 + else: + # PowerPoint default single spacing (1.0x font size) + line_height_px = font_size * 96 / 72 + + # Add space_before (except first paragraph) + if para_idx > 0 and para_data.space_before: + total_height_px += para_data.space_before * 96 / 72 + + # Add paragraph text height + total_height_px += len(all_wrapped_lines) * line_height_px + + # Add space_after + if para_data.space_after: + total_height_px += para_data.space_after * 96 / 72 + + # Check for overflow (ignore negligible overflows <= 0.05") + if total_height_px > usable_height_px: + overflow_px = total_height_px - usable_height_px + overflow_inches = round(overflow_px / 96.0, 2) + if overflow_inches > 0.05: # Only report significant overflows + self.frame_overflow_bottom = overflow_inches + + def _calculate_slide_overflow(self) -> None: + """Calculate if shape overflows the slide boundaries.""" + if self.slide_width_emu is None or self.slide_height_emu is None: + return + + # Check right overflow (ignore negligible overflows <= 0.01") + right_edge_emu = self.left_emu + self.width_emu + if right_edge_emu > self.slide_width_emu: + overflow_emu = right_edge_emu - self.slide_width_emu + overflow_inches = round(self.emu_to_inches(overflow_emu), 2) + if overflow_inches > 0.01: # Only report significant overflows + self.slide_overflow_right = overflow_inches + + # Check bottom overflow (ignore negligible overflows <= 0.01") + bottom_edge_emu = self.top_emu + self.height_emu + if bottom_edge_emu > self.slide_height_emu: + overflow_emu = bottom_edge_emu - self.slide_height_emu + overflow_inches = round(self.emu_to_inches(overflow_emu), 2) + if overflow_inches > 0.01: # Only report significant overflows + self.slide_overflow_bottom = overflow_inches + + def _detect_bullet_issues(self) -> None: + """Detect bullet point formatting issues in paragraphs.""" + if not self.shape or not hasattr(self.shape, "text_frame"): + return + + text_frame = self.shape.text_frame # type: ignore + if not text_frame or not text_frame.paragraphs: + return + + # Common bullet symbols that indicate manual bullets + bullet_symbols = ["•", "●", "○"] + + for paragraph in text_frame.paragraphs: + text = paragraph.text.strip() + # Check for manual bullet symbols + if text and any(text.startswith(symbol + " ") for symbol in bullet_symbols): + self.warnings.append( + "manual_bullet_symbol: use proper bullet formatting" + ) + break + + @property + def has_any_issues(self) -> bool: + """Check if shape has any issues (overflow, overlap, or warnings).""" + return ( + self.frame_overflow_bottom is not None + or self.slide_overflow_right is not None + or self.slide_overflow_bottom is not None + or len(self.overlapping_shapes) > 0 + or len(self.warnings) > 0 + ) + + def to_dict(self) -> ShapeDict: + """Convert to dictionary for JSON serialization.""" + result: ShapeDict = { + "left": self.left, + "top": self.top, + "width": self.width, + "height": self.height, + } + + # Add optional fields if present + if self.placeholder_type: + result["placeholder_type"] = self.placeholder_type + + if self.default_font_size: + result["default_font_size"] = self.default_font_size + + # Add overflow information only if there is overflow + overflow_data = {} + + # Add frame overflow if present + if self.frame_overflow_bottom is not None: + overflow_data["frame"] = {"overflow_bottom": self.frame_overflow_bottom} + + # Add slide overflow if present + slide_overflow = {} + if self.slide_overflow_right is not None: + slide_overflow["overflow_right"] = self.slide_overflow_right + if self.slide_overflow_bottom is not None: + slide_overflow["overflow_bottom"] = self.slide_overflow_bottom + if slide_overflow: + overflow_data["slide"] = slide_overflow + + # Only add overflow field if there is overflow + if overflow_data: + result["overflow"] = overflow_data + + # Add overlap field if there are overlapping shapes + if self.overlapping_shapes: + result["overlap"] = {"overlapping_shapes": self.overlapping_shapes} + + # Add warnings field if there are warnings + if self.warnings: + result["warnings"] = self.warnings + + # Add paragraphs after placeholder_type + result["paragraphs"] = [para.to_dict() for para in self.paragraphs] + + return result + + +def is_valid_shape(shape: BaseShape) -> bool: + """Check if a shape contains meaningful text content.""" + # Must have a text frame with content + if not hasattr(shape, "text_frame") or not shape.text_frame: # type: ignore + return False + + text = shape.text_frame.text.strip() # type: ignore + if not text: + return False + + # Skip slide numbers and numeric footers + if hasattr(shape, "is_placeholder") and shape.is_placeholder: # type: ignore + if shape.placeholder_format and shape.placeholder_format.type: # type: ignore + placeholder_type = ( + str(shape.placeholder_format.type).split(".")[-1].split(" ")[0] # type: ignore + ) + if placeholder_type == "SLIDE_NUMBER": + return False + if placeholder_type == "FOOTER" and text.isdigit(): + return False + + return True + + +def collect_shapes_with_absolute_positions( + shape: BaseShape, parent_left: int = 0, parent_top: int = 0 +) -> List[ShapeWithPosition]: + """Recursively collect all shapes with valid text, calculating absolute positions. + + For shapes within groups, their positions are relative to the group. + This function calculates the absolute position on the slide by accumulating + parent group offsets. + + Args: + shape: The shape to process + parent_left: Accumulated left offset from parent groups (in EMUs) + parent_top: Accumulated top offset from parent groups (in EMUs) + + Returns: + List of ShapeWithPosition objects with absolute positions + """ + if hasattr(shape, "shapes"): # GroupShape + result = [] + # Get this group's position + group_left = shape.left if hasattr(shape, "left") else 0 + group_top = shape.top if hasattr(shape, "top") else 0 + + # Calculate absolute position for this group + abs_group_left = parent_left + group_left + abs_group_top = parent_top + group_top + + # Process children with accumulated offsets + for child in shape.shapes: # type: ignore + result.extend( + collect_shapes_with_absolute_positions( + child, abs_group_left, abs_group_top + ) + ) + return result + + # Regular shape - check if it has valid text + if is_valid_shape(shape): + # Calculate absolute position + shape_left = shape.left if hasattr(shape, "left") else 0 + shape_top = shape.top if hasattr(shape, "top") else 0 + + return [ + ShapeWithPosition( + shape=shape, + absolute_left=parent_left + shape_left, + absolute_top=parent_top + shape_top, + ) + ] + + return [] + + +def sort_shapes_by_position(shapes: List[ShapeData]) -> List[ShapeData]: + """Sort shapes by visual position (top-to-bottom, left-to-right). + + Shapes within 0.5 inches vertically are considered on the same row. + """ + if not shapes: + return shapes + + # Sort by top position first + shapes = sorted(shapes, key=lambda s: (s.top, s.left)) + + # Group shapes by row (within 0.5 inches vertically) + result = [] + row = [shapes[0]] + row_top = shapes[0].top + + for shape in shapes[1:]: + if abs(shape.top - row_top) <= 0.5: + row.append(shape) + else: + # Sort current row by left position and add to result + result.extend(sorted(row, key=lambda s: s.left)) + row = [shape] + row_top = shape.top + + # Don't forget the last row + result.extend(sorted(row, key=lambda s: s.left)) + return result + + +def calculate_overlap( + rect1: Tuple[float, float, float, float], + rect2: Tuple[float, float, float, float], + tolerance: float = 0.05, +) -> Tuple[bool, float]: + """Calculate if and how much two rectangles overlap. + + Args: + rect1: (left, top, width, height) of first rectangle in inches + rect2: (left, top, width, height) of second rectangle in inches + tolerance: Minimum overlap in inches to consider as overlapping (default: 0.05") + + Returns: + Tuple of (overlaps, overlap_area) where: + - overlaps: True if rectangles overlap by more than tolerance + - overlap_area: Area of overlap in square inches + """ + left1, top1, w1, h1 = rect1 + left2, top2, w2, h2 = rect2 + + # Calculate overlap dimensions + overlap_width = min(left1 + w1, left2 + w2) - max(left1, left2) + overlap_height = min(top1 + h1, top2 + h2) - max(top1, top2) + + # Check if there's meaningful overlap (more than tolerance) + if overlap_width > tolerance and overlap_height > tolerance: + # Calculate overlap area in square inches + overlap_area = overlap_width * overlap_height + return True, round(overlap_area, 2) + + return False, 0 + + +def detect_overlaps(shapes: List[ShapeData]) -> None: + """Detect overlapping shapes and update their overlapping_shapes dictionaries. + + This function requires each ShapeData to have its shape_id already set. + It modifies the shapes in-place, adding shape IDs with overlap areas in square inches. + + Args: + shapes: List of ShapeData objects with shape_id attributes set + """ + n = len(shapes) + + # Compare each pair of shapes + for i in range(n): + for j in range(i + 1, n): + shape1 = shapes[i] + shape2 = shapes[j] + + # Ensure shape IDs are set + assert shape1.shape_id, f"Shape at index {i} has no shape_id" + assert shape2.shape_id, f"Shape at index {j} has no shape_id" + + rect1 = (shape1.left, shape1.top, shape1.width, shape1.height) + rect2 = (shape2.left, shape2.top, shape2.width, shape2.height) + + overlaps, overlap_area = calculate_overlap(rect1, rect2) + + if overlaps: + # Add shape IDs with overlap area in square inches + shape1.overlapping_shapes[shape2.shape_id] = overlap_area + shape2.overlapping_shapes[shape1.shape_id] = overlap_area + + +def extract_text_inventory( + pptx_path: Path, prs: Optional[Any] = None, issues_only: bool = False +) -> InventoryData: + """Extract text content from all slides in a PowerPoint presentation. + + Args: + pptx_path: Path to the PowerPoint file + prs: Optional Presentation object to use. If not provided, will load from pptx_path. + issues_only: If True, only include shapes that have overflow or overlap issues + + Returns a nested dictionary: {slide-N: {shape-N: ShapeData}} + Shapes are sorted by visual position (top-to-bottom, left-to-right). + The ShapeData objects contain the full shape information and can be + converted to dictionaries for JSON serialization using to_dict(). + """ + if prs is None: + prs = Presentation(str(pptx_path)) + inventory: InventoryData = {} + + for slide_idx, slide in enumerate(prs.slides): + # Collect all valid shapes from this slide with absolute positions + shapes_with_positions = [] + for shape in slide.shapes: # type: ignore + shapes_with_positions.extend(collect_shapes_with_absolute_positions(shape)) + + if not shapes_with_positions: + continue + + # Convert to ShapeData with absolute positions and slide reference + shape_data_list = [ + ShapeData( + swp.shape, + swp.absolute_left, + swp.absolute_top, + slide, + ) + for swp in shapes_with_positions + ] + + # Sort by visual position and assign stable IDs in one step + sorted_shapes = sort_shapes_by_position(shape_data_list) + for idx, shape_data in enumerate(sorted_shapes): + shape_data.shape_id = f"shape-{idx}" + + # Detect overlaps using the stable shape IDs + if len(sorted_shapes) > 1: + detect_overlaps(sorted_shapes) + + # Filter for issues only if requested (after overlap detection) + if issues_only: + sorted_shapes = [sd for sd in sorted_shapes if sd.has_any_issues] + + if not sorted_shapes: + continue + + # Create slide inventory using the stable shape IDs + inventory[f"slide-{slide_idx}"] = { + shape_data.shape_id: shape_data for shape_data in sorted_shapes + } + + return inventory + + +def get_inventory_as_dict(pptx_path: Path, issues_only: bool = False) -> InventoryDict: + """Extract text inventory and return as JSON-serializable dictionaries. + + This is a convenience wrapper around extract_text_inventory that returns + dictionaries instead of ShapeData objects, useful for testing and direct + JSON serialization. + + Args: + pptx_path: Path to the PowerPoint file + issues_only: If True, only include shapes that have overflow or overlap issues + + Returns: + Nested dictionary with all data serialized for JSON + """ + inventory = extract_text_inventory(pptx_path, issues_only=issues_only) + + # Convert ShapeData objects to dictionaries + dict_inventory: InventoryDict = {} + for slide_key, shapes in inventory.items(): + dict_inventory[slide_key] = { + shape_key: shape_data.to_dict() for shape_key, shape_data in shapes.items() + } + + return dict_inventory + + +def save_inventory(inventory: InventoryData, output_path: Path) -> None: + """Save inventory to JSON file with proper formatting. + + Converts ShapeData objects to dictionaries for JSON serialization. + """ + # Convert ShapeData objects to dictionaries + json_inventory: InventoryDict = {} + for slide_key, shapes in inventory.items(): + json_inventory[slide_key] = { + shape_key: shape_data.to_dict() for shape_key, shape_data in shapes.items() + } + + with open(output_path, "w", encoding="utf-8") as f: + json.dump(json_inventory, f, indent=2, ensure_ascii=False) + + +if __name__ == "__main__": + main() diff --git a/skills/pptx/scripts/rearrange.py b/skills/pptx/scripts/rearrange.py new file mode 100644 index 0000000..2519911 --- /dev/null +++ b/skills/pptx/scripts/rearrange.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python3 +""" +Rearrange PowerPoint slides based on a sequence of indices. + +Usage: + python rearrange.py template.pptx output.pptx 0,34,34,50,52 + +This will create output.pptx using slides from template.pptx in the specified order. +Slides can be repeated (e.g., 34 appears twice). +""" + +import argparse +import shutil +import sys +from copy import deepcopy +from pathlib import Path + +import six +from pptx import Presentation + + +def main(): + parser = argparse.ArgumentParser( + description="Rearrange PowerPoint slides based on a sequence of indices.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python rearrange.py template.pptx output.pptx 0,34,34,50,52 + Creates output.pptx using slides 0, 34 (twice), 50, and 52 from template.pptx + + python rearrange.py template.pptx output.pptx 5,3,1,2,4 + Creates output.pptx with slides reordered as specified + +Note: Slide indices are 0-based (first slide is 0, second is 1, etc.) + """, + ) + + parser.add_argument("template", help="Path to template PPTX file") + parser.add_argument("output", help="Path for output PPTX file") + parser.add_argument( + "sequence", help="Comma-separated sequence of slide indices (0-based)" + ) + + args = parser.parse_args() + + # Parse the slide sequence + try: + slide_sequence = [int(x.strip()) for x in args.sequence.split(",")] + except ValueError: + print( + "Error: Invalid sequence format. Use comma-separated integers (e.g., 0,34,34,50,52)" + ) + sys.exit(1) + + # Check template exists + template_path = Path(args.template) + if not template_path.exists(): + print(f"Error: Template file not found: {args.template}") + sys.exit(1) + + # Create output directory if needed + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + + try: + rearrange_presentation(template_path, output_path, slide_sequence) + except ValueError as e: + print(f"Error: {e}") + sys.exit(1) + except Exception as e: + print(f"Error processing presentation: {e}") + sys.exit(1) + + +def duplicate_slide(pres, index): + """Duplicate a slide in the presentation.""" + source = pres.slides[index] + + # Use source's layout to preserve formatting + new_slide = pres.slides.add_slide(source.slide_layout) + + # Collect all image and media relationships from the source slide + image_rels = {} + for rel_id, rel in six.iteritems(source.part.rels): + if "image" in rel.reltype or "media" in rel.reltype: + image_rels[rel_id] = rel + + # CRITICAL: Clear placeholder shapes to avoid duplicates + for shape in new_slide.shapes: + sp = shape.element + sp.getparent().remove(sp) + + # Copy all shapes from source + for shape in source.shapes: + el = shape.element + new_el = deepcopy(el) + new_slide.shapes._spTree.insert_element_before(new_el, "p:extLst") + + # Handle picture shapes - need to update the blip reference + # Look for all blip elements (they can be in pic or other contexts) + # Using the element's own xpath method without namespaces argument + blips = new_el.xpath(".//a:blip[@r:embed]") + for blip in blips: + old_rId = blip.get( + "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed" + ) + if old_rId in image_rels: + # Create a new relationship in the destination slide for this image + old_rel = image_rels[old_rId] + # get_or_add returns the rId directly, or adds and returns new rId + new_rId = new_slide.part.rels.get_or_add( + old_rel.reltype, old_rel._target + ) + # Update the blip's embed reference to use the new relationship ID + blip.set( + "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed", + new_rId, + ) + + # Copy any additional image/media relationships that might be referenced elsewhere + for rel_id, rel in image_rels.items(): + try: + new_slide.part.rels.get_or_add(rel.reltype, rel._target) + except Exception: + pass # Relationship might already exist + + return new_slide + + +def delete_slide(pres, index): + """Delete a slide from the presentation.""" + rId = pres.slides._sldIdLst[index].rId + pres.part.drop_rel(rId) + del pres.slides._sldIdLst[index] + + +def reorder_slides(pres, slide_index, target_index): + """Move a slide from one position to another.""" + slides = pres.slides._sldIdLst + + # Remove slide element from current position + slide_element = slides[slide_index] + slides.remove(slide_element) + + # Insert at target position + slides.insert(target_index, slide_element) + + +def rearrange_presentation(template_path, output_path, slide_sequence): + """ + Create a new presentation with slides from template in specified order. + + Args: + template_path: Path to template PPTX file + output_path: Path for output PPTX file + slide_sequence: List of slide indices (0-based) to include + """ + # Copy template to preserve dimensions and theme + if template_path != output_path: + shutil.copy2(template_path, output_path) + prs = Presentation(output_path) + else: + prs = Presentation(template_path) + + total_slides = len(prs.slides) + + # Validate indices + for idx in slide_sequence: + if idx < 0 or idx >= total_slides: + raise ValueError(f"Slide index {idx} out of range (0-{total_slides - 1})") + + # Track original slides and their duplicates + slide_map = [] # List of actual slide indices for final presentation + duplicated = {} # Track duplicates: original_idx -> [duplicate_indices] + + # Step 1: DUPLICATE repeated slides + print(f"Processing {len(slide_sequence)} slides from template...") + for i, template_idx in enumerate(slide_sequence): + if template_idx in duplicated and duplicated[template_idx]: + # Already duplicated this slide, use the duplicate + slide_map.append(duplicated[template_idx].pop(0)) + print(f" [{i}] Using duplicate of slide {template_idx}") + elif slide_sequence.count(template_idx) > 1 and template_idx not in duplicated: + # First occurrence of a repeated slide - create duplicates + slide_map.append(template_idx) + duplicates = [] + count = slide_sequence.count(template_idx) - 1 + print( + f" [{i}] Using original slide {template_idx}, creating {count} duplicate(s)" + ) + for _ in range(count): + duplicate_slide(prs, template_idx) + duplicates.append(len(prs.slides) - 1) + duplicated[template_idx] = duplicates + else: + # Unique slide or first occurrence already handled, use original + slide_map.append(template_idx) + print(f" [{i}] Using original slide {template_idx}") + + # Step 2: DELETE unwanted slides (work backwards) + slides_to_keep = set(slide_map) + print(f"\nDeleting {len(prs.slides) - len(slides_to_keep)} unused slides...") + for i in range(len(prs.slides) - 1, -1, -1): + if i not in slides_to_keep: + delete_slide(prs, i) + # Update slide_map indices after deletion + slide_map = [idx - 1 if idx > i else idx for idx in slide_map] + + # Step 3: REORDER to final sequence + print(f"Reordering {len(slide_map)} slides to final sequence...") + for target_pos in range(len(slide_map)): + # Find which slide should be at target_pos + current_pos = slide_map[target_pos] + if current_pos != target_pos: + reorder_slides(prs, current_pos, target_pos) + # Update slide_map: the move shifts other slides + for i in range(len(slide_map)): + if slide_map[i] > current_pos and slide_map[i] <= target_pos: + slide_map[i] -= 1 + elif slide_map[i] < current_pos and slide_map[i] >= target_pos: + slide_map[i] += 1 + slide_map[target_pos] = target_pos + + # Save the presentation + prs.save(output_path) + print(f"\nSaved rearranged presentation to: {output_path}") + print(f"Final presentation has {len(prs.slides)} slides") + + +if __name__ == "__main__": + main() diff --git a/skills/pptx/scripts/replace.py b/skills/pptx/scripts/replace.py new file mode 100644 index 0000000..8f7a8b1 --- /dev/null +++ b/skills/pptx/scripts/replace.py @@ -0,0 +1,385 @@ +#!/usr/bin/env python3 +"""Apply text replacements to PowerPoint presentation. + +Usage: + python replace.py + +The replacements JSON should have the structure output by inventory.py. +ALL text shapes identified by inventory.py will have their text cleared +unless "paragraphs" is specified in the replacements for that shape. +""" + +import json +import sys +from pathlib import Path +from typing import Any, Dict, List + +from inventory import InventoryData, extract_text_inventory +from pptx import Presentation +from pptx.dml.color import RGBColor +from pptx.enum.dml import MSO_THEME_COLOR +from pptx.enum.text import PP_ALIGN +from pptx.oxml.xmlchemy import OxmlElement +from pptx.util import Pt + + +def clear_paragraph_bullets(paragraph): + """Clear bullet formatting from a paragraph.""" + pPr = paragraph._element.get_or_add_pPr() + + # Remove existing bullet elements + for child in list(pPr): + if ( + child.tag.endswith("buChar") + or child.tag.endswith("buNone") + or child.tag.endswith("buAutoNum") + or child.tag.endswith("buFont") + ): + pPr.remove(child) + + return pPr + + +def apply_paragraph_properties(paragraph, para_data: Dict[str, Any]): + """Apply formatting properties to a paragraph.""" + # Get the text but don't set it on paragraph directly yet + text = para_data.get("text", "") + + # Get or create paragraph properties + pPr = clear_paragraph_bullets(paragraph) + + # Handle bullet formatting + if para_data.get("bullet", False): + level = para_data.get("level", 0) + paragraph.level = level + + # Calculate font-proportional indentation + font_size = para_data.get("font_size", 18.0) + level_indent_emu = int((font_size * (1.6 + level * 1.6)) * 12700) + hanging_indent_emu = int(-font_size * 0.8 * 12700) + + # Set indentation + pPr.attrib["marL"] = str(level_indent_emu) + pPr.attrib["indent"] = str(hanging_indent_emu) + + # Add bullet character + buChar = OxmlElement("a:buChar") + buChar.set("char", "•") + pPr.append(buChar) + + # Default to left alignment for bullets if not specified + if "alignment" not in para_data: + paragraph.alignment = PP_ALIGN.LEFT + else: + # Remove indentation for non-bullet text + pPr.attrib["marL"] = "0" + pPr.attrib["indent"] = "0" + + # Add buNone element + buNone = OxmlElement("a:buNone") + pPr.insert(0, buNone) + + # Apply alignment + if "alignment" in para_data: + alignment_map = { + "LEFT": PP_ALIGN.LEFT, + "CENTER": PP_ALIGN.CENTER, + "RIGHT": PP_ALIGN.RIGHT, + "JUSTIFY": PP_ALIGN.JUSTIFY, + } + if para_data["alignment"] in alignment_map: + paragraph.alignment = alignment_map[para_data["alignment"]] + + # Apply spacing + if "space_before" in para_data: + paragraph.space_before = Pt(para_data["space_before"]) + if "space_after" in para_data: + paragraph.space_after = Pt(para_data["space_after"]) + if "line_spacing" in para_data: + paragraph.line_spacing = Pt(para_data["line_spacing"]) + + # Apply run-level formatting + if not paragraph.runs: + run = paragraph.add_run() + run.text = text + else: + run = paragraph.runs[0] + run.text = text + + # Apply font properties + apply_font_properties(run, para_data) + + +def apply_font_properties(run, para_data: Dict[str, Any]): + """Apply font properties to a text run.""" + if "bold" in para_data: + run.font.bold = para_data["bold"] + if "italic" in para_data: + run.font.italic = para_data["italic"] + if "underline" in para_data: + run.font.underline = para_data["underline"] + if "font_size" in para_data: + run.font.size = Pt(para_data["font_size"]) + if "font_name" in para_data: + run.font.name = para_data["font_name"] + + # Apply color - prefer RGB, fall back to theme_color + if "color" in para_data: + color_hex = para_data["color"].lstrip("#") + if len(color_hex) == 6: + r = int(color_hex[0:2], 16) + g = int(color_hex[2:4], 16) + b = int(color_hex[4:6], 16) + run.font.color.rgb = RGBColor(r, g, b) + elif "theme_color" in para_data: + # Get theme color by name (e.g., "DARK_1", "ACCENT_1") + theme_name = para_data["theme_color"] + try: + run.font.color.theme_color = getattr(MSO_THEME_COLOR, theme_name) + except AttributeError: + print(f" WARNING: Unknown theme color name '{theme_name}'") + + +def detect_frame_overflow(inventory: InventoryData) -> Dict[str, Dict[str, float]]: + """Detect text overflow in shapes (text exceeding shape bounds). + + Returns dict of slide_key -> shape_key -> overflow_inches. + Only includes shapes that have text overflow. + """ + overflow_map = {} + + for slide_key, shapes_dict in inventory.items(): + for shape_key, shape_data in shapes_dict.items(): + # Check for frame overflow (text exceeding shape bounds) + if shape_data.frame_overflow_bottom is not None: + if slide_key not in overflow_map: + overflow_map[slide_key] = {} + overflow_map[slide_key][shape_key] = shape_data.frame_overflow_bottom + + return overflow_map + + +def validate_replacements(inventory: InventoryData, replacements: Dict) -> List[str]: + """Validate that all shapes in replacements exist in inventory. + + Returns list of error messages. + """ + errors = [] + + for slide_key, shapes_data in replacements.items(): + if not slide_key.startswith("slide-"): + continue + + # Check if slide exists + if slide_key not in inventory: + errors.append(f"Slide '{slide_key}' not found in inventory") + continue + + # Check each shape + for shape_key in shapes_data.keys(): + if shape_key not in inventory[slide_key]: + # Find shapes without replacements defined and show their content + unused_with_content = [] + for k in inventory[slide_key].keys(): + if k not in shapes_data: + shape_data = inventory[slide_key][k] + # Get text from paragraphs as preview + paragraphs = shape_data.paragraphs + if paragraphs and paragraphs[0].text: + first_text = paragraphs[0].text[:50] + if len(paragraphs[0].text) > 50: + first_text += "..." + unused_with_content.append(f"{k} ('{first_text}')") + else: + unused_with_content.append(k) + + errors.append( + f"Shape '{shape_key}' not found on '{slide_key}'. " + f"Shapes without replacements: {', '.join(sorted(unused_with_content)) if unused_with_content else 'none'}" + ) + + return errors + + +def check_duplicate_keys(pairs): + """Check for duplicate keys when loading JSON.""" + result = {} + for key, value in pairs: + if key in result: + raise ValueError(f"Duplicate key found in JSON: '{key}'") + result[key] = value + return result + + +def apply_replacements(pptx_file: str, json_file: str, output_file: str): + """Apply text replacements from JSON to PowerPoint presentation.""" + + # Load presentation + prs = Presentation(pptx_file) + + # Get inventory of all text shapes (returns ShapeData objects) + # Pass prs to use same Presentation instance + inventory = extract_text_inventory(Path(pptx_file), prs) + + # Detect text overflow in original presentation + original_overflow = detect_frame_overflow(inventory) + + # Load replacement data with duplicate key detection + with open(json_file, "r") as f: + replacements = json.load(f, object_pairs_hook=check_duplicate_keys) + + # Validate replacements + errors = validate_replacements(inventory, replacements) + if errors: + print("ERROR: Invalid shapes in replacement JSON:") + for error in errors: + print(f" - {error}") + print("\nPlease check the inventory and update your replacement JSON.") + print( + "You can regenerate the inventory with: python inventory.py " + ) + raise ValueError(f"Found {len(errors)} validation error(s)") + + # Track statistics + shapes_processed = 0 + shapes_cleared = 0 + shapes_replaced = 0 + + # Process each slide from inventory + for slide_key, shapes_dict in inventory.items(): + if not slide_key.startswith("slide-"): + continue + + slide_index = int(slide_key.split("-")[1]) + + if slide_index >= len(prs.slides): + print(f"Warning: Slide {slide_index} not found") + continue + + # Process each shape from inventory + for shape_key, shape_data in shapes_dict.items(): + shapes_processed += 1 + + # Get the shape directly from ShapeData + shape = shape_data.shape + if not shape: + print(f"Warning: {shape_key} has no shape reference") + continue + + # ShapeData already validates text_frame in __init__ + text_frame = shape.text_frame # type: ignore + + text_frame.clear() # type: ignore + shapes_cleared += 1 + + # Check for replacement paragraphs + replacement_shape_data = replacements.get(slide_key, {}).get(shape_key, {}) + if "paragraphs" not in replacement_shape_data: + continue + + shapes_replaced += 1 + + # Add replacement paragraphs + for i, para_data in enumerate(replacement_shape_data["paragraphs"]): + if i == 0: + p = text_frame.paragraphs[0] # type: ignore + else: + p = text_frame.add_paragraph() # type: ignore + + apply_paragraph_properties(p, para_data) + + # Check for issues after replacements + # Save to a temporary file and reload to avoid modifying the presentation during inventory + # (extract_text_inventory accesses font.color which adds empty elements) + import tempfile + + with tempfile.NamedTemporaryFile(suffix=".pptx", delete=False) as tmp: + tmp_path = Path(tmp.name) + prs.save(str(tmp_path)) + + try: + updated_inventory = extract_text_inventory(tmp_path) + updated_overflow = detect_frame_overflow(updated_inventory) + finally: + tmp_path.unlink() # Clean up temp file + + # Check if any text overflow got worse + overflow_errors = [] + for slide_key, shape_overflows in updated_overflow.items(): + for shape_key, new_overflow in shape_overflows.items(): + # Get original overflow (0 if there was no overflow before) + original = original_overflow.get(slide_key, {}).get(shape_key, 0.0) + + # Error if overflow increased + if new_overflow > original + 0.01: # Small tolerance for rounding + increase = new_overflow - original + overflow_errors.append( + f'{slide_key}/{shape_key}: overflow worsened by {increase:.2f}" ' + f'(was {original:.2f}", now {new_overflow:.2f}")' + ) + + # Collect warnings from updated shapes + warnings = [] + for slide_key, shapes_dict in updated_inventory.items(): + for shape_key, shape_data in shapes_dict.items(): + if shape_data.warnings: + for warning in shape_data.warnings: + warnings.append(f"{slide_key}/{shape_key}: {warning}") + + # Fail if there are any issues + if overflow_errors or warnings: + print("\nERROR: Issues detected in replacement output:") + if overflow_errors: + print("\nText overflow worsened:") + for error in overflow_errors: + print(f" - {error}") + if warnings: + print("\nFormatting warnings:") + for warning in warnings: + print(f" - {warning}") + print("\nPlease fix these issues before saving.") + raise ValueError( + f"Found {len(overflow_errors)} overflow error(s) and {len(warnings)} warning(s)" + ) + + # Save the presentation + prs.save(output_file) + + # Report results + print(f"Saved updated presentation to: {output_file}") + print(f"Processed {len(prs.slides)} slides") + print(f" - Shapes processed: {shapes_processed}") + print(f" - Shapes cleared: {shapes_cleared}") + print(f" - Shapes replaced: {shapes_replaced}") + + +def main(): + """Main entry point for command-line usage.""" + if len(sys.argv) != 4: + print(__doc__) + sys.exit(1) + + input_pptx = Path(sys.argv[1]) + replacements_json = Path(sys.argv[2]) + output_pptx = Path(sys.argv[3]) + + if not input_pptx.exists(): + print(f"Error: Input file '{input_pptx}' not found") + sys.exit(1) + + if not replacements_json.exists(): + print(f"Error: Replacements JSON file '{replacements_json}' not found") + sys.exit(1) + + try: + apply_replacements(str(input_pptx), str(replacements_json), str(output_pptx)) + except Exception as e: + print(f"Error applying replacements: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/skills/pptx/scripts/thumbnail.py b/skills/pptx/scripts/thumbnail.py new file mode 100644 index 0000000..5c7fdf1 --- /dev/null +++ b/skills/pptx/scripts/thumbnail.py @@ -0,0 +1,450 @@ +#!/usr/bin/env python3 +""" +Create thumbnail grids from PowerPoint presentation slides. + +Creates a grid layout of slide thumbnails with configurable columns (max 6). +Each grid contains up to cols×(cols+1) images. For presentations with more +slides, multiple numbered grid files are created automatically. + +The program outputs the names of all files created. + +Output: +- Single grid: {prefix}.jpg (if slides fit in one grid) +- Multiple grids: {prefix}-1.jpg, {prefix}-2.jpg, etc. + +Grid limits by column count: +- 3 cols: max 12 slides per grid (3×4) +- 4 cols: max 20 slides per grid (4×5) +- 5 cols: max 30 slides per grid (5×6) [default] +- 6 cols: max 42 slides per grid (6×7) + +Usage: + python thumbnail.py input.pptx [output_prefix] [--cols N] [--outline-placeholders] + +Examples: + python thumbnail.py presentation.pptx + # Creates: thumbnails.jpg (using default prefix) + # Outputs: + # Created 1 grid(s): + # - thumbnails.jpg + + python thumbnail.py large-deck.pptx grid --cols 4 + # Creates: grid-1.jpg, grid-2.jpg, grid-3.jpg + # Outputs: + # Created 3 grid(s): + # - grid-1.jpg + # - grid-2.jpg + # - grid-3.jpg + + python thumbnail.py template.pptx analysis --outline-placeholders + # Creates thumbnail grids with red outlines around text placeholders +""" + +import argparse +import subprocess +import sys +import tempfile +from pathlib import Path + +from inventory import extract_text_inventory +from PIL import Image, ImageDraw, ImageFont +from pptx import Presentation + +# Constants +THUMBNAIL_WIDTH = 300 # Fixed thumbnail width in pixels +CONVERSION_DPI = 100 # DPI for PDF to image conversion +MAX_COLS = 6 # Maximum number of columns +DEFAULT_COLS = 5 # Default number of columns +JPEG_QUALITY = 95 # JPEG compression quality + +# Grid layout constants +GRID_PADDING = 20 # Padding between thumbnails +BORDER_WIDTH = 2 # Border width around thumbnails +FONT_SIZE_RATIO = 0.12 # Font size as fraction of thumbnail width +LABEL_PADDING_RATIO = 0.4 # Label padding as fraction of font size + + +def main(): + parser = argparse.ArgumentParser( + description="Create thumbnail grids from PowerPoint slides." + ) + parser.add_argument("input", help="Input PowerPoint file (.pptx)") + parser.add_argument( + "output_prefix", + nargs="?", + default="thumbnails", + help="Output prefix for image files (default: thumbnails, will create prefix.jpg or prefix-N.jpg)", + ) + parser.add_argument( + "--cols", + type=int, + default=DEFAULT_COLS, + help=f"Number of columns (default: {DEFAULT_COLS}, max: {MAX_COLS})", + ) + parser.add_argument( + "--outline-placeholders", + action="store_true", + help="Outline text placeholders with a colored border", + ) + + args = parser.parse_args() + + # Validate columns + cols = min(args.cols, MAX_COLS) + if args.cols > MAX_COLS: + print(f"Warning: Columns limited to {MAX_COLS} (requested {args.cols})") + + # Validate input + input_path = Path(args.input) + if not input_path.exists() or input_path.suffix.lower() != ".pptx": + print(f"Error: Invalid PowerPoint file: {args.input}") + sys.exit(1) + + # Construct output path (always JPG) + output_path = Path(f"{args.output_prefix}.jpg") + + print(f"Processing: {args.input}") + + try: + with tempfile.TemporaryDirectory() as temp_dir: + # Get placeholder regions if outlining is enabled + placeholder_regions = None + slide_dimensions = None + if args.outline_placeholders: + print("Extracting placeholder regions...") + placeholder_regions, slide_dimensions = get_placeholder_regions( + input_path + ) + if placeholder_regions: + print(f"Found placeholders on {len(placeholder_regions)} slides") + + # Convert slides to images + slide_images = convert_to_images(input_path, Path(temp_dir), CONVERSION_DPI) + if not slide_images: + print("Error: No slides found") + sys.exit(1) + + print(f"Found {len(slide_images)} slides") + + # Create grids (max cols×(cols+1) images per grid) + grid_files = create_grids( + slide_images, + cols, + THUMBNAIL_WIDTH, + output_path, + placeholder_regions, + slide_dimensions, + ) + + # Print saved files + print(f"Created {len(grid_files)} grid(s):") + for grid_file in grid_files: + print(f" - {grid_file}") + + except Exception as e: + print(f"Error: {e}") + sys.exit(1) + + +def create_hidden_slide_placeholder(size): + """Create placeholder image for hidden slides.""" + img = Image.new("RGB", size, color="#F0F0F0") + draw = ImageDraw.Draw(img) + line_width = max(5, min(size) // 100) + draw.line([(0, 0), size], fill="#CCCCCC", width=line_width) + draw.line([(size[0], 0), (0, size[1])], fill="#CCCCCC", width=line_width) + return img + + +def get_placeholder_regions(pptx_path): + """Extract ALL text regions from the presentation. + + Returns a tuple of (placeholder_regions, slide_dimensions). + text_regions is a dict mapping slide indices to lists of text regions. + Each region is a dict with 'left', 'top', 'width', 'height' in inches. + slide_dimensions is a tuple of (width_inches, height_inches). + """ + prs = Presentation(str(pptx_path)) + inventory = extract_text_inventory(pptx_path, prs) + placeholder_regions = {} + + # Get actual slide dimensions in inches (EMU to inches conversion) + slide_width_inches = (prs.slide_width or 9144000) / 914400.0 + slide_height_inches = (prs.slide_height or 5143500) / 914400.0 + + for slide_key, shapes in inventory.items(): + # Extract slide index from "slide-N" format + slide_idx = int(slide_key.split("-")[1]) + regions = [] + + for shape_key, shape_data in shapes.items(): + # The inventory only contains shapes with text, so all shapes should be highlighted + regions.append( + { + "left": shape_data.left, + "top": shape_data.top, + "width": shape_data.width, + "height": shape_data.height, + } + ) + + if regions: + placeholder_regions[slide_idx] = regions + + return placeholder_regions, (slide_width_inches, slide_height_inches) + + +def convert_to_images(pptx_path, temp_dir, dpi): + """Convert PowerPoint to images via PDF, handling hidden slides.""" + # Detect hidden slides + print("Analyzing presentation...") + prs = Presentation(str(pptx_path)) + total_slides = len(prs.slides) + + # Find hidden slides (1-based indexing for display) + hidden_slides = { + idx + 1 + for idx, slide in enumerate(prs.slides) + if slide.element.get("show") == "0" + } + + print(f"Total slides: {total_slides}") + if hidden_slides: + print(f"Hidden slides: {sorted(hidden_slides)}") + + pdf_path = temp_dir / f"{pptx_path.stem}.pdf" + + # Convert to PDF + print("Converting to PDF...") + result = subprocess.run( + [ + "soffice", + "--headless", + "--convert-to", + "pdf", + "--outdir", + str(temp_dir), + str(pptx_path), + ], + capture_output=True, + text=True, + ) + if result.returncode != 0 or not pdf_path.exists(): + raise RuntimeError("PDF conversion failed") + + # Convert PDF to images + print(f"Converting to images at {dpi} DPI...") + result = subprocess.run( + ["pdftoppm", "-jpeg", "-r", str(dpi), str(pdf_path), str(temp_dir / "slide")], + capture_output=True, + text=True, + ) + if result.returncode != 0: + raise RuntimeError("Image conversion failed") + + visible_images = sorted(temp_dir.glob("slide-*.jpg")) + + # Create full list with placeholders for hidden slides + all_images = [] + visible_idx = 0 + + # Get placeholder dimensions from first visible slide + if visible_images: + with Image.open(visible_images[0]) as img: + placeholder_size = img.size + else: + placeholder_size = (1920, 1080) + + for slide_num in range(1, total_slides + 1): + if slide_num in hidden_slides: + # Create placeholder image for hidden slide + placeholder_path = temp_dir / f"hidden-{slide_num:03d}.jpg" + placeholder_img = create_hidden_slide_placeholder(placeholder_size) + placeholder_img.save(placeholder_path, "JPEG") + all_images.append(placeholder_path) + else: + # Use the actual visible slide image + if visible_idx < len(visible_images): + all_images.append(visible_images[visible_idx]) + visible_idx += 1 + + return all_images + + +def create_grids( + image_paths, + cols, + width, + output_path, + placeholder_regions=None, + slide_dimensions=None, +): + """Create multiple thumbnail grids from slide images, max cols×(cols+1) images per grid.""" + # Maximum images per grid is cols × (cols + 1) for better proportions + max_images_per_grid = cols * (cols + 1) + grid_files = [] + + print( + f"Creating grids with {cols} columns (max {max_images_per_grid} images per grid)" + ) + + # Split images into chunks + for chunk_idx, start_idx in enumerate( + range(0, len(image_paths), max_images_per_grid) + ): + end_idx = min(start_idx + max_images_per_grid, len(image_paths)) + chunk_images = image_paths[start_idx:end_idx] + + # Create grid for this chunk + grid = create_grid( + chunk_images, cols, width, start_idx, placeholder_regions, slide_dimensions + ) + + # Generate output filename + if len(image_paths) <= max_images_per_grid: + # Single grid - use base filename without suffix + grid_filename = output_path + else: + # Multiple grids - insert index before extension with dash + stem = output_path.stem + suffix = output_path.suffix + grid_filename = output_path.parent / f"{stem}-{chunk_idx + 1}{suffix}" + + # Save grid + grid_filename.parent.mkdir(parents=True, exist_ok=True) + grid.save(str(grid_filename), quality=JPEG_QUALITY) + grid_files.append(str(grid_filename)) + + return grid_files + + +def create_grid( + image_paths, + cols, + width, + start_slide_num=0, + placeholder_regions=None, + slide_dimensions=None, +): + """Create thumbnail grid from slide images with optional placeholder outlining.""" + font_size = int(width * FONT_SIZE_RATIO) + label_padding = int(font_size * LABEL_PADDING_RATIO) + + # Get dimensions + with Image.open(image_paths[0]) as img: + aspect = img.height / img.width + height = int(width * aspect) + + # Calculate grid size + rows = (len(image_paths) + cols - 1) // cols + grid_w = cols * width + (cols + 1) * GRID_PADDING + grid_h = rows * (height + font_size + label_padding * 2) + (rows + 1) * GRID_PADDING + + # Create grid + grid = Image.new("RGB", (grid_w, grid_h), "white") + draw = ImageDraw.Draw(grid) + + # Load font with size based on thumbnail width + try: + # Use Pillow's default font with size + font = ImageFont.load_default(size=font_size) + except Exception: + # Fall back to basic default font if size parameter not supported + font = ImageFont.load_default() + + # Place thumbnails + for i, img_path in enumerate(image_paths): + row, col = i // cols, i % cols + x = col * width + (col + 1) * GRID_PADDING + y_base = ( + row * (height + font_size + label_padding * 2) + (row + 1) * GRID_PADDING + ) + + # Add label with actual slide number + label = f"{start_slide_num + i}" + bbox = draw.textbbox((0, 0), label, font=font) + text_w = bbox[2] - bbox[0] + draw.text( + (x + (width - text_w) // 2, y_base + label_padding), + label, + fill="black", + font=font, + ) + + # Add thumbnail below label with proportional spacing + y_thumbnail = y_base + label_padding + font_size + label_padding + + with Image.open(img_path) as img: + # Get original dimensions before thumbnail + orig_w, orig_h = img.size + + # Apply placeholder outlines if enabled + if placeholder_regions and (start_slide_num + i) in placeholder_regions: + # Convert to RGBA for transparency support + if img.mode != "RGBA": + img = img.convert("RGBA") + + # Get the regions for this slide + regions = placeholder_regions[start_slide_num + i] + + # Calculate scale factors using actual slide dimensions + if slide_dimensions: + slide_width_inches, slide_height_inches = slide_dimensions + else: + # Fallback: estimate from image size at CONVERSION_DPI + slide_width_inches = orig_w / CONVERSION_DPI + slide_height_inches = orig_h / CONVERSION_DPI + + x_scale = orig_w / slide_width_inches + y_scale = orig_h / slide_height_inches + + # Create a highlight overlay + overlay = Image.new("RGBA", img.size, (255, 255, 255, 0)) + overlay_draw = ImageDraw.Draw(overlay) + + # Highlight each placeholder region + for region in regions: + # Convert from inches to pixels in the original image + px_left = int(region["left"] * x_scale) + px_top = int(region["top"] * y_scale) + px_width = int(region["width"] * x_scale) + px_height = int(region["height"] * y_scale) + + # Draw highlight outline with red color and thick stroke + # Using a bright red outline instead of fill + stroke_width = max( + 5, min(orig_w, orig_h) // 150 + ) # Thicker proportional stroke width + overlay_draw.rectangle( + [(px_left, px_top), (px_left + px_width, px_top + px_height)], + outline=(255, 0, 0, 255), # Bright red, fully opaque + width=stroke_width, + ) + + # Composite the overlay onto the image using alpha blending + img = Image.alpha_composite(img, overlay) + # Convert back to RGB for JPEG saving + img = img.convert("RGB") + + img.thumbnail((width, height), Image.Resampling.LANCZOS) + w, h = img.size + tx = x + (width - w) // 2 + ty = y_thumbnail + (height - h) // 2 + grid.paste(img, (tx, ty)) + + # Add border + if BORDER_WIDTH > 0: + draw.rectangle( + [ + (tx - BORDER_WIDTH, ty - BORDER_WIDTH), + (tx + w + BORDER_WIDTH - 1, ty + h + BORDER_WIDTH - 1), + ], + outline="gray", + width=BORDER_WIDTH, + ) + + return grid + + +if __name__ == "__main__": + main() diff --git a/skills/pptx/template.pptx b/skills/pptx/template.pptx new file mode 100644 index 0000000..5531170 Binary files /dev/null and b/skills/pptx/template.pptx differ