commit 3cf7af596267826ce1f3fb5cece3f8edd763340e Author: Zhongwei Li Date: Sat Nov 29 18:03:26 2025 +0800 Initial commit diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..40df132 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,15 @@ +{ + "name": "document-skills", + "description": "Collection of document processing suite including Excel, Word, PowerPoint, and PDF capabilities", + "version": "0.0.0-2025.11.28", + "author": { + "name": "Cameron Rohn", + "email": "github@cameronrohn.com" + }, + "skills": [ + "./skills/xlsx", + "./skills/docx", + "./skills/pptx", + "./skills/pdf" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..ce5165e --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# document-skills + +Collection of document processing suite including Excel, Word, PowerPoint, and PDF capabilities diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..226466b --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,564 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:Cam10001110101/claude-skills-base:document-skills", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "fb9ac202a94b58fdb76adf9a459306cfbbfc2f84", + "treeHash": "47eca301dcb8f3011a3efdb315bab412e3f8bbc7ab82f7d0c78ada7f3f68e204", + "generatedAt": "2025-11-28T10:10:00.947781Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "document-skills", + "description": "Collection of document processing suite including Excel, Word, PowerPoint, and PDF capabilities" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "b0bd4626e1c6924ac7fc6ace512ab54e3415b1fe72c816113a7e77d3e0cdda7b" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "9f1c38f44e3f0acdfda22e07c101b8739dc70b60b4abce6f478edae8c012d7df" + }, + { + "path": "skills/xlsx/recalc.py", + "sha256": "ab1ef0c94536bb23b6c6a3d32769b0401ec3cc85e73c247d574dd84ec73af15d" + }, + { + "path": "skills/xlsx/SKILL.md", + "sha256": "020ccdb5932257b66c638ec1157ea248d57fa52c8c01f1f68b559b5970c7df35" + }, + { + "path": "skills/xlsx/LICENSE.txt", + "sha256": "79f6d8f5b427252fa3b1c11ecdbdb6bf610b944f7530b4de78f770f38741cfaa" + }, + { + "path": "skills/pdf/REFERENCE.md", + "sha256": "03a5f964f8abecbbe156f363356e927e864d7ee964f1012c84ee1bfc8acbeb95" + }, + { + "path": "skills/pdf/FORMS.md", + "sha256": "0ab10e9095deb1c1f9f79eb04254589f55c1d16e095cb53191e03f9fc3184449" + }, + { + "path": "skills/pdf/SKILL.md", + "sha256": "6c17133c12d30173ab5c32db3f37fa303ef473b90ec7da4a66443337deb28131" + }, + { + "path": "skills/pdf/LICENSE.txt", + "sha256": "79f6d8f5b427252fa3b1c11ecdbdb6bf610b944f7530b4de78f770f38741cfaa" + }, + { + "path": "skills/pdf/scripts/fill_fillable_fields.py", + "sha256": "0b954387dd7069dc5f38209d90cd8dce3839a9cc95490f258f331fd5e8233ba2" + }, + { + "path": "skills/pdf/scripts/convert_pdf_to_images.py", + "sha256": "095a0105a718af75ede309cb03f84a20c81d17f1727f7686fd4b294f1f40294f" + }, + { + "path": "skills/pdf/scripts/extract_form_field_info.py", + "sha256": "191a3d9d8b43f6abce0e0e35cf56f5f1c1760521bdacc590b5b154389e4c270a" + }, + { + "path": "skills/pdf/scripts/check_bounding_boxes.py", + "sha256": "98c84bfb3e712e846df046418809caa7e2eee01264ef3caaa22c4fa4c9d4c4c8" + }, + { + "path": "skills/pdf/scripts/check_bounding_boxes_test.py", + "sha256": "f95dca01a8b79aafd152511e9f7bf2bbcd606dde1be77d691f03a18624e002ca" + }, + { + "path": "skills/pdf/scripts/create_validation_image.py", + "sha256": "90b3bba0bd492b5f18b7c0c47f61cc047bc7b6cbaf965d3762b1bb74f6516fd9" + }, + { + "path": "skills/pdf/scripts/fill_pdf_form_with_annotations.py", + "sha256": "1592c3d705602baf1ca08589228c361765e1ab726cd94f0c45f00d33d9e007b0" + }, + { + "path": "skills/pdf/scripts/check_fillable_fields.py", + "sha256": "159c9525764f95815cd57a02dafccb7b66e20fa6c5f356ab9c0da6e0a8b672ba" + }, + { + "path": "skills/pptx/css.md", + "sha256": "fd6551c21cad23c3b580a7032422f437ee3c2cf115808df6ebd37ea58c0d1353" + }, + { + "path": "skills/pptx/ooxml.md", + "sha256": "09868e9f1786765421ecf3f0f49c77006738efda82a76df43ed87f7a9bfe2467" + }, + { + "path": "skills/pptx/html2pptx.tgz", + "sha256": "542e19f94c89a888a4708fa1728dff9b082a9fb2d65c5114a730a6f9060c6e6a" + }, + { + "path": "skills/pptx/SKILL.md", + "sha256": "336864cfb0878052defd1b014acde767ba2aeed96a9fd8e28b5bb2b462807007" + }, + { + "path": "skills/pptx/html2pptx.md", + "sha256": "cadbe677d069867f96ce1dc6e24c84fe3169b735bef12f113999a0d07c032457" + }, + { + "path": "skills/pptx/LICENSE.txt", + "sha256": "79f6d8f5b427252fa3b1c11ecdbdb6bf610b944f7530b4de78f770f38741cfaa" + }, + { + "path": "skills/pptx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd", + "sha256": "842e7163409c8d74f4d7088a8bc99500d80bc75332681a0980055b08f374a604" + }, + { + "path": "skills/pptx/ooxml/schemas/microsoft/wml-2012.xsd", + "sha256": "0fa75578a000439a7988ba0c59fdc69f774bbd416cbacc14d07125b3f686cb74" + }, + { + "path": "skills/pptx/ooxml/schemas/microsoft/wml-2010.xsd", + "sha256": "568b26ee156cb9549aa439ca2158965f77b7c1602b7e0316f40ac6cf586e35f2" + }, + { + "path": "skills/pptx/ooxml/schemas/microsoft/wml-cid-2016.xsd", + "sha256": "127ca209fa73d7cb708449cb355c871867948a96e4a74f7bf5811ef62d17991d" + }, + { + "path": "skills/pptx/ooxml/schemas/microsoft/wml-symex-2015.xsd", + "sha256": "16f6f8072249f431370723c2cd8974672e0d9c897e00e97dd918079df934871b" + }, + { + "path": "skills/pptx/ooxml/schemas/microsoft/wml-cex-2018.xsd", + "sha256": "fddc2b880cabb9005aebbc7e783e53c19fec1c03df7d0e2f2076a33a0fdfd081" + }, + { + "path": "skills/pptx/ooxml/schemas/microsoft/wml-2018.xsd", + "sha256": "be0ff793a22dd31384650c3a4da14c2fa8062751c2e97b0e5ee852bda13c60ad" + }, + { + "path": "skills/pptx/ooxml/schemas/mce/mc.xsd", + "sha256": "3a37e461ecf5a8670fdec34029703401f8728ab9c96ec1739a6ae58d55212413" + }, + { + "path": "skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd", + "sha256": "451958454e8588dfc7cd945981ada142ca06ff3307937f5700df059c2b307fa8" + }, + { + "path": "skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd", + "sha256": "f565adfef5a502044abc3a9153e157edc25af78304d335994afb958874b15e26" + }, + { + "path": "skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd", + "sha256": "9e0b7209fc69ab11987900404540969976000c5ebe4d4f58c43dc3842886bf3a" + }, + { + "path": "skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd", + "sha256": "6de111e11403f7cd49027400755bae0ea1cabef2815f09bd40a24f0017613b24" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd", + "sha256": "133c9f64a5c5d573b78d0a474122b22506d8eadb5e063f67cdbbb8fa2f161d0e" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd", + "sha256": "585bedc1313b40888dcc544cb74cd939a105ee674f3b1d3aa1cc6d34f70ff155" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd", + "sha256": "0d103b99a4a8652f8871552a69d42d2a3760ac6a5e3ef02d979c4273257ff6a4" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd", + "sha256": "d173c3e5d61e42e2e3a97226c632fd2ab7cc481fc4e492365b87024ab546daff" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd", + "sha256": "5cb76dabd8b97d1e9308a1700b90c20139be4d50792d21a7f09789f5cccd6026" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd", + "sha256": "41b93bd8857cc68b1e43be2806a872d736a9bdd6566900062d8fdb57d7bbb354" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd", + "sha256": "3fd0586f2637b98bb9886f0e0b67d89e1cc987c2d158cc7deb5f5b9890ced412" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd", + "sha256": "beffeed56945c22a77440122c8bdc426f3fcbe7f3b12ea0976c770d1f8d54578" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd", + "sha256": "6bdeb169c3717eb01108853bd9fc5a3750fb1fa5b82abbdd854d49855a40f519" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd", + "sha256": "c2dd9f61f892deae6acd8d20771ea79b12018af25f3bf8d06639c8542d218cfd" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd", + "sha256": "5d389d42befbebd91945d620242347caecd3367f9a3a7cf8d97949507ae1f53c" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd", + "sha256": "29b254ee0d10414a8504b5a08149c7baec35a60d5ff607d6b3f492aa36815f40" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd", + "sha256": "5375417f0f5394b8dd1a7035b9679151f19a6b65df309dec10cfb4a420cb00e9" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd", + "sha256": "9c085407751b9061c1f996f6c39ce58451be22a8d334f09175f0e89e42736285" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd", + "sha256": "b4532b6d258832953fbb3ee4c711f4fe25d3faf46a10644b2505f17010d01e88" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd", + "sha256": "e2abacbb9a55ce1365f8961bc1b1395bbc811e512b111000d8c333f98458dece" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd", + "sha256": "bdad416b096b61d37b71603b2c949484f9070c830bdaeba93bf35e15c8900614" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd", + "sha256": "475dcae1e7d1ea46232db6f8481040c15e53a52a3c256831d3df204212b0e831" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd", + "sha256": "0b364451dc36a48dd6dae0f3b6ada05fd9b71e5208211f8ee5537d7e51a587e2" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd", + "sha256": "bc92e36ccd233722d4c5869bec71ddc7b12e2df56059942cce5a39065cc9c368" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd", + "sha256": "f5ee623b08b6a66935e5aced2f5d8ad0fc71bf9e8e833cd490150c0fa94b8763" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd", + "sha256": "a539aa2fb154fa50e0f5cc97e6ad7cbc66f8ec3e3746f61ec6a8b0d5d15ecdf2" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd", + "sha256": "12264f3c03d738311cd9237d212f1c07479e70f0cbe1ae725d29b36539aef637" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd", + "sha256": "0ef4bb354ff44b923564c4ddbdda5987919d220225129ec94614a618ceafc281" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd", + "sha256": "7b5b7413e2c895b1e148e82e292a117d53c7ec65b0696c992edca57b61b4a74b" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd", + "sha256": "3213ef1631606250f5010b42cad7ef716f7c59426367798e33c374c0ec391d3a" + }, + { + "path": "skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd", + "sha256": "3c6709101c6aaa82888df5d8795c33f9e857196790eb320d9194e64be2b6bdd8" + }, + { + "path": "skills/pptx/ooxml/scripts/pack.py", + "sha256": "6fe762f45aff8c63fd95b9fcb1337b28921d6fa454e18a0e8158d4c8708d6d00" + }, + { + "path": "skills/pptx/ooxml/scripts/validate.py", + "sha256": "1ec252de8b14b07d16966c48906ccb1c45c68bcd23557ad31d8c50a27f5f8c0f" + }, + { + "path": "skills/pptx/ooxml/scripts/unpack.py", + "sha256": "0bd17f76a1a4c388aba42c6d1d39015fa84e405c3e0692397fe12762bd632b58" + }, + { + "path": "skills/pptx/ooxml/scripts/validation/docx.py", + "sha256": "e65d6cda0525866a24cc847b2e883bd2416ae6f87b3f5b9e2784dfbb0ec13093" + }, + { + "path": "skills/pptx/ooxml/scripts/validation/__init__.py", + "sha256": "83e0f035c5abea238d3f2c3968afbd511ed022b527b7c9cb60a9434cc34ff987" + }, + { + "path": "skills/pptx/ooxml/scripts/validation/redlining.py", + "sha256": "97abfdff4f08f43f9a4bb5c8a2f8fd483398b5b339592724e8635153b5507967" + }, + { + "path": "skills/pptx/ooxml/scripts/validation/pptx.py", + "sha256": "00bf2623da1177b3948143a4ade2f1cda7cb389dee31960861913fa42ef1b00f" + }, + { + "path": "skills/pptx/ooxml/scripts/validation/base.py", + "sha256": "f2c70d481613456e32b43869d1604b05c236c8da34b5b3967677a661cac7ba63" + }, + { + "path": "skills/pptx/scripts/thumbnail.py", + "sha256": "c21fd950b6ada7bd2f029885d3e56bc66b7ff061cc8404c492eb301664aa9e5d" + }, + { + "path": "skills/pptx/scripts/rearrange.py", + "sha256": "c04ac37916f398ba621b2d9e1e4c1a69225eaad6d7fb0ad116c237ddeb1b2b68" + }, + { + "path": "skills/pptx/scripts/inventory.py", + "sha256": "adead8fe6270e520c397cec9fbee4d606ab10bb80f749e018b42ec894c60d2e5" + }, + { + "path": "skills/pptx/scripts/replace.py", + "sha256": "8a590747551be847a904e3296fb2f35aa4e7feeb4970a61596c2375306462820" + }, + { + "path": "skills/docx/ooxml.md", + "sha256": "93fc077b48a0d4797d04a19fcf14520fb149289ce8532f6b7b6196eaf4f0714a" + }, + { + "path": "skills/docx/docx-js.md", + "sha256": "83b4a2f88d058a10509fbc0b3b12b6933c407805f4d4afc955cd3fb939c16428" + }, + { + "path": "skills/docx/SKILL.md", + "sha256": "278d10606659283366935c77d51852a6c75cbb08dfccb8de876ac8b2d7b6957d" + }, + { + "path": "skills/docx/LICENSE.txt", + "sha256": "79f6d8f5b427252fa3b1c11ecdbdb6bf610b944f7530b4de78f770f38741cfaa" + }, + { + "path": "skills/docx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd", + "sha256": "842e7163409c8d74f4d7088a8bc99500d80bc75332681a0980055b08f374a604" + }, + { + "path": "skills/docx/ooxml/schemas/microsoft/wml-2012.xsd", + "sha256": "0fa75578a000439a7988ba0c59fdc69f774bbd416cbacc14d07125b3f686cb74" + }, + { + "path": "skills/docx/ooxml/schemas/microsoft/wml-2010.xsd", + "sha256": "568b26ee156cb9549aa439ca2158965f77b7c1602b7e0316f40ac6cf586e35f2" + }, + { + "path": "skills/docx/ooxml/schemas/microsoft/wml-cid-2016.xsd", + "sha256": "127ca209fa73d7cb708449cb355c871867948a96e4a74f7bf5811ef62d17991d" + }, + { + "path": "skills/docx/ooxml/schemas/microsoft/wml-symex-2015.xsd", + "sha256": "16f6f8072249f431370723c2cd8974672e0d9c897e00e97dd918079df934871b" + }, + { + "path": "skills/docx/ooxml/schemas/microsoft/wml-cex-2018.xsd", + "sha256": "fddc2b880cabb9005aebbc7e783e53c19fec1c03df7d0e2f2076a33a0fdfd081" + }, + { + "path": "skills/docx/ooxml/schemas/microsoft/wml-2018.xsd", + "sha256": "be0ff793a22dd31384650c3a4da14c2fa8062751c2e97b0e5ee852bda13c60ad" + }, + { + "path": "skills/docx/ooxml/schemas/mce/mc.xsd", + "sha256": "3a37e461ecf5a8670fdec34029703401f8728ab9c96ec1739a6ae58d55212413" + }, + { + "path": "skills/docx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd", + "sha256": "451958454e8588dfc7cd945981ada142ca06ff3307937f5700df059c2b307fa8" + }, + { + "path": "skills/docx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd", + "sha256": "f565adfef5a502044abc3a9153e157edc25af78304d335994afb958874b15e26" + }, + { + "path": "skills/docx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd", + "sha256": "9e0b7209fc69ab11987900404540969976000c5ebe4d4f58c43dc3842886bf3a" + }, + { + "path": "skills/docx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd", + "sha256": "6de111e11403f7cd49027400755bae0ea1cabef2815f09bd40a24f0017613b24" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd", + "sha256": "133c9f64a5c5d573b78d0a474122b22506d8eadb5e063f67cdbbb8fa2f161d0e" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd", + "sha256": "585bedc1313b40888dcc544cb74cd939a105ee674f3b1d3aa1cc6d34f70ff155" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd", + "sha256": "0d103b99a4a8652f8871552a69d42d2a3760ac6a5e3ef02d979c4273257ff6a4" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd", + "sha256": "d173c3e5d61e42e2e3a97226c632fd2ab7cc481fc4e492365b87024ab546daff" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd", + "sha256": "5cb76dabd8b97d1e9308a1700b90c20139be4d50792d21a7f09789f5cccd6026" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd", + "sha256": "41b93bd8857cc68b1e43be2806a872d736a9bdd6566900062d8fdb57d7bbb354" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd", + "sha256": "3fd0586f2637b98bb9886f0e0b67d89e1cc987c2d158cc7deb5f5b9890ced412" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd", + "sha256": "beffeed56945c22a77440122c8bdc426f3fcbe7f3b12ea0976c770d1f8d54578" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd", + "sha256": "6bdeb169c3717eb01108853bd9fc5a3750fb1fa5b82abbdd854d49855a40f519" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd", + "sha256": "c2dd9f61f892deae6acd8d20771ea79b12018af25f3bf8d06639c8542d218cfd" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd", + "sha256": "5d389d42befbebd91945d620242347caecd3367f9a3a7cf8d97949507ae1f53c" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd", + "sha256": "29b254ee0d10414a8504b5a08149c7baec35a60d5ff607d6b3f492aa36815f40" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd", + "sha256": "5375417f0f5394b8dd1a7035b9679151f19a6b65df309dec10cfb4a420cb00e9" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd", + "sha256": "9c085407751b9061c1f996f6c39ce58451be22a8d334f09175f0e89e42736285" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd", + "sha256": "b4532b6d258832953fbb3ee4c711f4fe25d3faf46a10644b2505f17010d01e88" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd", + "sha256": "e2abacbb9a55ce1365f8961bc1b1395bbc811e512b111000d8c333f98458dece" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd", + "sha256": "bdad416b096b61d37b71603b2c949484f9070c830bdaeba93bf35e15c8900614" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd", + "sha256": "475dcae1e7d1ea46232db6f8481040c15e53a52a3c256831d3df204212b0e831" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd", + "sha256": "0b364451dc36a48dd6dae0f3b6ada05fd9b71e5208211f8ee5537d7e51a587e2" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd", + "sha256": "bc92e36ccd233722d4c5869bec71ddc7b12e2df56059942cce5a39065cc9c368" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd", + "sha256": "f5ee623b08b6a66935e5aced2f5d8ad0fc71bf9e8e833cd490150c0fa94b8763" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd", + "sha256": "a539aa2fb154fa50e0f5cc97e6ad7cbc66f8ec3e3746f61ec6a8b0d5d15ecdf2" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd", + "sha256": "12264f3c03d738311cd9237d212f1c07479e70f0cbe1ae725d29b36539aef637" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd", + "sha256": "0ef4bb354ff44b923564c4ddbdda5987919d220225129ec94614a618ceafc281" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd", + "sha256": "7b5b7413e2c895b1e148e82e292a117d53c7ec65b0696c992edca57b61b4a74b" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd", + "sha256": "3213ef1631606250f5010b42cad7ef716f7c59426367798e33c374c0ec391d3a" + }, + { + "path": "skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd", + "sha256": "3c6709101c6aaa82888df5d8795c33f9e857196790eb320d9194e64be2b6bdd8" + }, + { + "path": "skills/docx/ooxml/scripts/pack.py", + "sha256": "6fe762f45aff8c63fd95b9fcb1337b28921d6fa454e18a0e8158d4c8708d6d00" + }, + { + "path": "skills/docx/ooxml/scripts/validate.py", + "sha256": "1ec252de8b14b07d16966c48906ccb1c45c68bcd23557ad31d8c50a27f5f8c0f" + }, + { + "path": "skills/docx/ooxml/scripts/unpack.py", + "sha256": "0bd17f76a1a4c388aba42c6d1d39015fa84e405c3e0692397fe12762bd632b58" + }, + { + "path": "skills/docx/ooxml/scripts/validation/docx.py", + "sha256": "e65d6cda0525866a24cc847b2e883bd2416ae6f87b3f5b9e2784dfbb0ec13093" + }, + { + "path": "skills/docx/ooxml/scripts/validation/__init__.py", + "sha256": "83e0f035c5abea238d3f2c3968afbd511ed022b527b7c9cb60a9434cc34ff987" + }, + { + "path": "skills/docx/ooxml/scripts/validation/redlining.py", + "sha256": "97abfdff4f08f43f9a4bb5c8a2f8fd483398b5b339592724e8635153b5507967" + }, + { + "path": "skills/docx/ooxml/scripts/validation/pptx.py", + "sha256": "00bf2623da1177b3948143a4ade2f1cda7cb389dee31960861913fa42ef1b00f" + }, + { + "path": "skills/docx/ooxml/scripts/validation/base.py", + "sha256": "f2c70d481613456e32b43869d1604b05c236c8da34b5b3967677a661cac7ba63" + }, + { + "path": "skills/docx/scripts/__init__.py", + "sha256": "83e262a425814b72add701272b99ddcf9635251c5d4672bf9fc38d2b03f00d85" + }, + { + "path": "skills/docx/scripts/document.py", + "sha256": "6ada306126560e5e4090d45473a3620b63408d68885554afc95766e983c0afe1" + }, + { + "path": "skills/docx/scripts/utilities.py", + "sha256": "62a4b689056501b91e2df2d1f4e6335818e421c7390e48050717ea8f461a0ed0" + }, + { + "path": "skills/docx/scripts/templates/comments.xml", + "sha256": "87e218a3a295016ec855f2cd74495c416072f29c4846e86b527aec0a4d93ba21" + }, + { + "path": "skills/docx/scripts/templates/commentsExtensible.xml", + "sha256": "af5d057e16462ca172cea845e502bafb4f3e1b474a8d5848ffe92214853a4935" + }, + { + "path": "skills/docx/scripts/templates/commentsExtended.xml", + "sha256": "86bf401354c111102033ed147763faccb82479598f17777a3384c2f3e9fa0014" + }, + { + "path": "skills/docx/scripts/templates/commentsIds.xml", + "sha256": "20168f7b237af091332f8348c548eb7f755f583185bb198359c5978155099d67" + }, + { + "path": "skills/docx/scripts/templates/people.xml", + "sha256": "61db9900b579acd4c4f84ff7f40df47e77e9e780c40d5f5ef6a7beba41d62ec5" + } + ], + "dirSha256": "47eca301dcb8f3011a3efdb315bab412e3f8bbc7ab82f7d0c78ada7f3f68e204" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/docx/LICENSE.txt b/skills/docx/LICENSE.txt new file mode 100644 index 0000000..c55ab42 --- /dev/null +++ b/skills/docx/LICENSE.txt @@ -0,0 +1,30 @@ +© 2025 Anthropic, PBC. All rights reserved. + +LICENSE: Use of these materials (including all code, prompts, assets, files, +and other components of this Skill) is governed by your agreement with +Anthropic regarding use of Anthropic's services. If no separate agreement +exists, use is governed by Anthropic's Consumer Terms of Service or +Commercial Terms of Service, as applicable: +https://www.anthropic.com/legal/consumer-terms +https://www.anthropic.com/legal/commercial-terms +Your applicable agreement is referred to as the "Agreement." "Services" are +as defined in the Agreement. + +ADDITIONAL RESTRICTIONS: Notwithstanding anything in the Agreement to the +contrary, users may not: + +- Extract these materials from the Services or retain copies of these + materials outside the Services +- Reproduce or copy these materials, except for temporary copies created + automatically during authorized use of the Services +- Create derivative works based on these materials +- Distribute, sublicense, or transfer these materials to any third party +- Make, offer to sell, sell, or import any inventions embodied in these + materials +- Reverse engineer, decompile, or disassemble these materials + +The receipt, viewing, or possession of these materials does not convey or +imply any license or right beyond those expressly granted above. + +Anthropic retains all right, title, and interest in these materials, +including all copyrights, patents, and other intellectual property rights. diff --git a/skills/docx/SKILL.md b/skills/docx/SKILL.md new file mode 100644 index 0000000..8c97e43 --- /dev/null +++ b/skills/docx/SKILL.md @@ -0,0 +1,197 @@ +--- +name: docx +description: "Comprehensive document creation, editing, and analysis with support for tracked changes, comments, formatting preservation, and text extraction. When Claude needs to work with professional documents (.docx files) for: (1) Creating new documents, (2) Modifying or editing content, (3) Working with tracked changes, (4) Adding comments, or any other document tasks" +license: Proprietary. LICENSE.txt has complete terms +--- + +# DOCX creation, editing, and analysis + +## Overview + +A user may ask you to create, edit, or analyze the contents of a .docx file. A .docx file is essentially a ZIP archive containing XML files and other resources that you can read or edit. You have different tools and workflows available for different tasks. + +## Workflow Decision Tree + +### Reading/Analyzing Content +Use "Text extraction" or "Raw XML access" sections below + +### Creating New Document +Use "Creating a new Word document" workflow + +### Editing Existing Document +- **Your own document + simple changes** + Use "Basic OOXML editing" workflow + +- **Someone else's document** + Use **"Redlining workflow"** (recommended default) + +- **Legal, academic, business, or government docs** + Use **"Redlining workflow"** (required) + +## Reading and analyzing content + +### Text extraction +If you just need to read the text contents of a document, you should convert the document to markdown using pandoc. Pandoc provides excellent support for preserving document structure and can show tracked changes: + +```bash +# Convert document to markdown with tracked changes +pandoc --track-changes=all path-to-file.docx -o output.md +# Options: --track-changes=accept/reject/all +``` + +### Raw XML access +You need raw XML access for: comments, complex formatting, document structure, embedded media, and metadata. For any of these features, you'll need to unpack a document and read its raw XML contents. + +#### Unpacking a file +`python ooxml/scripts/unpack.py ` + +#### Key file structures +* `word/document.xml` - Main document contents +* `word/comments.xml` - Comments referenced in document.xml +* `word/media/` - Embedded images and media files +* Tracked changes use `` (insertions) and `` (deletions) tags + +## Creating a new Word document + +When creating a new Word document from scratch, use **docx-js**, which allows you to create Word documents using JavaScript/TypeScript. + +### Workflow +1. **MANDATORY - READ ENTIRE FILE**: Read [`docx-js.md`](docx-js.md) (~500 lines) completely from start to finish. **NEVER set any range limits when reading this file.** Read the full file content for detailed syntax, critical formatting rules, and best practices before proceeding with document creation. +2. Create a JavaScript/TypeScript file using Document, Paragraph, TextRun components (You can assume all dependencies are installed, but if not, refer to the dependencies section below) +3. Export as .docx using Packer.toBuffer() + +## Editing an existing Word document + +When editing an existing Word document, use the **Document library** (a Python library for OOXML manipulation). The library automatically handles infrastructure setup and provides methods for document manipulation. For complex scenarios, you can access the underlying DOM directly through the library. + +### Workflow +1. **MANDATORY - READ ENTIRE FILE**: Read [`ooxml.md`](ooxml.md) (~600 lines) completely from start to finish. **NEVER set any range limits when reading this file.** Read the full file content for the Document library API and XML patterns for directly editing document files. +2. Unpack the document: `python ooxml/scripts/unpack.py ` +3. Create and run a Python script using the Document library (set PYTHONPATH per "Initialization" section in ooxml.md) +4. Pack the final document: `python ooxml/scripts/pack.py ` + +The Document library provides both high-level methods for common operations and direct DOM access for complex scenarios. + +## Redlining workflow for document review + +This workflow allows you to plan comprehensive tracked changes using markdown before implementing them in OOXML. **CRITICAL**: For complete tracked changes, you must implement ALL changes systematically. + +**Batching Strategy**: Group related changes into batches of 3-10 changes. This makes debugging manageable while maintaining efficiency. Test each batch before moving to the next. + +**Principle: Minimal, Precise Edits** +When implementing tracked changes, only mark text that actually changes. Repeating unchanged text makes edits harder to review and appears unprofessional. Break replacements into: [unchanged text] + [deletion] + [insertion] + [unchanged text]. Preserve the original run's RSID for unchanged text by extracting the `` element from the original and reusing it. + +Example - Changing "30 days" to "60 days" in a sentence: +```python +# BAD - Replaces entire sentence +'The term is 30 days.The term is 60 days.' + +# GOOD - Only marks what changed, preserves original for unchanged text +'The term is 3060 days.' +``` + +### Tracked changes workflow + +1. **Get markdown representation**: Convert document to markdown with tracked changes preserved: + ```bash + pandoc --track-changes=all path-to-file.docx -o current.md + ``` + +2. **Identify and group changes**: Review the document and identify ALL changes needed, organizing them into logical batches: + + **Location methods** (for finding changes in XML): + - Section/heading numbers (e.g., "Section 3.2", "Article IV") + - Paragraph identifiers if numbered + - Grep patterns with unique surrounding text + - Document structure (e.g., "first paragraph", "signature block") + - **DO NOT use markdown line numbers** - they don't map to XML structure + + **Batch organization** (group 3-10 related changes per batch): + - By section: "Batch 1: Section 2 amendments", "Batch 2: Section 5 updates" + - By type: "Batch 1: Date corrections", "Batch 2: Party name changes" + - By complexity: Start with simple text replacements, then tackle complex structural changes + - Sequential: "Batch 1: Pages 1-3", "Batch 2: Pages 4-6" + +3. **Read documentation and unpack**: + - **MANDATORY - READ ENTIRE FILE**: Read [`ooxml.md`](ooxml.md) (~600 lines) completely from start to finish. **NEVER set any range limits when reading this file.** Read the full file content for the Document library API and XML patterns for directly editing document files. + - **Unpack the document**: `python ooxml/scripts/unpack.py ` + - **Note the suggested RSID**: The unpack script will suggest an RSID to use for your tracked changes. Copy this RSID for use in step 4b. + +4. **Implement changes in batches**: Group changes logically (by section, by type, or by proximity) and implement them together in a single script. This approach: + - Makes debugging easier (smaller batch = easier to isolate errors) + - Allows incremental progress + - Maintains efficiency (batch size of 3-10 changes works well) + + **Suggested batch groupings:** + - By document section (e.g., "Section 3 changes", "Definitions", "Termination clause") + - By change type (e.g., "Date changes", "Party name updates", "Legal term replacements") + - By proximity (e.g., "Changes on pages 1-3", "Changes in first half of document") + + For each batch of related changes: + + **a. Map text to XML**: Grep for text in `word/document.xml` to verify how text is split across `` elements. + + **b. Create and run script**: Set PYTHONPATH and import Document library (see "Initialization" in ooxml.md), then use `get_node` to find nodes, implement changes, and `doc.save()`. See **"Document Library"** section in ooxml.md for patterns. + + **Note**: Always grep `word/document.xml` immediately before writing a script to get current line numbers and verify text content. Line numbers change after each script run. + +5. **Pack the document**: After all batches are complete, convert the unpacked directory back to .docx: + ```bash + python ooxml/scripts/pack.py unpacked reviewed-document.docx + ``` + +6. **Final verification**: Do a comprehensive check of the complete document: + - Convert final document to markdown: + ```bash + pandoc --track-changes=all reviewed-document.docx -o verification.md + ``` + - Verify ALL changes were applied correctly: + ```bash + grep "original phrase" verification.md # Should NOT find it + grep "replacement phrase" verification.md # Should find it + ``` + - Check that no unintended changes were introduced + + +## Converting Documents to Images + +To visually analyze Word documents, convert them to images using a two-step process: + +1. **Convert DOCX to PDF**: + ```bash + soffice --headless --convert-to pdf document.docx + ``` + +2. **Convert PDF pages to JPEG images**: + ```bash + pdftoppm -jpeg -r 150 document.pdf page + ``` + This creates files like `page-1.jpg`, `page-2.jpg`, etc. + +Options: +- `-r 150`: Sets resolution to 150 DPI (adjust for quality/size balance) +- `-jpeg`: Output JPEG format (use `-png` for PNG if preferred) +- `-f N`: First page to convert (e.g., `-f 2` starts from page 2) +- `-l N`: Last page to convert (e.g., `-l 5` stops at page 5) +- `page`: Prefix for output files + +Example for specific range: +```bash +pdftoppm -jpeg -r 150 -f 2 -l 5 document.pdf page # Converts only pages 2-5 +``` + +## Code Style Guidelines +**IMPORTANT**: When generating code for DOCX operations: +- Write concise code +- Avoid verbose variable names and redundant operations +- Avoid unnecessary print statements + +## Dependencies + +Required dependencies (install if not available): + +- **pandoc**: `sudo apt-get install pandoc` (for text extraction) +- **docx**: `npm install -g docx` (for creating new documents) +- **LibreOffice**: `sudo apt-get install libreoffice` (for PDF conversion) +- **Poppler**: `sudo apt-get install poppler-utils` (for pdftoppm to convert PDF to images) +- **defusedxml**: `pip install defusedxml` (for secure XML parsing) \ No newline at end of file diff --git a/skills/docx/docx-js.md b/skills/docx/docx-js.md new file mode 100644 index 0000000..c6d7b2d --- /dev/null +++ b/skills/docx/docx-js.md @@ -0,0 +1,350 @@ +# DOCX Library Tutorial + +Generate .docx files with JavaScript/TypeScript. + +**Important: Read this entire document before starting.** Critical formatting rules and common pitfalls are covered throughout - skipping sections may result in corrupted files or rendering issues. + +## Setup +Assumes docx is already installed globally +If not installed: `npm install -g docx` + +```javascript +const { Document, Packer, Paragraph, TextRun, Table, TableRow, TableCell, ImageRun, Media, + Header, Footer, AlignmentType, PageOrientation, LevelFormat, ExternalHyperlink, + InternalHyperlink, TableOfContents, HeadingLevel, BorderStyle, WidthType, TabStopType, + TabStopPosition, UnderlineType, ShadingType, VerticalAlign, SymbolRun, PageNumber, + FootnoteReferenceRun, Footnote, PageBreak } = require('docx'); + +// Create & Save +const doc = new Document({ sections: [{ children: [/* content */] }] }); +Packer.toBuffer(doc).then(buffer => fs.writeFileSync("doc.docx", buffer)); // Node.js +Packer.toBlob(doc).then(blob => { /* download logic */ }); // Browser +``` + +## Text & Formatting +```javascript +// IMPORTANT: Never use \n for line breaks - always use separate Paragraph elements +// ❌ WRONG: new TextRun("Line 1\nLine 2") +// ✅ CORRECT: new Paragraph({ children: [new TextRun("Line 1")] }), new Paragraph({ children: [new TextRun("Line 2")] }) + +// Basic text with all formatting options +new Paragraph({ + alignment: AlignmentType.CENTER, + spacing: { before: 200, after: 200 }, + indent: { left: 720, right: 720 }, + children: [ + new TextRun({ text: "Bold", bold: true }), + new TextRun({ text: "Italic", italics: true }), + new TextRun({ text: "Underlined", underline: { type: UnderlineType.DOUBLE, color: "FF0000" } }), + new TextRun({ text: "Colored", color: "FF0000", size: 28, font: "Arial" }), // Arial default + new TextRun({ text: "Highlighted", highlight: "yellow" }), + new TextRun({ text: "Strikethrough", strike: true }), + new TextRun({ text: "x2", superScript: true }), + new TextRun({ text: "H2O", subScript: true }), + new TextRun({ text: "SMALL CAPS", smallCaps: true }), + new SymbolRun({ char: "2022", font: "Symbol" }), // Bullet • + new SymbolRun({ char: "00A9", font: "Arial" }) // Copyright © - Arial for symbols + ] +}) +``` + +## Styles & Professional Formatting + +```javascript +const doc = new Document({ + styles: { + default: { document: { run: { font: "Arial", size: 24 } } }, // 12pt default + paragraphStyles: [ + // Document title style - override built-in Title style + { id: "Title", name: "Title", basedOn: "Normal", + run: { size: 56, bold: true, color: "000000", font: "Arial" }, + paragraph: { spacing: { before: 240, after: 120 }, alignment: AlignmentType.CENTER } }, + // IMPORTANT: Override built-in heading styles by using their exact IDs + { id: "Heading1", name: "Heading 1", basedOn: "Normal", next: "Normal", quickFormat: true, + run: { size: 32, bold: true, color: "000000", font: "Arial" }, // 16pt + paragraph: { spacing: { before: 240, after: 240 }, outlineLevel: 0 } }, // Required for TOC + { id: "Heading2", name: "Heading 2", basedOn: "Normal", next: "Normal", quickFormat: true, + run: { size: 28, bold: true, color: "000000", font: "Arial" }, // 14pt + paragraph: { spacing: { before: 180, after: 180 }, outlineLevel: 1 } }, + // Custom styles use your own IDs + { id: "myStyle", name: "My Style", basedOn: "Normal", + run: { size: 28, bold: true, color: "000000" }, + paragraph: { spacing: { after: 120 }, alignment: AlignmentType.CENTER } } + ], + characterStyles: [{ id: "myCharStyle", name: "My Char Style", + run: { color: "FF0000", bold: true, underline: { type: UnderlineType.SINGLE } } }] + }, + sections: [{ + properties: { page: { margin: { top: 1440, right: 1440, bottom: 1440, left: 1440 } } }, + children: [ + new Paragraph({ heading: HeadingLevel.TITLE, children: [new TextRun("Document Title")] }), // Uses overridden Title style + new Paragraph({ heading: HeadingLevel.HEADING_1, children: [new TextRun("Heading 1")] }), // Uses overridden Heading1 style + new Paragraph({ style: "myStyle", children: [new TextRun("Custom paragraph style")] }), + new Paragraph({ children: [ + new TextRun("Normal with "), + new TextRun({ text: "custom char style", style: "myCharStyle" }) + ]}) + ] + }] +}); +``` + +**Professional Font Combinations:** +- **Arial (Headers) + Arial (Body)** - Most universally supported, clean and professional +- **Times New Roman (Headers) + Arial (Body)** - Classic serif headers with modern sans-serif body +- **Georgia (Headers) + Verdana (Body)** - Optimized for screen reading, elegant contrast + +**Key Styling Principles:** +- **Override built-in styles**: Use exact IDs like "Heading1", "Heading2", "Heading3" to override Word's built-in heading styles +- **HeadingLevel constants**: `HeadingLevel.HEADING_1` uses "Heading1" style, `HeadingLevel.HEADING_2` uses "Heading2" style, etc. +- **Include outlineLevel**: Set `outlineLevel: 0` for H1, `outlineLevel: 1` for H2, etc. to ensure TOC works correctly +- **Use custom styles** instead of inline formatting for consistency +- **Set a default font** using `styles.default.document.run.font` - Arial is universally supported +- **Establish visual hierarchy** with different font sizes (titles > headers > body) +- **Add proper spacing** with `before` and `after` paragraph spacing +- **Use colors sparingly**: Default to black (000000) and shades of gray for titles and headings (heading 1, heading 2, etc.) +- **Set consistent margins** (1440 = 1 inch is standard) + + +## Lists (ALWAYS USE PROPER LISTS - NEVER USE UNICODE BULLETS) +```javascript +// Bullets - ALWAYS use the numbering config, NOT unicode symbols +// CRITICAL: Use LevelFormat.BULLET constant, NOT the string "bullet" +const doc = new Document({ + numbering: { + config: [ + { reference: "bullet-list", + levels: [{ level: 0, format: LevelFormat.BULLET, text: "•", alignment: AlignmentType.LEFT, + style: { paragraph: { indent: { left: 720, hanging: 360 } } } }] }, + { reference: "first-numbered-list", + levels: [{ level: 0, format: LevelFormat.DECIMAL, text: "%1.", alignment: AlignmentType.LEFT, + style: { paragraph: { indent: { left: 720, hanging: 360 } } } }] }, + { reference: "second-numbered-list", // Different reference = restarts at 1 + levels: [{ level: 0, format: LevelFormat.DECIMAL, text: "%1.", alignment: AlignmentType.LEFT, + style: { paragraph: { indent: { left: 720, hanging: 360 } } } }] } + ] + }, + sections: [{ + children: [ + // Bullet list items + new Paragraph({ numbering: { reference: "bullet-list", level: 0 }, + children: [new TextRun("First bullet point")] }), + new Paragraph({ numbering: { reference: "bullet-list", level: 0 }, + children: [new TextRun("Second bullet point")] }), + // Numbered list items + new Paragraph({ numbering: { reference: "first-numbered-list", level: 0 }, + children: [new TextRun("First numbered item")] }), + new Paragraph({ numbering: { reference: "first-numbered-list", level: 0 }, + children: [new TextRun("Second numbered item")] }), + // ⚠️ CRITICAL: Different reference = INDEPENDENT list that restarts at 1 + // Same reference = CONTINUES previous numbering + new Paragraph({ numbering: { reference: "second-numbered-list", level: 0 }, + children: [new TextRun("Starts at 1 again (because different reference)")] }) + ] + }] +}); + +// ⚠️ CRITICAL NUMBERING RULE: Each reference creates an INDEPENDENT numbered list +// - Same reference = continues numbering (1, 2, 3... then 4, 5, 6...) +// - Different reference = restarts at 1 (1, 2, 3... then 1, 2, 3...) +// Use unique reference names for each separate numbered section! + +// ⚠️ CRITICAL: NEVER use unicode bullets - they create fake lists that don't work properly +// new TextRun("• Item") // WRONG +// new SymbolRun({ char: "2022" }) // WRONG +// ✅ ALWAYS use numbering config with LevelFormat.BULLET for real Word lists +``` + +## Tables +```javascript +// Complete table with margins, borders, headers, and bullet points +const tableBorder = { style: BorderStyle.SINGLE, size: 1, color: "CCCCCC" }; +const cellBorders = { top: tableBorder, bottom: tableBorder, left: tableBorder, right: tableBorder }; + +new Table({ + columnWidths: [4680, 4680], // ⚠️ CRITICAL: Set column widths at table level - values in DXA (twentieths of a point) + margins: { top: 100, bottom: 100, left: 180, right: 180 }, // Set once for all cells + rows: [ + new TableRow({ + tableHeader: true, + children: [ + new TableCell({ + borders: cellBorders, + width: { size: 4680, type: WidthType.DXA }, // ALSO set width on each cell + // ⚠️ CRITICAL: Always use ShadingType.CLEAR to prevent black backgrounds in Word. + shading: { fill: "D5E8F0", type: ShadingType.CLEAR }, + verticalAlign: VerticalAlign.CENTER, + children: [new Paragraph({ + alignment: AlignmentType.CENTER, + children: [new TextRun({ text: "Header", bold: true, size: 22 })] + })] + }), + new TableCell({ + borders: cellBorders, + width: { size: 4680, type: WidthType.DXA }, // ALSO set width on each cell + shading: { fill: "D5E8F0", type: ShadingType.CLEAR }, + children: [new Paragraph({ + alignment: AlignmentType.CENTER, + children: [new TextRun({ text: "Bullet Points", bold: true, size: 22 })] + })] + }) + ] + }), + new TableRow({ + children: [ + new TableCell({ + borders: cellBorders, + width: { size: 4680, type: WidthType.DXA }, // ALSO set width on each cell + children: [new Paragraph({ children: [new TextRun("Regular data")] })] + }), + new TableCell({ + borders: cellBorders, + width: { size: 4680, type: WidthType.DXA }, // ALSO set width on each cell + children: [ + new Paragraph({ + numbering: { reference: "bullet-list", level: 0 }, + children: [new TextRun("First bullet point")] + }), + new Paragraph({ + numbering: { reference: "bullet-list", level: 0 }, + children: [new TextRun("Second bullet point")] + }) + ] + }) + ] + }) + ] +}) +``` + +**IMPORTANT: Table Width & Borders** +- Use BOTH `columnWidths: [width1, width2, ...]` array AND `width: { size: X, type: WidthType.DXA }` on each cell +- Values in DXA (twentieths of a point): 1440 = 1 inch, Letter usable width = 9360 DXA (with 1" margins) +- Apply borders to individual `TableCell` elements, NOT the `Table` itself + +**Precomputed Column Widths (Letter size with 1" margins = 9360 DXA total):** +- **2 columns:** `columnWidths: [4680, 4680]` (equal width) +- **3 columns:** `columnWidths: [3120, 3120, 3120]` (equal width) + +## Links & Navigation +```javascript +// TOC (requires headings) - CRITICAL: Use HeadingLevel only, NOT custom styles +// ❌ WRONG: new Paragraph({ heading: HeadingLevel.HEADING_1, style: "customHeader", children: [new TextRun("Title")] }) +// ✅ CORRECT: new Paragraph({ heading: HeadingLevel.HEADING_1, children: [new TextRun("Title")] }) +new TableOfContents("Table of Contents", { hyperlink: true, headingStyleRange: "1-3" }), + +// External link +new Paragraph({ + children: [new ExternalHyperlink({ + children: [new TextRun({ text: "Google", style: "Hyperlink" })], + link: "https://www.google.com" + })] +}), + +// Internal link & bookmark +new Paragraph({ + children: [new InternalHyperlink({ + children: [new TextRun({ text: "Go to Section", style: "Hyperlink" })], + anchor: "section1" + })] +}), +new Paragraph({ + children: [new TextRun("Section Content")], + bookmark: { id: "section1", name: "section1" } +}), +``` + +## Images & Media +```javascript +// Basic image with sizing & positioning +// CRITICAL: Always specify 'type' parameter - it's REQUIRED for ImageRun +new Paragraph({ + alignment: AlignmentType.CENTER, + children: [new ImageRun({ + type: "png", // NEW REQUIREMENT: Must specify image type (png, jpg, jpeg, gif, bmp, svg) + data: fs.readFileSync("image.png"), + transformation: { width: 200, height: 150, rotation: 0 }, // rotation in degrees + altText: { title: "Logo", description: "Company logo", name: "Name" } // IMPORTANT: All three fields are required + })] +}) +``` + +## Page Breaks +```javascript +// Manual page break +new Paragraph({ children: [new PageBreak()] }), + +// Page break before paragraph +new Paragraph({ + pageBreakBefore: true, + children: [new TextRun("This starts on a new page")] +}) + +// ⚠️ CRITICAL: NEVER use PageBreak standalone - it will create invalid XML that Word cannot open +// ❌ WRONG: new PageBreak() +// ✅ CORRECT: new Paragraph({ children: [new PageBreak()] }) +``` + +## Headers/Footers & Page Setup +```javascript +const doc = new Document({ + sections: [{ + properties: { + page: { + margin: { top: 1440, right: 1440, bottom: 1440, left: 1440 }, // 1440 = 1 inch + size: { orientation: PageOrientation.LANDSCAPE }, + pageNumbers: { start: 1, formatType: "decimal" } // "upperRoman", "lowerRoman", "upperLetter", "lowerLetter" + } + }, + headers: { + default: new Header({ children: [new Paragraph({ + alignment: AlignmentType.RIGHT, + children: [new TextRun("Header Text")] + })] }) + }, + footers: { + default: new Footer({ children: [new Paragraph({ + alignment: AlignmentType.CENTER, + children: [new TextRun("Page "), new TextRun({ children: [PageNumber.CURRENT] }), new TextRun(" of "), new TextRun({ children: [PageNumber.TOTAL_PAGES] })] + })] }) + }, + children: [/* content */] + }] +}); +``` + +## Tabs +```javascript +new Paragraph({ + tabStops: [ + { type: TabStopType.LEFT, position: TabStopPosition.MAX / 4 }, + { type: TabStopType.CENTER, position: TabStopPosition.MAX / 2 }, + { type: TabStopType.RIGHT, position: TabStopPosition.MAX * 3 / 4 } + ], + children: [new TextRun("Left\tCenter\tRight")] +}) +``` + +## Constants & Quick Reference +- **Underlines:** `SINGLE`, `DOUBLE`, `WAVY`, `DASH` +- **Borders:** `SINGLE`, `DOUBLE`, `DASHED`, `DOTTED` +- **Numbering:** `DECIMAL` (1,2,3), `UPPER_ROMAN` (I,II,III), `LOWER_LETTER` (a,b,c) +- **Tabs:** `LEFT`, `CENTER`, `RIGHT`, `DECIMAL` +- **Symbols:** `"2022"` (•), `"00A9"` (©), `"00AE"` (®), `"2122"` (™), `"00B0"` (°), `"F070"` (✓), `"F0FC"` (✗) + +## Critical Issues & Common Mistakes +- **CRITICAL: PageBreak must ALWAYS be inside a Paragraph** - standalone PageBreak creates invalid XML that Word cannot open +- **ALWAYS use ShadingType.CLEAR for table cell shading** - Never use ShadingType.SOLID (causes black background). +- Measurements in DXA (1440 = 1 inch) | Each table cell needs ≥1 Paragraph | TOC requires HeadingLevel styles only +- **ALWAYS use custom styles** with Arial font for professional appearance and proper visual hierarchy +- **ALWAYS set a default font** using `styles.default.document.run.font` - Arial recommended +- **ALWAYS use columnWidths array for tables** + individual cell widths for compatibility +- **NEVER use unicode symbols for bullets** - always use proper numbering configuration with `LevelFormat.BULLET` constant (NOT the string "bullet") +- **NEVER use \n for line breaks anywhere** - always use separate Paragraph elements for each line +- **ALWAYS use TextRun objects within Paragraph children** - never use text property directly on Paragraph +- **CRITICAL for images**: ImageRun REQUIRES `type` parameter - always specify "png", "jpg", "jpeg", "gif", "bmp", or "svg" +- **CRITICAL for bullets**: Must use `LevelFormat.BULLET` constant, not string "bullet", and include `text: "•"` for the bullet character +- **CRITICAL for numbering**: Each numbering reference creates an INDEPENDENT list. Same reference = continues numbering (1,2,3 then 4,5,6). Different reference = restarts at 1 (1,2,3 then 1,2,3). Use unique reference names for each separate numbered section! +- **CRITICAL for TOC**: When using TableOfContents, headings must use HeadingLevel ONLY - do NOT add custom styles to heading paragraphs or TOC will break +- **Tables**: Set `columnWidths` array + individual cell widths, apply borders to cells not table +- **Set table margins at TABLE level** for consistent cell padding (avoids repetition per cell) \ No newline at end of file diff --git a/skills/docx/ooxml.md b/skills/docx/ooxml.md new file mode 100644 index 0000000..993013f --- /dev/null +++ b/skills/docx/ooxml.md @@ -0,0 +1,632 @@ +# Office Open XML Technical Reference + +**Important: Read this entire document before starting.** This document covers: +- [Technical Guidelines](#technical-guidelines) - Schema compliance rules and validation requirements +- [Document Content Patterns](#document-content-patterns) - XML patterns for headings, lists, tables, formatting, etc. +- [Document Library (Python)](#document-library-python) - Recommended approach for OOXML manipulation with automatic infrastructure setup +- [Tracked Changes (Redlining)](#tracked-changes-redlining) - XML patterns for implementing tracked changes + +## Technical Guidelines + +### Schema Compliance +- **Element ordering in ``**: ``, ``, ``, ``, ``, ``, ``, then `` last +- **Element ordering in ``**: All regular properties (``, ``, ``, ``) must come before `` or ``, which must be last. No properties can follow tracked change elements +- **Whitespace**: Add `xml:space='preserve'` to `` elements with leading/trailing spaces +- **Unicode**: Escape characters in ASCII content: `"` becomes `“` + - **Character encoding reference**: Curly quotes `""` become `“”`, apostrophe `'` becomes `’`, em-dash `—` becomes `—` +- **Tracked changes**: Use `` and `` tags with `w:author="Claude"` outside `` elements + - **Critical**: `` closes with ``, `` closes with `` - never mix + - **RSIDs must be 8-digit hex**: Use values like `00AB1234` (only 0-9, A-F characters) + - **trackRevisions placement**: Add `` after `` in settings.xml +- **Images**: Add to `word/media/`, reference in `document.xml`, set dimensions to prevent overflow + +## Document Content Patterns + +### Basic Structure +```xml + + Text content + +``` + +### Headings and Styles +```xml + + + + + + Document Title + + + + + Section Heading + +``` + +### Text Formatting +```xml + +Bold + +Italic + +Underlined + +Highlighted +``` + +### Lists +```xml + + + + + + + + First item + + + + + + + + + + New list item 1 + + + + + + + + + + + Bullet item + +``` + +### Tables + +**CRITICAL**: When adding rows to existing tables, match the EXACT cell structure of existing rows: +- Count cells in an existing row and match the count exactly +- Check for `` (cell spans multiple columns) and `` (columns after cells) +- Match cell widths (``) from the table's `` definition +- **Match content placement**: Check which cell contains the content in the reference row and place your content in the same cell position (e.g., if label rows have empty first cells with content in second cells, replicate this pattern) +- Examine which columns contain content vs. which are empty - replicate this exact pattern + +```xml + + + + + + + + + + + + Cell 1 + + + + Cell 2 + + + +``` + +### Layout +```xml + + + + + + + + + + + + New Section Title + + + + + + + + + + Centered text + + + + + + + + Monospace text + + + + + + + This text is Courier New + + and this text uses default font + +``` + +## File Updates + +When adding content, update these files: + +**`word/_rels/document.xml.rels`:** +```xml + + +``` + +**`[Content_Types].xml`:** +```xml + + +``` + +### Images +**CRITICAL**: Calculate dimensions to prevent page overflow and maintain aspect ratio. + +```xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +``` + +### Links (Hyperlinks) + +**IMPORTANT**: All hyperlinks (both internal and external) require the Hyperlink style to be defined in styles.xml. Without this style, links will look like regular text instead of blue underlined clickable links. + +**External Links:** +```xml + + + + + Link Text + + + + + +``` + +**Internal Links:** + +```xml + + + + + Link Text + + + + + +Target content + +``` + +**Hyperlink Style (required in styles.xml):** +```xml + + + + + + + + + + +``` + +## Document Library (Python) + +Use the Document class from `scripts/document.py` for all tracked changes and comments. It automatically handles infrastructure setup (people.xml, RSIDs, settings.xml, comment files, relationships, content types). Only use direct XML manipulation for complex scenarios not supported by the library. + +**Working with Unicode and Entities:** +- **Searching**: Both entity notation and Unicode characters work - `contains="“Company"` and `contains="\u201cCompany"` find the same text +- **Replacing**: Use either entities (`“`) or Unicode (`\u201c`) - both work and will be converted appropriately based on the file's encoding (ascii → entities, utf-8 → Unicode) + +### Initialization + +**Set PYTHONPATH to the docx skill root:** + +```bash +# Find the docx skill root (directory containing scripts/ and ooxml/) +find /mnt/skills -name "document.py" -path "*/docx/scripts/*" 2>/dev/null | head -1 +# Example output: /mnt/skills/public/docx/scripts/document.py +# Skill root is: /mnt/skills/public/docx + +# Option 1: Export for entire session +export PYTHONPATH=/mnt/skills/public/docx:$PYTHONPATH + +# Option 2: Inline with script execution +PYTHONPATH=/mnt/skills/public/docx python3 your_script.py +``` + +**In your script**, import normally: +```python +from scripts.document import Document + +# Basic initialization (automatically creates temp copy and sets up infrastructure) +doc = Document('unpacked') + +# Customize author and initials +doc = Document('unpacked', author="John Doe", initials="JD") + +# Enable track revisions mode +doc = Document('unpacked', track_revisions=True) + +# Specify custom RSID (auto-generated if not provided) +doc = Document('unpacked', rsid="07DC5ECB") +``` + +### Creating Tracked Changes + +**CRITICAL**: Only mark text that actually changes. Keep ALL unchanged text outside ``/`` tags. Marking unchanged text makes edits unprofessional and harder to review. + +**Attribute Handling**: The Document class auto-injects attributes (w:id, w:date, w:rsidR, w:rsidDel, w16du:dateUtc, xml:space) into new elements. When preserving unchanged text from the original document, copy the original `` element with its existing attributes to maintain document integrity. + +**Method Selection Guide**: +- **Adding your own changes to regular text**: Use `replace_node()` with ``/`` tags, or `suggest_deletion()` for removing entire `` or `` elements +- **Partially modifying another author's tracked change**: Use `replace_node()` to nest your changes inside their ``/`` +- **Completely rejecting another author's insertion**: Use `revert_insertion()` on the `` element (NOT `suggest_deletion()`) +- **Completely rejecting another author's deletion**: Use `revert_deletion()` on the `` element to restore deleted content using tracked changes + +```python +# Minimal edit - change one word: "The report is monthly" → "The report is quarterly" +# Original: The report is monthly +node = doc["word/document.xml"].get_node(tag="w:r", contains="The report is monthly") +rpr = tags[0].toxml() if (tags := node.getElementsByTagName("w:rPr")) else "" +replacement = f'{rpr}The report is {rpr}monthly{rpr}quarterly' +doc["word/document.xml"].replace_node(node, replacement) + +# Minimal edit - change number: "within 30 days" → "within 45 days" +# Original: within 30 days +node = doc["word/document.xml"].get_node(tag="w:r", contains="within 30 days") +rpr = tags[0].toxml() if (tags := node.getElementsByTagName("w:rPr")) else "" +replacement = f'{rpr}within {rpr}30{rpr}45{rpr} days' +doc["word/document.xml"].replace_node(node, replacement) + +# Complete replacement - preserve formatting even when replacing all text +node = doc["word/document.xml"].get_node(tag="w:r", contains="apple") +rpr = tags[0].toxml() if (tags := node.getElementsByTagName("w:rPr")) else "" +replacement = f'{rpr}apple{rpr}banana orange' +doc["word/document.xml"].replace_node(node, replacement) + +# Insert new content (no attributes needed - auto-injected) +node = doc["word/document.xml"].get_node(tag="w:r", contains="existing text") +doc["word/document.xml"].insert_after(node, 'new text') + +# Partially delete another author's insertion +# Original: quarterly financial report +# Goal: Delete only "financial" to make it "quarterly report" +node = doc["word/document.xml"].get_node(tag="w:ins", attrs={"w:id": "5"}) +# IMPORTANT: Preserve w:author="Jane Smith" on the outer to maintain authorship +replacement = ''' + quarterly + financial + report +''' +doc["word/document.xml"].replace_node(node, replacement) + +# Change part of another author's insertion +# Original: in silence, safe and sound +# Goal: Change "safe and sound" to "soft and unbound" +node = doc["word/document.xml"].get_node(tag="w:ins", attrs={"w:id": "8"}) +replacement = f''' + in silence, + + + soft and unbound + + + safe and sound +''' +doc["word/document.xml"].replace_node(node, replacement) + +# Delete entire run (use only when deleting all content; use replace_node for partial deletions) +node = doc["word/document.xml"].get_node(tag="w:r", contains="text to delete") +doc["word/document.xml"].suggest_deletion(node) + +# Delete entire paragraph (in-place, handles both regular and numbered list paragraphs) +para = doc["word/document.xml"].get_node(tag="w:p", contains="paragraph to delete") +doc["word/document.xml"].suggest_deletion(para) + +# Add new numbered list item +target_para = doc["word/document.xml"].get_node(tag="w:p", contains="existing list item") +pPr = tags[0].toxml() if (tags := target_para.getElementsByTagName("w:pPr")) else "" +new_item = f'{pPr}New item' +tracked_para = doc.suggest_paragraph(new_item) +doc["word/document.xml"].insert_after(target_para, tracked_para) +# Optional: add spacing paragraph before content for better visual separation +# spacing = doc.suggest_paragraph('') +# doc["word/document.xml"].insert_after(target_para, spacing + tracked_para) + +# Add table row with tracked changes (requires 3 levels: row, cell properties, content) +# IMPORTANT: First examine an existing row to match cell count, widths, and content placement +last_row = doc["word/document.xml"].get_node(tag="w:tr", line_number=5000) +new_row = ''' + + + + New Cell + +''' +doc["word/document.xml"].insert_after(last_row, new_row) +``` + +### Adding Comments + +```python +# Add comment spanning two existing tracked changes +# Note: w:id is auto-generated. Only search by w:id if you know it from XML inspection +start_node = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "1"}) +end_node = doc["word/document.xml"].get_node(tag="w:ins", attrs={"w:id": "2"}) +doc.add_comment(start=start_node, end=end_node, text="Explanation of this change") + +# Add comment on a paragraph +para = doc["word/document.xml"].get_node(tag="w:p", contains="paragraph text") +doc.add_comment(start=para, end=para, text="Comment on this paragraph") + +# Add comment on newly created tracked change +# First create the tracked change +node = doc["word/document.xml"].get_node(tag="w:r", contains="old") +new_nodes = doc["word/document.xml"].replace_node( + node, + 'oldnew' +) +# Then add comment on the newly created elements +# new_nodes[0] is the , new_nodes[1] is the +doc.add_comment(start=new_nodes[0], end=new_nodes[1], text="Changed old to new per requirements") + +# Reply to existing comment +doc.reply_to_comment(parent_comment_id=0, text="I agree with this change") +``` + +### Rejecting Tracked Changes + +**IMPORTANT**: Use `revert_insertion()` to reject insertions and `revert_deletion()` to restore deletions using tracked changes. Use `suggest_deletion()` only for regular unmarked content. + +```python +# Reject insertion (wraps it in deletion) +# Use this when another author inserted text that you want to delete +ins = doc["word/document.xml"].get_node(tag="w:ins", attrs={"w:id": "5"}) +nodes = doc["word/document.xml"].revert_insertion(ins) # Returns [ins] + +# Reject deletion (creates insertion to restore deleted content) +# Use this when another author deleted text that you want to restore +del_elem = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "3"}) +nodes = doc["word/document.xml"].revert_deletion(del_elem) # Returns [del_elem, new_ins] + +# Reject all insertions in a paragraph +para = doc["word/document.xml"].get_node(tag="w:p", contains="paragraph text") +nodes = doc["word/document.xml"].revert_insertion(para) # Returns [para] + +# Reject all deletions in a paragraph +para = doc["word/document.xml"].get_node(tag="w:p", contains="paragraph text") +nodes = doc["word/document.xml"].revert_deletion(para) # Returns [para] +``` + +### Inserting Images + +**CRITICAL**: The Document class works with a temporary copy at `doc.unpacked_path`. Always copy images to this temp directory, not the original unpacked folder. + +```python +from PIL import Image +import shutil, os + +# Initialize document first +doc = Document('unpacked') + +# Copy image and calculate full-width dimensions with aspect ratio +media_dir = os.path.join(doc.unpacked_path, 'word/media') +os.makedirs(media_dir, exist_ok=True) +shutil.copy('image.png', os.path.join(media_dir, 'image1.png')) +img = Image.open(os.path.join(media_dir, 'image1.png')) +width_emus = int(6.5 * 914400) # 6.5" usable width, 914400 EMUs/inch +height_emus = int(width_emus * img.size[1] / img.size[0]) + +# Add relationship and content type +rels_editor = doc['word/_rels/document.xml.rels'] +next_rid = rels_editor.get_next_rid() +rels_editor.append_to(rels_editor.dom.documentElement, + f'') +doc['[Content_Types].xml'].append_to(doc['[Content_Types].xml'].dom.documentElement, + '') + +# Insert image +node = doc["word/document.xml"].get_node(tag="w:p", line_number=100) +doc["word/document.xml"].insert_after(node, f''' + + + + + + + + + + + + + + + + + +''') +``` + +### Getting Nodes + +```python +# By text content +node = doc["word/document.xml"].get_node(tag="w:p", contains="specific text") + +# By line range +para = doc["word/document.xml"].get_node(tag="w:p", line_number=range(100, 150)) + +# By attributes +node = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "1"}) + +# By exact line number (must be line number where tag opens) +para = doc["word/document.xml"].get_node(tag="w:p", line_number=42) + +# Combine filters +node = doc["word/document.xml"].get_node(tag="w:r", line_number=range(40, 60), contains="text") + +# Disambiguate when text appears multiple times - add line_number range +node = doc["word/document.xml"].get_node(tag="w:r", contains="Section", line_number=range(2400, 2500)) +``` + +### Saving + +```python +# Save with automatic validation (copies back to original directory) +doc.save() # Validates by default, raises error if validation fails + +# Save to different location +doc.save('modified-unpacked') + +# Skip validation (debugging only - needing this in production indicates XML issues) +doc.save(validate=False) +``` + +### Direct DOM Manipulation + +For complex scenarios not covered by the library: + +```python +# Access any XML file +editor = doc["word/document.xml"] +editor = doc["word/comments.xml"] + +# Direct DOM access (defusedxml.minidom.Document) +node = doc["word/document.xml"].get_node(tag="w:p", line_number=5) +parent = node.parentNode +parent.removeChild(node) +parent.appendChild(node) # Move to end + +# General document manipulation (without tracked changes) +old_node = doc["word/document.xml"].get_node(tag="w:p", contains="original text") +doc["word/document.xml"].replace_node(old_node, "replacement text") + +# Multiple insertions - use return value to maintain order +node = doc["word/document.xml"].get_node(tag="w:r", line_number=100) +nodes = doc["word/document.xml"].insert_after(node, "A") +nodes = doc["word/document.xml"].insert_after(nodes[-1], "B") +nodes = doc["word/document.xml"].insert_after(nodes[-1], "C") +# Results in: original_node, A, B, C +``` + +## Tracked Changes (Redlining) + +**Use the Document class above for all tracked changes.** The patterns below are for reference when constructing replacement XML strings. + +### Validation Rules +The validator checks that the document text matches the original after reverting Claude's changes. This means: +- **NEVER modify text inside another author's `` or `` tags** +- **ALWAYS use nested deletions** to remove another author's insertions +- **Every edit must be properly tracked** with `` or `` tags + +### Tracked Change Patterns + +**CRITICAL RULES**: +1. Never modify the content inside another author's tracked changes. Always use nested deletions. +2. **XML Structure**: Always place `` and `` at paragraph level containing complete `` elements. Never nest inside `` elements - this creates invalid XML that breaks document processing. + +**Text Insertion:** +```xml + + + inserted text + + +``` + +**Text Deletion:** +```xml + + + deleted text + + +``` + +**Deleting Another Author's Insertion (MUST use nested structure):** +```xml + + + + monthly + + + + weekly + +``` + +**Restoring Another Author's Deletion:** +```xml + + + within 30 days + + + within 30 days + +``` \ No newline at end of file diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd new file mode 100644 index 0000000..6454ef9 --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd @@ -0,0 +1,1499 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd new file mode 100644 index 0000000..afa4f46 --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd @@ -0,0 +1,146 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd new file mode 100644 index 0000000..64e66b8 --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd @@ -0,0 +1,1085 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd new file mode 100644 index 0000000..687eea8 --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd @@ -0,0 +1,11 @@ + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd new file mode 100644 index 0000000..6ac81b0 --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd @@ -0,0 +1,3081 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd new file mode 100644 index 0000000..1dbf051 --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd new file mode 100644 index 0000000..f1af17d --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd @@ -0,0 +1,185 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd new file mode 100644 index 0000000..0a185ab --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd @@ -0,0 +1,287 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd new file mode 100644 index 0000000..14ef488 --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd @@ -0,0 +1,1676 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd new file mode 100644 index 0000000..c20f3bf --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd new file mode 100644 index 0000000..ac60252 --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd @@ -0,0 +1,144 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd new file mode 100644 index 0000000..424b8ba --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd @@ -0,0 +1,174 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd new file mode 100644 index 0000000..2bddce2 --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd new file mode 100644 index 0000000..8a8c18b --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd new file mode 100644 index 0000000..5c42706 --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd @@ -0,0 +1,59 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd new file mode 100644 index 0000000..853c341 --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd @@ -0,0 +1,56 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd new file mode 100644 index 0000000..da835ee --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd @@ -0,0 +1,195 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd new file mode 100644 index 0000000..87ad265 --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd @@ -0,0 +1,582 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd new file mode 100644 index 0000000..9e86f1b --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd new file mode 100644 index 0000000..d0be42e --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd @@ -0,0 +1,4439 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd new file mode 100644 index 0000000..8821dd1 --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd @@ -0,0 +1,570 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd new file mode 100644 index 0000000..ca2575c --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd @@ -0,0 +1,509 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd new file mode 100644 index 0000000..dd079e6 --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd @@ -0,0 +1,12 @@ + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd new file mode 100644 index 0000000..3dd6cf6 --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd @@ -0,0 +1,108 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd new file mode 100644 index 0000000..f1041e3 --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd @@ -0,0 +1,96 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd new file mode 100644 index 0000000..9c5b7a6 --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd @@ -0,0 +1,3646 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd new file mode 100644 index 0000000..0f13678 --- /dev/null +++ b/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd @@ -0,0 +1,116 @@ + + + + + + See http://www.w3.org/XML/1998/namespace.html and + http://www.w3.org/TR/REC-xml for information about this namespace. + + This schema document describes the XML namespace, in a form + suitable for import by other schema documents. + + Note that local names in this namespace are intended to be defined + only by the World Wide Web Consortium or its subgroups. The + following names are currently defined in this namespace and should + not be used with conflicting semantics by any Working Group, + specification, or document instance: + + base (as an attribute name): denotes an attribute whose value + provides a URI to be used as the base for interpreting any + relative URIs in the scope of the element on which it + appears; its value is inherited. This name is reserved + by virtue of its definition in the XML Base specification. + + lang (as an attribute name): denotes an attribute whose value + is a language code for the natural language of the content of + any element; its value is inherited. This name is reserved + by virtue of its definition in the XML specification. + + space (as an attribute name): denotes an attribute whose + value is a keyword indicating what whitespace processing + discipline is intended for the content of the element; its + value is inherited. This name is reserved by virtue of its + definition in the XML specification. + + Father (in any context at all): denotes Jon Bosak, the chair of + the original XML Working Group. This name is reserved by + the following decision of the W3C XML Plenary and + XML Coordination groups: + + In appreciation for his vision, leadership and dedication + the W3C XML Plenary on this 10th day of February, 2000 + reserves for Jon Bosak in perpetuity the XML name + xml:Father + + + + + This schema defines attributes and an attribute group + suitable for use by + schemas wishing to allow xml:base, xml:lang or xml:space attributes + on elements they define. + + To enable this, such a schema must import this schema + for the XML namespace, e.g. as follows: + <schema . . .> + . . . + <import namespace="http://www.w3.org/XML/1998/namespace" + schemaLocation="http://www.w3.org/2001/03/xml.xsd"/> + + Subsequently, qualified reference to any of the attributes + or the group defined below will have the desired effect, e.g. + + <type . . .> + . . . + <attributeGroup ref="xml:specialAttrs"/> + + will define a type which will schema-validate an instance + element with any of those attributes + + + + In keeping with the XML Schema WG's standard versioning + policy, this schema document will persist at + http://www.w3.org/2001/03/xml.xsd. + At the date of issue it can also be found at + http://www.w3.org/2001/xml.xsd. + The schema document at that URI may however change in the future, + in order to remain compatible with the latest version of XML Schema + itself. In other words, if the XML Schema namespace changes, the version + of this document at + http://www.w3.org/2001/xml.xsd will change + accordingly; the version at + http://www.w3.org/2001/03/xml.xsd will not change. + + + + + + In due course, we should install the relevant ISO 2- and 3-letter + codes as the enumerated possible values . . . + + + + + + + + + + + + + + + See http://www.w3.org/TR/xmlbase/ for + information about this attribute. + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd b/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd new file mode 100644 index 0000000..a6de9d2 --- /dev/null +++ b/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd @@ -0,0 +1,42 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd b/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd new file mode 100644 index 0000000..10e978b --- /dev/null +++ b/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd @@ -0,0 +1,50 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd b/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd new file mode 100644 index 0000000..4248bf7 --- /dev/null +++ b/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd b/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd new file mode 100644 index 0000000..5649746 --- /dev/null +++ b/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/mce/mc.xsd b/skills/docx/ooxml/schemas/mce/mc.xsd new file mode 100644 index 0000000..ef72545 --- /dev/null +++ b/skills/docx/ooxml/schemas/mce/mc.xsd @@ -0,0 +1,75 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/microsoft/wml-2010.xsd b/skills/docx/ooxml/schemas/microsoft/wml-2010.xsd new file mode 100644 index 0000000..f65f777 --- /dev/null +++ b/skills/docx/ooxml/schemas/microsoft/wml-2010.xsd @@ -0,0 +1,560 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/microsoft/wml-2012.xsd b/skills/docx/ooxml/schemas/microsoft/wml-2012.xsd new file mode 100644 index 0000000..6b00755 --- /dev/null +++ b/skills/docx/ooxml/schemas/microsoft/wml-2012.xsd @@ -0,0 +1,67 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/microsoft/wml-2018.xsd b/skills/docx/ooxml/schemas/microsoft/wml-2018.xsd new file mode 100644 index 0000000..f321d33 --- /dev/null +++ b/skills/docx/ooxml/schemas/microsoft/wml-2018.xsd @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/microsoft/wml-cex-2018.xsd b/skills/docx/ooxml/schemas/microsoft/wml-cex-2018.xsd new file mode 100644 index 0000000..364c6a9 --- /dev/null +++ b/skills/docx/ooxml/schemas/microsoft/wml-cex-2018.xsd @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/microsoft/wml-cid-2016.xsd b/skills/docx/ooxml/schemas/microsoft/wml-cid-2016.xsd new file mode 100644 index 0000000..fed9d15 --- /dev/null +++ b/skills/docx/ooxml/schemas/microsoft/wml-cid-2016.xsd @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/skills/docx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd b/skills/docx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd new file mode 100644 index 0000000..680cf15 --- /dev/null +++ b/skills/docx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd @@ -0,0 +1,4 @@ + + + + diff --git a/skills/docx/ooxml/schemas/microsoft/wml-symex-2015.xsd b/skills/docx/ooxml/schemas/microsoft/wml-symex-2015.xsd new file mode 100644 index 0000000..89ada90 --- /dev/null +++ b/skills/docx/ooxml/schemas/microsoft/wml-symex-2015.xsd @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/skills/docx/ooxml/scripts/pack.py b/skills/docx/ooxml/scripts/pack.py new file mode 100644 index 0000000..68bc088 --- /dev/null +++ b/skills/docx/ooxml/scripts/pack.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +""" +Tool to pack a directory into a .docx, .pptx, or .xlsx file with XML formatting undone. + +Example usage: + python pack.py [--force] +""" + +import argparse +import shutil +import subprocess +import sys +import tempfile +import defusedxml.minidom +import zipfile +from pathlib import Path + + +def main(): + parser = argparse.ArgumentParser(description="Pack a directory into an Office file") + parser.add_argument("input_directory", help="Unpacked Office document directory") + parser.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)") + parser.add_argument("--force", action="store_true", help="Skip validation") + args = parser.parse_args() + + try: + success = pack_document( + args.input_directory, args.output_file, validate=not args.force + ) + + # Show warning if validation was skipped + if args.force: + print("Warning: Skipped validation, file may be corrupt", file=sys.stderr) + # Exit with error if validation failed + elif not success: + print("Contents would produce a corrupt file.", file=sys.stderr) + print("Please validate XML before repacking.", file=sys.stderr) + print("Use --force to skip validation and pack anyway.", file=sys.stderr) + sys.exit(1) + + except ValueError as e: + sys.exit(f"Error: {e}") + + +def pack_document(input_dir, output_file, validate=False): + """Pack a directory into an Office file (.docx/.pptx/.xlsx). + + Args: + input_dir: Path to unpacked Office document directory + output_file: Path to output Office file + validate: If True, validates with soffice (default: False) + + Returns: + bool: True if successful, False if validation failed + """ + input_dir = Path(input_dir) + output_file = Path(output_file) + + if not input_dir.is_dir(): + raise ValueError(f"{input_dir} is not a directory") + if output_file.suffix.lower() not in {".docx", ".pptx", ".xlsx"}: + raise ValueError(f"{output_file} must be a .docx, .pptx, or .xlsx file") + + # Work in temporary directory to avoid modifying original + with tempfile.TemporaryDirectory() as temp_dir: + temp_content_dir = Path(temp_dir) / "content" + shutil.copytree(input_dir, temp_content_dir) + + # Process XML files to remove pretty-printing whitespace + for pattern in ["*.xml", "*.rels"]: + for xml_file in temp_content_dir.rglob(pattern): + condense_xml(xml_file) + + # Create final Office file as zip archive + output_file.parent.mkdir(parents=True, exist_ok=True) + with zipfile.ZipFile(output_file, "w", zipfile.ZIP_DEFLATED) as zf: + for f in temp_content_dir.rglob("*"): + if f.is_file(): + zf.write(f, f.relative_to(temp_content_dir)) + + # Validate if requested + if validate: + if not validate_document(output_file): + output_file.unlink() # Delete the corrupt file + return False + + return True + + +def validate_document(doc_path): + """Validate document by converting to HTML with soffice.""" + # Determine the correct filter based on file extension + match doc_path.suffix.lower(): + case ".docx": + filter_name = "html:HTML" + case ".pptx": + filter_name = "html:impress_html_Export" + case ".xlsx": + filter_name = "html:HTML (StarCalc)" + + with tempfile.TemporaryDirectory() as temp_dir: + try: + result = subprocess.run( + [ + "soffice", + "--headless", + "--convert-to", + filter_name, + "--outdir", + temp_dir, + str(doc_path), + ], + capture_output=True, + timeout=10, + text=True, + ) + if not (Path(temp_dir) / f"{doc_path.stem}.html").exists(): + error_msg = result.stderr.strip() or "Document validation failed" + print(f"Validation error: {error_msg}", file=sys.stderr) + return False + return True + except FileNotFoundError: + print("Warning: soffice not found. Skipping validation.", file=sys.stderr) + return True + except subprocess.TimeoutExpired: + print("Validation error: Timeout during conversion", file=sys.stderr) + return False + except Exception as e: + print(f"Validation error: {e}", file=sys.stderr) + return False + + +def condense_xml(xml_file): + """Strip unnecessary whitespace and remove comments.""" + with open(xml_file, "r", encoding="utf-8") as f: + dom = defusedxml.minidom.parse(f) + + # Process each element to remove whitespace and comments + for element in dom.getElementsByTagName("*"): + # Skip w:t elements and their processing + if element.tagName.endswith(":t"): + continue + + # Remove whitespace-only text nodes and comment nodes + for child in list(element.childNodes): + if ( + child.nodeType == child.TEXT_NODE + and child.nodeValue + and child.nodeValue.strip() == "" + ) or child.nodeType == child.COMMENT_NODE: + element.removeChild(child) + + # Write back the condensed XML + with open(xml_file, "wb") as f: + f.write(dom.toxml(encoding="UTF-8")) + + +if __name__ == "__main__": + main() diff --git a/skills/docx/ooxml/scripts/unpack.py b/skills/docx/ooxml/scripts/unpack.py new file mode 100644 index 0000000..4938798 --- /dev/null +++ b/skills/docx/ooxml/scripts/unpack.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +"""Unpack and format XML contents of Office files (.docx, .pptx, .xlsx)""" + +import random +import sys +import defusedxml.minidom +import zipfile +from pathlib import Path + +# Get command line arguments +assert len(sys.argv) == 3, "Usage: python unpack.py " +input_file, output_dir = sys.argv[1], sys.argv[2] + +# Extract and format +output_path = Path(output_dir) +output_path.mkdir(parents=True, exist_ok=True) +zipfile.ZipFile(input_file).extractall(output_path) + +# Pretty print all XML files +xml_files = list(output_path.rglob("*.xml")) + list(output_path.rglob("*.rels")) +for xml_file in xml_files: + content = xml_file.read_text(encoding="utf-8") + dom = defusedxml.minidom.parseString(content) + xml_file.write_bytes(dom.toprettyxml(indent=" ", encoding="ascii")) + +# For .docx files, suggest an RSID for tracked changes +if input_file.endswith(".docx"): + suggested_rsid = "".join(random.choices("0123456789ABCDEF", k=8)) + print(f"Suggested RSID for edit session: {suggested_rsid}") diff --git a/skills/docx/ooxml/scripts/validate.py b/skills/docx/ooxml/scripts/validate.py new file mode 100644 index 0000000..508c589 --- /dev/null +++ b/skills/docx/ooxml/scripts/validate.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +""" +Command line tool to validate Office document XML files against XSD schemas and tracked changes. + +Usage: + python validate.py --original +""" + +import argparse +import sys +from pathlib import Path + +from validation import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator + + +def main(): + parser = argparse.ArgumentParser(description="Validate Office document XML files") + parser.add_argument( + "unpacked_dir", + help="Path to unpacked Office document directory", + ) + parser.add_argument( + "--original", + required=True, + help="Path to original file (.docx/.pptx/.xlsx)", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Enable verbose output", + ) + args = parser.parse_args() + + # Validate paths + unpacked_dir = Path(args.unpacked_dir) + original_file = Path(args.original) + file_extension = original_file.suffix.lower() + assert unpacked_dir.is_dir(), f"Error: {unpacked_dir} is not a directory" + assert original_file.is_file(), f"Error: {original_file} is not a file" + assert file_extension in [".docx", ".pptx", ".xlsx"], ( + f"Error: {original_file} must be a .docx, .pptx, or .xlsx file" + ) + + # Run validations + match file_extension: + case ".docx": + validators = [DOCXSchemaValidator, RedliningValidator] + case ".pptx": + validators = [PPTXSchemaValidator] + case _: + print(f"Error: Validation not supported for file type {file_extension}") + sys.exit(1) + + # Run validators + success = True + for V in validators: + validator = V(unpacked_dir, original_file, verbose=args.verbose) + if not validator.validate(): + success = False + + if success: + print("All validations PASSED!") + + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/skills/docx/ooxml/scripts/validation/__init__.py b/skills/docx/ooxml/scripts/validation/__init__.py new file mode 100644 index 0000000..db092ec --- /dev/null +++ b/skills/docx/ooxml/scripts/validation/__init__.py @@ -0,0 +1,15 @@ +""" +Validation modules for Word document processing. +""" + +from .base import BaseSchemaValidator +from .docx import DOCXSchemaValidator +from .pptx import PPTXSchemaValidator +from .redlining import RedliningValidator + +__all__ = [ + "BaseSchemaValidator", + "DOCXSchemaValidator", + "PPTXSchemaValidator", + "RedliningValidator", +] diff --git a/skills/docx/ooxml/scripts/validation/base.py b/skills/docx/ooxml/scripts/validation/base.py new file mode 100644 index 0000000..0681b19 --- /dev/null +++ b/skills/docx/ooxml/scripts/validation/base.py @@ -0,0 +1,951 @@ +""" +Base validator with common validation logic for document files. +""" + +import re +from pathlib import Path + +import lxml.etree + + +class BaseSchemaValidator: + """Base validator with common validation logic for document files.""" + + # Elements whose 'id' attributes must be unique within their file + # Format: element_name -> (attribute_name, scope) + # scope can be 'file' (unique within file) or 'global' (unique across all files) + UNIQUE_ID_REQUIREMENTS = { + # Word elements + "comment": ("id", "file"), # Comment IDs in comments.xml + "commentrangestart": ("id", "file"), # Must match comment IDs + "commentrangeend": ("id", "file"), # Must match comment IDs + "bookmarkstart": ("id", "file"), # Bookmark start IDs + "bookmarkend": ("id", "file"), # Bookmark end IDs + # Note: ins and del (track changes) can share IDs when part of same revision + # PowerPoint elements + "sldid": ("id", "file"), # Slide IDs in presentation.xml + "sldmasterid": ("id", "global"), # Slide master IDs must be globally unique + "sldlayoutid": ("id", "global"), # Slide layout IDs must be globally unique + "cm": ("authorid", "file"), # Comment author IDs + # Excel elements + "sheet": ("sheetid", "file"), # Sheet IDs in workbook.xml + "definedname": ("id", "file"), # Named range IDs + # Drawing/Shape elements (all formats) + "cxnsp": ("id", "file"), # Connection shape IDs + "sp": ("id", "file"), # Shape IDs + "pic": ("id", "file"), # Picture IDs + "grpsp": ("id", "file"), # Group shape IDs + } + + # Mapping of element names to expected relationship types + # Subclasses should override this with format-specific mappings + ELEMENT_RELATIONSHIP_TYPES = {} + + # Unified schema mappings for all Office document types + SCHEMA_MAPPINGS = { + # Document type specific schemas + "word": "ISO-IEC29500-4_2016/wml.xsd", # Word documents + "ppt": "ISO-IEC29500-4_2016/pml.xsd", # PowerPoint presentations + "xl": "ISO-IEC29500-4_2016/sml.xsd", # Excel spreadsheets + # Common file types + "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd", + "app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd", + "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd", + "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd", + ".rels": "ecma/fouth-edition/opc-relationships.xsd", + # Word-specific files + "people.xml": "microsoft/wml-2012.xsd", + "commentsIds.xml": "microsoft/wml-cid-2016.xsd", + "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd", + "commentsExtended.xml": "microsoft/wml-2012.xsd", + # Chart files (common across document types) + "chart": "ISO-IEC29500-4_2016/dml-chart.xsd", + # Theme files (common across document types) + "theme": "ISO-IEC29500-4_2016/dml-main.xsd", + # Drawing and media files + "drawing": "ISO-IEC29500-4_2016/dml-main.xsd", + } + + # Unified namespace constants + MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006" + XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace" + + # Common OOXML namespaces used across validators + PACKAGE_RELATIONSHIPS_NAMESPACE = ( + "http://schemas.openxmlformats.org/package/2006/relationships" + ) + OFFICE_RELATIONSHIPS_NAMESPACE = ( + "http://schemas.openxmlformats.org/officeDocument/2006/relationships" + ) + CONTENT_TYPES_NAMESPACE = ( + "http://schemas.openxmlformats.org/package/2006/content-types" + ) + + # Folders where we should clean ignorable namespaces + MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"} + + # All allowed OOXML namespaces (superset of all document types) + OOXML_NAMESPACES = { + "http://schemas.openxmlformats.org/officeDocument/2006/math", + "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + "http://schemas.openxmlformats.org/schemaLibrary/2006/main", + "http://schemas.openxmlformats.org/drawingml/2006/main", + "http://schemas.openxmlformats.org/drawingml/2006/chart", + "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/diagram", + "http://schemas.openxmlformats.org/drawingml/2006/picture", + "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", + "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + "http://schemas.openxmlformats.org/presentationml/2006/main", + "http://schemas.openxmlformats.org/spreadsheetml/2006/main", + "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes", + "http://www.w3.org/XML/1998/namespace", + } + + def __init__(self, unpacked_dir, original_file, verbose=False): + self.unpacked_dir = Path(unpacked_dir).resolve() + self.original_file = Path(original_file) + self.verbose = verbose + + # Set schemas directory + self.schemas_dir = Path(__file__).parent.parent.parent / "schemas" + + # Get all XML and .rels files + patterns = ["*.xml", "*.rels"] + self.xml_files = [ + f for pattern in patterns for f in self.unpacked_dir.rglob(pattern) + ] + + if not self.xml_files: + print(f"Warning: No XML files found in {self.unpacked_dir}") + + def validate(self): + """Run all validation checks and return True if all pass.""" + raise NotImplementedError("Subclasses must implement the validate method") + + def validate_xml(self): + """Validate that all XML files are well-formed.""" + errors = [] + + for xml_file in self.xml_files: + try: + # Try to parse the XML file + lxml.etree.parse(str(xml_file)) + except lxml.etree.XMLSyntaxError as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {e.lineno}: {e.msg}" + ) + except Exception as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Unexpected error: {str(e)}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} XML violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All XML files are well-formed") + return True + + def validate_namespaces(self): + """Validate that namespace prefixes in Ignorable attributes are declared.""" + errors = [] + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + declared = set(root.nsmap.keys()) - {None} # Exclude default namespace + + for attr_val in [ + v for k, v in root.attrib.items() if k.endswith("Ignorable") + ]: + undeclared = set(attr_val.split()) - declared + errors.extend( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Namespace '{ns}' in Ignorable but not declared" + for ns in undeclared + ) + except lxml.etree.XMLSyntaxError: + continue + + if errors: + print(f"FAILED - {len(errors)} namespace issues:") + for error in errors: + print(error) + return False + if self.verbose: + print("PASSED - All namespace prefixes properly declared") + return True + + def validate_unique_ids(self): + """Validate that specific IDs are unique according to OOXML requirements.""" + errors = [] + global_ids = {} # Track globally unique IDs across all files + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + file_ids = {} # Track IDs that must be unique within this file + + # Remove all mc:AlternateContent elements from the tree + mc_elements = root.xpath( + ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE} + ) + for elem in mc_elements: + elem.getparent().remove(elem) + + # Now check IDs in the cleaned tree + for elem in root.iter(): + # Get the element name without namespace + tag = ( + elem.tag.split("}")[-1].lower() + if "}" in elem.tag + else elem.tag.lower() + ) + + # Check if this element type has ID uniqueness requirements + if tag in self.UNIQUE_ID_REQUIREMENTS: + attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag] + + # Look for the specified attribute + id_value = None + for attr, value in elem.attrib.items(): + attr_local = ( + attr.split("}")[-1].lower() + if "}" in attr + else attr.lower() + ) + if attr_local == attr_name: + id_value = value + break + + if id_value is not None: + if scope == "global": + # Check global uniqueness + if id_value in global_ids: + prev_file, prev_line, prev_tag = global_ids[ + id_value + ] + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: Global ID '{id_value}' in <{tag}> " + f"already used in {prev_file} at line {prev_line} in <{prev_tag}>" + ) + else: + global_ids[id_value] = ( + xml_file.relative_to(self.unpacked_dir), + elem.sourceline, + tag, + ) + elif scope == "file": + # Check file-level uniqueness + key = (tag, attr_name) + if key not in file_ids: + file_ids[key] = {} + + if id_value in file_ids[key]: + prev_line = file_ids[key][id_value] + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> " + f"(first occurrence at line {prev_line})" + ) + else: + file_ids[key][id_value] = elem.sourceline + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} ID uniqueness violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All required IDs are unique") + return True + + def validate_file_references(self): + """ + Validate that all .rels files properly reference files and that all files are referenced. + """ + errors = [] + + # Find all .rels files + rels_files = list(self.unpacked_dir.rglob("*.rels")) + + if not rels_files: + if self.verbose: + print("PASSED - No .rels files found") + return True + + # Get all files in the unpacked directory (excluding reference files) + all_files = [] + for file_path in self.unpacked_dir.rglob("*"): + if ( + file_path.is_file() + and file_path.name != "[Content_Types].xml" + and not file_path.name.endswith(".rels") + ): # This file is not referenced by .rels + all_files.append(file_path.resolve()) + + # Track all files that are referenced by any .rels file + all_referenced_files = set() + + if self.verbose: + print( + f"Found {len(rels_files)} .rels files and {len(all_files)} target files" + ) + + # Check each .rels file + for rels_file in rels_files: + try: + # Parse relationships file + rels_root = lxml.etree.parse(str(rels_file)).getroot() + + # Get the directory where this .rels file is located + rels_dir = rels_file.parent + + # Find all relationships and their targets + referenced_files = set() + broken_refs = [] + + for rel in rels_root.findall( + ".//ns:Relationship", + namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE}, + ): + target = rel.get("Target") + if target and not target.startswith( + ("http", "mailto:") + ): # Skip external URLs + # Resolve the target path relative to the .rels file location + if rels_file.name == ".rels": + # Root .rels file - targets are relative to unpacked_dir + target_path = self.unpacked_dir / target + else: + # Other .rels files - targets are relative to their parent's parent + # e.g., word/_rels/document.xml.rels -> targets relative to word/ + base_dir = rels_dir.parent + target_path = base_dir / target + + # Normalize the path and check if it exists + try: + target_path = target_path.resolve() + if target_path.exists() and target_path.is_file(): + referenced_files.add(target_path) + all_referenced_files.add(target_path) + else: + broken_refs.append((target, rel.sourceline)) + except (OSError, ValueError): + broken_refs.append((target, rel.sourceline)) + + # Report broken references + if broken_refs: + rel_path = rels_file.relative_to(self.unpacked_dir) + for broken_ref, line_num in broken_refs: + errors.append( + f" {rel_path}: Line {line_num}: Broken reference to {broken_ref}" + ) + + except Exception as e: + rel_path = rels_file.relative_to(self.unpacked_dir) + errors.append(f" Error parsing {rel_path}: {e}") + + # Check for unreferenced files (files that exist but are not referenced anywhere) + unreferenced_files = set(all_files) - all_referenced_files + + if unreferenced_files: + for unref_file in sorted(unreferenced_files): + unref_rel_path = unref_file.relative_to(self.unpacked_dir) + errors.append(f" Unreferenced file: {unref_rel_path}") + + if errors: + print(f"FAILED - Found {len(errors)} relationship validation errors:") + for error in errors: + print(error) + print( + "CRITICAL: These errors will cause the document to appear corrupt. " + + "Broken references MUST be fixed, " + + "and unreferenced files MUST be referenced or removed." + ) + return False + else: + if self.verbose: + print( + "PASSED - All references are valid and all files are properly referenced" + ) + return True + + def validate_all_relationship_ids(self): + """ + Validate that all r:id attributes in XML files reference existing IDs + in their corresponding .rels files, and optionally validate relationship types. + """ + import lxml.etree + + errors = [] + + # Process each XML file that might contain r:id references + for xml_file in self.xml_files: + # Skip .rels files themselves + if xml_file.suffix == ".rels": + continue + + # Determine the corresponding .rels file + # For dir/file.xml, it's dir/_rels/file.xml.rels + rels_dir = xml_file.parent / "_rels" + rels_file = rels_dir / f"{xml_file.name}.rels" + + # Skip if there's no corresponding .rels file (that's okay) + if not rels_file.exists(): + continue + + try: + # Parse the .rels file to get valid relationship IDs and their types + rels_root = lxml.etree.parse(str(rels_file)).getroot() + rid_to_type = {} + + for rel in rels_root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rid = rel.get("Id") + rel_type = rel.get("Type", "") + if rid: + # Check for duplicate rIds + if rid in rid_to_type: + rels_rel_path = rels_file.relative_to(self.unpacked_dir) + errors.append( + f" {rels_rel_path}: Line {rel.sourceline}: " + f"Duplicate relationship ID '{rid}' (IDs must be unique)" + ) + # Extract just the type name from the full URL + type_name = ( + rel_type.split("/")[-1] if "/" in rel_type else rel_type + ) + rid_to_type[rid] = type_name + + # Parse the XML file to find all r:id references + xml_root = lxml.etree.parse(str(xml_file)).getroot() + + # Find all elements with r:id attributes + for elem in xml_root.iter(): + # Check for r:id attribute (relationship ID) + rid_attr = elem.get(f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id") + if rid_attr: + xml_rel_path = xml_file.relative_to(self.unpacked_dir) + elem_name = ( + elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag + ) + + # Check if the ID exists + if rid_attr not in rid_to_type: + errors.append( + f" {xml_rel_path}: Line {elem.sourceline}: " + f"<{elem_name}> references non-existent relationship '{rid_attr}' " + f"(valid IDs: {', '.join(sorted(rid_to_type.keys())[:5])}{'...' if len(rid_to_type) > 5 else ''})" + ) + # Check if we have type expectations for this element + elif self.ELEMENT_RELATIONSHIP_TYPES: + expected_type = self._get_expected_relationship_type( + elem_name + ) + if expected_type: + actual_type = rid_to_type[rid_attr] + # Check if the actual type matches or contains the expected type + if expected_type not in actual_type.lower(): + errors.append( + f" {xml_rel_path}: Line {elem.sourceline}: " + f"<{elem_name}> references '{rid_attr}' which points to '{actual_type}' " + f"but should point to a '{expected_type}' relationship" + ) + + except Exception as e: + xml_rel_path = xml_file.relative_to(self.unpacked_dir) + errors.append(f" Error processing {xml_rel_path}: {e}") + + if errors: + print(f"FAILED - Found {len(errors)} relationship ID reference errors:") + for error in errors: + print(error) + print("\nThese ID mismatches will cause the document to appear corrupt!") + return False + else: + if self.verbose: + print("PASSED - All relationship ID references are valid") + return True + + def _get_expected_relationship_type(self, element_name): + """ + Get the expected relationship type for an element. + First checks the explicit mapping, then tries pattern detection. + """ + # Normalize element name to lowercase + elem_lower = element_name.lower() + + # Check explicit mapping first + if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES: + return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower] + + # Try pattern detection for common patterns + # Pattern 1: Elements ending in "Id" often expect a relationship of the prefix type + if elem_lower.endswith("id") and len(elem_lower) > 2: + # e.g., "sldId" -> "sld", "sldMasterId" -> "sldMaster" + prefix = elem_lower[:-2] # Remove "id" + # Check if this might be a compound like "sldMasterId" + if prefix.endswith("master"): + return prefix.lower() + elif prefix.endswith("layout"): + return prefix.lower() + else: + # Simple case like "sldId" -> "slide" + # Common transformations + if prefix == "sld": + return "slide" + return prefix.lower() + + # Pattern 2: Elements ending in "Reference" expect a relationship of the prefix type + if elem_lower.endswith("reference") and len(elem_lower) > 9: + prefix = elem_lower[:-9] # Remove "reference" + return prefix.lower() + + return None + + def validate_content_types(self): + """Validate that all content files are properly declared in [Content_Types].xml.""" + errors = [] + + # Find [Content_Types].xml file + content_types_file = self.unpacked_dir / "[Content_Types].xml" + if not content_types_file.exists(): + print("FAILED - [Content_Types].xml file not found") + return False + + try: + # Parse and get all declared parts and extensions + root = lxml.etree.parse(str(content_types_file)).getroot() + declared_parts = set() + declared_extensions = set() + + # Get Override declarations (specific files) + for override in root.findall( + f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override" + ): + part_name = override.get("PartName") + if part_name is not None: + declared_parts.add(part_name.lstrip("/")) + + # Get Default declarations (by extension) + for default in root.findall( + f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default" + ): + extension = default.get("Extension") + if extension is not None: + declared_extensions.add(extension.lower()) + + # Root elements that require content type declaration + declarable_roots = { + "sld", + "sldLayout", + "sldMaster", + "presentation", # PowerPoint + "document", # Word + "workbook", + "worksheet", # Excel + "theme", # Common + } + + # Common media file extensions that should be declared + media_extensions = { + "png": "image/png", + "jpg": "image/jpeg", + "jpeg": "image/jpeg", + "gif": "image/gif", + "bmp": "image/bmp", + "tiff": "image/tiff", + "wmf": "image/x-wmf", + "emf": "image/x-emf", + } + + # Get all files in the unpacked directory + all_files = list(self.unpacked_dir.rglob("*")) + all_files = [f for f in all_files if f.is_file()] + + # Check all XML files for Override declarations + for xml_file in self.xml_files: + path_str = str(xml_file.relative_to(self.unpacked_dir)).replace( + "\\", "/" + ) + + # Skip non-content files + if any( + skip in path_str + for skip in [".rels", "[Content_Types]", "docProps/", "_rels/"] + ): + continue + + try: + root_tag = lxml.etree.parse(str(xml_file)).getroot().tag + root_name = root_tag.split("}")[-1] if "}" in root_tag else root_tag + + if root_name in declarable_roots and path_str not in declared_parts: + errors.append( + f" {path_str}: File with <{root_name}> root not declared in [Content_Types].xml" + ) + + except Exception: + continue # Skip unparseable files + + # Check all non-XML files for Default extension declarations + for file_path in all_files: + # Skip XML files and metadata files (already checked above) + if file_path.suffix.lower() in {".xml", ".rels"}: + continue + if file_path.name == "[Content_Types].xml": + continue + if "_rels" in file_path.parts or "docProps" in file_path.parts: + continue + + extension = file_path.suffix.lstrip(".").lower() + if extension and extension not in declared_extensions: + # Check if it's a known media extension that should be declared + if extension in media_extensions: + relative_path = file_path.relative_to(self.unpacked_dir) + errors.append( + f' {relative_path}: File with extension \'{extension}\' not declared in [Content_Types].xml - should add: ' + ) + + except Exception as e: + errors.append(f" Error parsing [Content_Types].xml: {e}") + + if errors: + print(f"FAILED - Found {len(errors)} content type declaration errors:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print( + "PASSED - All content files are properly declared in [Content_Types].xml" + ) + return True + + def validate_file_against_xsd(self, xml_file, verbose=False): + """Validate a single XML file against XSD schema, comparing with original. + + Args: + xml_file: Path to XML file to validate + verbose: Enable verbose output + + Returns: + tuple: (is_valid, new_errors_set) where is_valid is True/False/None (skipped) + """ + # Resolve both paths to handle symlinks + xml_file = Path(xml_file).resolve() + unpacked_dir = self.unpacked_dir.resolve() + + # Validate current file + is_valid, current_errors = self._validate_single_file_xsd( + xml_file, unpacked_dir + ) + + if is_valid is None: + return None, set() # Skipped + elif is_valid: + return True, set() # Valid, no errors + + # Get errors from original file for this specific file + original_errors = self._get_original_file_errors(xml_file) + + # Compare with original (both are guaranteed to be sets here) + assert current_errors is not None + new_errors = current_errors - original_errors + + if new_errors: + if verbose: + relative_path = xml_file.relative_to(unpacked_dir) + print(f"FAILED - {relative_path}: {len(new_errors)} new error(s)") + for error in list(new_errors)[:3]: + truncated = error[:250] + "..." if len(error) > 250 else error + print(f" - {truncated}") + return False, new_errors + else: + # All errors existed in original + if verbose: + print( + f"PASSED - No new errors (original had {len(current_errors)} errors)" + ) + return True, set() + + def validate_against_xsd(self): + """Validate XML files against XSD schemas, showing only new errors compared to original.""" + new_errors = [] + original_error_count = 0 + valid_count = 0 + skipped_count = 0 + + for xml_file in self.xml_files: + relative_path = str(xml_file.relative_to(self.unpacked_dir)) + is_valid, new_file_errors = self.validate_file_against_xsd( + xml_file, verbose=False + ) + + if is_valid is None: + skipped_count += 1 + continue + elif is_valid and not new_file_errors: + valid_count += 1 + continue + elif is_valid: + # Had errors but all existed in original + original_error_count += 1 + valid_count += 1 + continue + + # Has new errors + new_errors.append(f" {relative_path}: {len(new_file_errors)} new error(s)") + for error in list(new_file_errors)[:3]: # Show first 3 errors + new_errors.append( + f" - {error[:250]}..." if len(error) > 250 else f" - {error}" + ) + + # Print summary + if self.verbose: + print(f"Validated {len(self.xml_files)} files:") + print(f" - Valid: {valid_count}") + print(f" - Skipped (no schema): {skipped_count}") + if original_error_count: + print(f" - With original errors (ignored): {original_error_count}") + print( + f" - With NEW errors: {len(new_errors) > 0 and len([e for e in new_errors if not e.startswith(' ')]) or 0}" + ) + + if new_errors: + print("\nFAILED - Found NEW validation errors:") + for error in new_errors: + print(error) + return False + else: + if self.verbose: + print("\nPASSED - No new XSD validation errors introduced") + return True + + def _get_schema_path(self, xml_file): + """Determine the appropriate schema path for an XML file.""" + # Check exact filename match + if xml_file.name in self.SCHEMA_MAPPINGS: + return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name] + + # Check .rels files + if xml_file.suffix == ".rels": + return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"] + + # Check chart files + if "charts/" in str(xml_file) and xml_file.name.startswith("chart"): + return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"] + + # Check theme files + if "theme/" in str(xml_file) and xml_file.name.startswith("theme"): + return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"] + + # Check if file is in a main content folder and use appropriate schema + if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS: + return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name] + + return None + + def _clean_ignorable_namespaces(self, xml_doc): + """Remove attributes and elements not in allowed namespaces.""" + # Create a clean copy + xml_string = lxml.etree.tostring(xml_doc, encoding="unicode") + xml_copy = lxml.etree.fromstring(xml_string) + + # Remove attributes not in allowed namespaces + for elem in xml_copy.iter(): + attrs_to_remove = [] + + for attr in elem.attrib: + # Check if attribute is from a namespace other than allowed ones + if "{" in attr: + ns = attr.split("}")[0][1:] + if ns not in self.OOXML_NAMESPACES: + attrs_to_remove.append(attr) + + # Remove collected attributes + for attr in attrs_to_remove: + del elem.attrib[attr] + + # Remove elements not in allowed namespaces + self._remove_ignorable_elements(xml_copy) + + return lxml.etree.ElementTree(xml_copy) + + def _remove_ignorable_elements(self, root): + """Recursively remove all elements not in allowed namespaces.""" + elements_to_remove = [] + + # Find elements to remove + for elem in list(root): + # Skip non-element nodes (comments, processing instructions, etc.) + if not hasattr(elem, "tag") or callable(elem.tag): + continue + + tag_str = str(elem.tag) + if tag_str.startswith("{"): + ns = tag_str.split("}")[0][1:] + if ns not in self.OOXML_NAMESPACES: + elements_to_remove.append(elem) + continue + + # Recursively clean child elements + self._remove_ignorable_elements(elem) + + # Remove collected elements + for elem in elements_to_remove: + root.remove(elem) + + def _preprocess_for_mc_ignorable(self, xml_doc): + """Preprocess XML to handle mc:Ignorable attribute properly.""" + # Remove mc:Ignorable attributes before validation + root = xml_doc.getroot() + + # Remove mc:Ignorable attribute from root + if f"{{{self.MC_NAMESPACE}}}Ignorable" in root.attrib: + del root.attrib[f"{{{self.MC_NAMESPACE}}}Ignorable"] + + return xml_doc + + def _validate_single_file_xsd(self, xml_file, base_path): + """Validate a single XML file against XSD schema. Returns (is_valid, errors_set).""" + schema_path = self._get_schema_path(xml_file) + if not schema_path: + return None, None # Skip file + + try: + # Load schema + with open(schema_path, "rb") as xsd_file: + parser = lxml.etree.XMLParser() + xsd_doc = lxml.etree.parse( + xsd_file, parser=parser, base_url=str(schema_path) + ) + schema = lxml.etree.XMLSchema(xsd_doc) + + # Load and preprocess XML + with open(xml_file, "r") as f: + xml_doc = lxml.etree.parse(f) + + xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc) + xml_doc = self._preprocess_for_mc_ignorable(xml_doc) + + # Clean ignorable namespaces if needed + relative_path = xml_file.relative_to(base_path) + if ( + relative_path.parts + and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS + ): + xml_doc = self._clean_ignorable_namespaces(xml_doc) + + # Validate + if schema.validate(xml_doc): + return True, set() + else: + errors = set() + for error in schema.error_log: + # Store normalized error message (without line numbers for comparison) + errors.add(error.message) + return False, errors + + except Exception as e: + return False, {str(e)} + + def _get_original_file_errors(self, xml_file): + """Get XSD validation errors from a single file in the original document. + + Args: + xml_file: Path to the XML file in unpacked_dir to check + + Returns: + set: Set of error messages from the original file + """ + import tempfile + import zipfile + + # Resolve both paths to handle symlinks (e.g., /var vs /private/var on macOS) + xml_file = Path(xml_file).resolve() + unpacked_dir = self.unpacked_dir.resolve() + relative_path = xml_file.relative_to(unpacked_dir) + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Extract original file + with zipfile.ZipFile(self.original_file, "r") as zip_ref: + zip_ref.extractall(temp_path) + + # Find corresponding file in original + original_xml_file = temp_path / relative_path + + if not original_xml_file.exists(): + # File didn't exist in original, so no original errors + return set() + + # Validate the specific file in original + is_valid, errors = self._validate_single_file_xsd( + original_xml_file, temp_path + ) + return errors if errors else set() + + def _remove_template_tags_from_text_nodes(self, xml_doc): + """Remove template tags from XML text nodes and collect warnings. + + Template tags follow the pattern {{ ... }} and are used as placeholders + for content replacement. They should be removed from text content before + XSD validation while preserving XML structure. + + Returns: + tuple: (cleaned_xml_doc, warnings_list) + """ + warnings = [] + template_pattern = re.compile(r"\{\{[^}]*\}\}") + + # Create a copy of the document to avoid modifying the original + xml_string = lxml.etree.tostring(xml_doc, encoding="unicode") + xml_copy = lxml.etree.fromstring(xml_string) + + def process_text_content(text, content_type): + if not text: + return text + matches = list(template_pattern.finditer(text)) + if matches: + for match in matches: + warnings.append( + f"Found template tag in {content_type}: {match.group()}" + ) + return template_pattern.sub("", text) + return text + + # Process all text nodes in the document + for elem in xml_copy.iter(): + # Skip processing if this is a w:t element + if not hasattr(elem, "tag") or callable(elem.tag): + continue + tag_str = str(elem.tag) + if tag_str.endswith("}t") or tag_str == "t": + continue + + elem.text = process_text_content(elem.text, "text content") + elem.tail = process_text_content(elem.tail, "tail content") + + return lxml.etree.ElementTree(xml_copy), warnings + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/skills/docx/ooxml/scripts/validation/docx.py b/skills/docx/ooxml/scripts/validation/docx.py new file mode 100644 index 0000000..602c470 --- /dev/null +++ b/skills/docx/ooxml/scripts/validation/docx.py @@ -0,0 +1,274 @@ +""" +Validator for Word document XML files against XSD schemas. +""" + +import re +import tempfile +import zipfile + +import lxml.etree + +from .base import BaseSchemaValidator + + +class DOCXSchemaValidator(BaseSchemaValidator): + """Validator for Word document XML files against XSD schemas.""" + + # Word-specific namespace + WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + + # Word-specific element to relationship type mappings + # Start with empty mapping - add specific cases as we discover them + ELEMENT_RELATIONSHIP_TYPES = {} + + def validate(self): + """Run all validation checks and return True if all pass.""" + # Test 0: XML well-formedness + if not self.validate_xml(): + return False + + # Test 1: Namespace declarations + all_valid = True + if not self.validate_namespaces(): + all_valid = False + + # Test 2: Unique IDs + if not self.validate_unique_ids(): + all_valid = False + + # Test 3: Relationship and file reference validation + if not self.validate_file_references(): + all_valid = False + + # Test 4: Content type declarations + if not self.validate_content_types(): + all_valid = False + + # Test 5: XSD schema validation + if not self.validate_against_xsd(): + all_valid = False + + # Test 6: Whitespace preservation + if not self.validate_whitespace_preservation(): + all_valid = False + + # Test 7: Deletion validation + if not self.validate_deletions(): + all_valid = False + + # Test 8: Insertion validation + if not self.validate_insertions(): + all_valid = False + + # Test 9: Relationship ID reference validation + if not self.validate_all_relationship_ids(): + all_valid = False + + # Count and compare paragraphs + self.compare_paragraph_counts() + + return all_valid + + def validate_whitespace_preservation(self): + """ + Validate that w:t elements with whitespace have xml:space='preserve'. + """ + errors = [] + + for xml_file in self.xml_files: + # Only check document.xml files + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + # Find all w:t elements + for elem in root.iter(f"{{{self.WORD_2006_NAMESPACE}}}t"): + if elem.text: + text = elem.text + # Check if text starts or ends with whitespace + if re.match(r"^\s.*", text) or re.match(r".*\s$", text): + # Check if xml:space="preserve" attribute exists + xml_space_attr = f"{{{self.XML_NAMESPACE}}}space" + if ( + xml_space_attr not in elem.attrib + or elem.attrib[xml_space_attr] != "preserve" + ): + # Show a preview of the text + text_preview = ( + repr(text)[:50] + "..." + if len(repr(text)) > 50 + else repr(text) + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: w:t element with whitespace missing xml:space='preserve': {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} whitespace preservation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All whitespace is properly preserved") + return True + + def validate_deletions(self): + """ + Validate that w:t elements are not within w:del elements. + For some reason, XSD validation does not catch this, so we do it manually. + """ + errors = [] + + for xml_file in self.xml_files: + # Only check document.xml files + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + # Find all w:t elements that are descendants of w:del elements + namespaces = {"w": self.WORD_2006_NAMESPACE} + xpath_expression = ".//w:del//w:t" + problematic_t_elements = root.xpath( + xpath_expression, namespaces=namespaces + ) + for t_elem in problematic_t_elements: + if t_elem.text: + # Show a preview of the text + text_preview = ( + repr(t_elem.text)[:50] + "..." + if len(repr(t_elem.text)) > 50 + else repr(t_elem.text) + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {t_elem.sourceline}: found within : {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} deletion validation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - No w:t elements found within w:del elements") + return True + + def count_paragraphs_in_unpacked(self): + """Count the number of paragraphs in the unpacked document.""" + count = 0 + + for xml_file in self.xml_files: + # Only check document.xml files + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + # Count all w:p elements + paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p") + count = len(paragraphs) + except Exception as e: + print(f"Error counting paragraphs in unpacked document: {e}") + + return count + + def count_paragraphs_in_original(self): + """Count the number of paragraphs in the original docx file.""" + count = 0 + + try: + # Create temporary directory to unpack original + with tempfile.TemporaryDirectory() as temp_dir: + # Unpack original docx + with zipfile.ZipFile(self.original_file, "r") as zip_ref: + zip_ref.extractall(temp_dir) + + # Parse document.xml + doc_xml_path = temp_dir + "/word/document.xml" + root = lxml.etree.parse(doc_xml_path).getroot() + + # Count all w:p elements + paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p") + count = len(paragraphs) + + except Exception as e: + print(f"Error counting paragraphs in original document: {e}") + + return count + + def validate_insertions(self): + """ + Validate that w:delText elements are not within w:ins elements. + w:delText is only allowed in w:ins if nested within a w:del. + """ + errors = [] + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + namespaces = {"w": self.WORD_2006_NAMESPACE} + + # Find w:delText in w:ins that are NOT within w:del + invalid_elements = root.xpath( + ".//w:ins//w:delText[not(ancestor::w:del)]", + namespaces=namespaces + ) + + for elem in invalid_elements: + text_preview = ( + repr(elem.text or "")[:50] + "..." + if len(repr(elem.text or "")) > 50 + else repr(elem.text or "") + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: within : {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} insertion validation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - No w:delText elements within w:ins elements") + return True + + def compare_paragraph_counts(self): + """Compare paragraph counts between original and new document.""" + original_count = self.count_paragraphs_in_original() + new_count = self.count_paragraphs_in_unpacked() + + diff = new_count - original_count + diff_str = f"+{diff}" if diff > 0 else str(diff) + print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})") + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/skills/docx/ooxml/scripts/validation/pptx.py b/skills/docx/ooxml/scripts/validation/pptx.py new file mode 100644 index 0000000..66d5b1e --- /dev/null +++ b/skills/docx/ooxml/scripts/validation/pptx.py @@ -0,0 +1,315 @@ +""" +Validator for PowerPoint presentation XML files against XSD schemas. +""" + +import re + +from .base import BaseSchemaValidator + + +class PPTXSchemaValidator(BaseSchemaValidator): + """Validator for PowerPoint presentation XML files against XSD schemas.""" + + # PowerPoint presentation namespace + PRESENTATIONML_NAMESPACE = ( + "http://schemas.openxmlformats.org/presentationml/2006/main" + ) + + # PowerPoint-specific element to relationship type mappings + ELEMENT_RELATIONSHIP_TYPES = { + "sldid": "slide", + "sldmasterid": "slidemaster", + "notesmasterid": "notesmaster", + "sldlayoutid": "slidelayout", + "themeid": "theme", + "tablestyleid": "tablestyles", + } + + def validate(self): + """Run all validation checks and return True if all pass.""" + # Test 0: XML well-formedness + if not self.validate_xml(): + return False + + # Test 1: Namespace declarations + all_valid = True + if not self.validate_namespaces(): + all_valid = False + + # Test 2: Unique IDs + if not self.validate_unique_ids(): + all_valid = False + + # Test 3: UUID ID validation + if not self.validate_uuid_ids(): + all_valid = False + + # Test 4: Relationship and file reference validation + if not self.validate_file_references(): + all_valid = False + + # Test 5: Slide layout ID validation + if not self.validate_slide_layout_ids(): + all_valid = False + + # Test 6: Content type declarations + if not self.validate_content_types(): + all_valid = False + + # Test 7: XSD schema validation + if not self.validate_against_xsd(): + all_valid = False + + # Test 8: Notes slide reference validation + if not self.validate_notes_slide_references(): + all_valid = False + + # Test 9: Relationship ID reference validation + if not self.validate_all_relationship_ids(): + all_valid = False + + # Test 10: Duplicate slide layout references validation + if not self.validate_no_duplicate_slide_layouts(): + all_valid = False + + return all_valid + + def validate_uuid_ids(self): + """Validate that ID attributes that look like UUIDs contain only hex values.""" + import lxml.etree + + errors = [] + # UUID pattern: 8-4-4-4-12 hex digits with optional braces/hyphens + uuid_pattern = re.compile( + r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$" + ) + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + # Check all elements for ID attributes + for elem in root.iter(): + for attr, value in elem.attrib.items(): + # Check if this is an ID attribute + attr_name = attr.split("}")[-1].lower() + if attr_name == "id" or attr_name.endswith("id"): + # Check if value looks like a UUID (has the right length and pattern structure) + if self._looks_like_uuid(value): + # Validate that it contains only hex characters in the right positions + if not uuid_pattern.match(value): + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: ID '{value}' appears to be a UUID but contains invalid hex characters" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} UUID ID validation errors:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All UUID-like IDs contain valid hex values") + return True + + def _looks_like_uuid(self, value): + """Check if a value has the general structure of a UUID.""" + # Remove common UUID delimiters + clean_value = value.strip("{}()").replace("-", "") + # Check if it's 32 hex-like characters (could include invalid hex chars) + return len(clean_value) == 32 and all(c.isalnum() for c in clean_value) + + def validate_slide_layout_ids(self): + """Validate that sldLayoutId elements in slide masters reference valid slide layouts.""" + import lxml.etree + + errors = [] + + # Find all slide master files + slide_masters = list(self.unpacked_dir.glob("ppt/slideMasters/*.xml")) + + if not slide_masters: + if self.verbose: + print("PASSED - No slide masters found") + return True + + for slide_master in slide_masters: + try: + # Parse the slide master file + root = lxml.etree.parse(str(slide_master)).getroot() + + # Find the corresponding _rels file for this slide master + rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels" + + if not rels_file.exists(): + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: " + f"Missing relationships file: {rels_file.relative_to(self.unpacked_dir)}" + ) + continue + + # Parse the relationships file + rels_root = lxml.etree.parse(str(rels_file)).getroot() + + # Build a set of valid relationship IDs that point to slide layouts + valid_layout_rids = set() + for rel in rels_root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rel_type = rel.get("Type", "") + if "slideLayout" in rel_type: + valid_layout_rids.add(rel.get("Id")) + + # Find all sldLayoutId elements in the slide master + for sld_layout_id in root.findall( + f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId" + ): + r_id = sld_layout_id.get( + f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id" + ) + layout_id = sld_layout_id.get("id") + + if r_id and r_id not in valid_layout_rids: + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: " + f"Line {sld_layout_id.sourceline}: sldLayoutId with id='{layout_id}' " + f"references r:id='{r_id}' which is not found in slide layout relationships" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} slide layout ID validation errors:") + for error in errors: + print(error) + print( + "Remove invalid references or add missing slide layouts to the relationships file." + ) + return False + else: + if self.verbose: + print("PASSED - All slide layout IDs reference valid slide layouts") + return True + + def validate_no_duplicate_slide_layouts(self): + """Validate that each slide has exactly one slideLayout reference.""" + import lxml.etree + + errors = [] + slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels")) + + for rels_file in slide_rels_files: + try: + root = lxml.etree.parse(str(rels_file)).getroot() + + # Find all slideLayout relationships + layout_rels = [ + rel + for rel in root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ) + if "slideLayout" in rel.get("Type", "") + ] + + if len(layout_rels) > 1: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: has {len(layout_rels)} slideLayout references" + ) + + except Exception as e: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print("FAILED - Found slides with duplicate slideLayout references:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All slides have exactly one slideLayout reference") + return True + + def validate_notes_slide_references(self): + """Validate that each notesSlide file is referenced by only one slide.""" + import lxml.etree + + errors = [] + notes_slide_references = {} # Track which slides reference each notesSlide + + # Find all slide relationship files + slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels")) + + if not slide_rels_files: + if self.verbose: + print("PASSED - No slide relationship files found") + return True + + for rels_file in slide_rels_files: + try: + # Parse the relationships file + root = lxml.etree.parse(str(rels_file)).getroot() + + # Find all notesSlide relationships + for rel in root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rel_type = rel.get("Type", "") + if "notesSlide" in rel_type: + target = rel.get("Target", "") + if target: + # Normalize the target path to handle relative paths + normalized_target = target.replace("../", "") + + # Track which slide references this notesSlide + slide_name = rels_file.stem.replace( + ".xml", "" + ) # e.g., "slide1" + + if normalized_target not in notes_slide_references: + notes_slide_references[normalized_target] = [] + notes_slide_references[normalized_target].append( + (slide_name, rels_file) + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + # Check for duplicate references + for target, references in notes_slide_references.items(): + if len(references) > 1: + slide_names = [ref[0] for ref in references] + errors.append( + f" Notes slide '{target}' is referenced by multiple slides: {', '.join(slide_names)}" + ) + for slide_name, rels_file in references: + errors.append(f" - {rels_file.relative_to(self.unpacked_dir)}") + + if errors: + print( + f"FAILED - Found {len([e for e in errors if not e.startswith(' ')])} notes slide reference validation errors:" + ) + for error in errors: + print(error) + print("Each slide may optionally have its own slide file.") + return False + else: + if self.verbose: + print("PASSED - All notes slide references are unique") + return True + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/skills/docx/ooxml/scripts/validation/redlining.py b/skills/docx/ooxml/scripts/validation/redlining.py new file mode 100644 index 0000000..7ed425e --- /dev/null +++ b/skills/docx/ooxml/scripts/validation/redlining.py @@ -0,0 +1,279 @@ +""" +Validator for tracked changes in Word documents. +""" + +import subprocess +import tempfile +import zipfile +from pathlib import Path + + +class RedliningValidator: + """Validator for tracked changes in Word documents.""" + + def __init__(self, unpacked_dir, original_docx, verbose=False): + self.unpacked_dir = Path(unpacked_dir) + self.original_docx = Path(original_docx) + self.verbose = verbose + self.namespaces = { + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + } + + def validate(self): + """Main validation method that returns True if valid, False otherwise.""" + # Verify unpacked directory exists and has correct structure + modified_file = self.unpacked_dir / "word" / "document.xml" + if not modified_file.exists(): + print(f"FAILED - Modified document.xml not found at {modified_file}") + return False + + # First, check if there are any tracked changes by Claude to validate + try: + import xml.etree.ElementTree as ET + + tree = ET.parse(modified_file) + root = tree.getroot() + + # Check for w:del or w:ins tags authored by Claude + del_elements = root.findall(".//w:del", self.namespaces) + ins_elements = root.findall(".//w:ins", self.namespaces) + + # Filter to only include changes by Claude + claude_del_elements = [ + elem + for elem in del_elements + if elem.get(f"{{{self.namespaces['w']}}}author") == "Claude" + ] + claude_ins_elements = [ + elem + for elem in ins_elements + if elem.get(f"{{{self.namespaces['w']}}}author") == "Claude" + ] + + # Redlining validation is only needed if tracked changes by Claude have been used. + if not claude_del_elements and not claude_ins_elements: + if self.verbose: + print("PASSED - No tracked changes by Claude found.") + return True + + except Exception: + # If we can't parse the XML, continue with full validation + pass + + # Create temporary directory for unpacking original docx + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Unpack original docx + try: + with zipfile.ZipFile(self.original_docx, "r") as zip_ref: + zip_ref.extractall(temp_path) + except Exception as e: + print(f"FAILED - Error unpacking original docx: {e}") + return False + + original_file = temp_path / "word" / "document.xml" + if not original_file.exists(): + print( + f"FAILED - Original document.xml not found in {self.original_docx}" + ) + return False + + # Parse both XML files using xml.etree.ElementTree for redlining validation + try: + import xml.etree.ElementTree as ET + + modified_tree = ET.parse(modified_file) + modified_root = modified_tree.getroot() + original_tree = ET.parse(original_file) + original_root = original_tree.getroot() + except ET.ParseError as e: + print(f"FAILED - Error parsing XML files: {e}") + return False + + # Remove Claude's tracked changes from both documents + self._remove_claude_tracked_changes(original_root) + self._remove_claude_tracked_changes(modified_root) + + # Extract and compare text content + modified_text = self._extract_text_content(modified_root) + original_text = self._extract_text_content(original_root) + + if modified_text != original_text: + # Show detailed character-level differences for each paragraph + error_message = self._generate_detailed_diff( + original_text, modified_text + ) + print(error_message) + return False + + if self.verbose: + print("PASSED - All changes by Claude are properly tracked") + return True + + def _generate_detailed_diff(self, original_text, modified_text): + """Generate detailed word-level differences using git word diff.""" + error_parts = [ + "FAILED - Document text doesn't match after removing Claude's tracked changes", + "", + "Likely causes:", + " 1. Modified text inside another author's or tags", + " 2. Made edits without proper tracked changes", + " 3. Didn't nest inside when deleting another's insertion", + "", + "For pre-redlined documents, use correct patterns:", + " - To reject another's INSERTION: Nest inside their ", + " - To restore another's DELETION: Add new AFTER their ", + "", + ] + + # Show git word diff + git_diff = self._get_git_word_diff(original_text, modified_text) + if git_diff: + error_parts.extend(["Differences:", "============", git_diff]) + else: + error_parts.append("Unable to generate word diff (git not available)") + + return "\n".join(error_parts) + + def _get_git_word_diff(self, original_text, modified_text): + """Generate word diff using git with character-level precision.""" + try: + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Create two files + original_file = temp_path / "original.txt" + modified_file = temp_path / "modified.txt" + + original_file.write_text(original_text, encoding="utf-8") + modified_file.write_text(modified_text, encoding="utf-8") + + # Try character-level diff first for precise differences + result = subprocess.run( + [ + "git", + "diff", + "--word-diff=plain", + "--word-diff-regex=.", # Character-by-character diff + "-U0", # Zero lines of context - show only changed lines + "--no-index", + str(original_file), + str(modified_file), + ], + capture_output=True, + text=True, + ) + + if result.stdout.strip(): + # Clean up the output - remove git diff header lines + lines = result.stdout.split("\n") + # Skip the header lines (diff --git, index, +++, ---, @@) + content_lines = [] + in_content = False + for line in lines: + if line.startswith("@@"): + in_content = True + continue + if in_content and line.strip(): + content_lines.append(line) + + if content_lines: + return "\n".join(content_lines) + + # Fallback to word-level diff if character-level is too verbose + result = subprocess.run( + [ + "git", + "diff", + "--word-diff=plain", + "-U0", # Zero lines of context + "--no-index", + str(original_file), + str(modified_file), + ], + capture_output=True, + text=True, + ) + + if result.stdout.strip(): + lines = result.stdout.split("\n") + content_lines = [] + in_content = False + for line in lines: + if line.startswith("@@"): + in_content = True + continue + if in_content and line.strip(): + content_lines.append(line) + return "\n".join(content_lines) + + except (subprocess.CalledProcessError, FileNotFoundError, Exception): + # Git not available or other error, return None to use fallback + pass + + return None + + def _remove_claude_tracked_changes(self, root): + """Remove tracked changes authored by Claude from the XML root.""" + ins_tag = f"{{{self.namespaces['w']}}}ins" + del_tag = f"{{{self.namespaces['w']}}}del" + author_attr = f"{{{self.namespaces['w']}}}author" + + # Remove w:ins elements + for parent in root.iter(): + to_remove = [] + for child in parent: + if child.tag == ins_tag and child.get(author_attr) == "Claude": + to_remove.append(child) + for elem in to_remove: + parent.remove(elem) + + # Unwrap content in w:del elements where author is "Claude" + deltext_tag = f"{{{self.namespaces['w']}}}delText" + t_tag = f"{{{self.namespaces['w']}}}t" + + for parent in root.iter(): + to_process = [] + for child in parent: + if child.tag == del_tag and child.get(author_attr) == "Claude": + to_process.append((child, list(parent).index(child))) + + # Process in reverse order to maintain indices + for del_elem, del_index in reversed(to_process): + # Convert w:delText to w:t before moving + for elem in del_elem.iter(): + if elem.tag == deltext_tag: + elem.tag = t_tag + + # Move all children of w:del to its parent before removing w:del + for child in reversed(list(del_elem)): + parent.insert(del_index, child) + parent.remove(del_elem) + + def _extract_text_content(self, root): + """Extract text content from Word XML, preserving paragraph structure. + + Empty paragraphs are skipped to avoid false positives when tracked + insertions add only structural elements without text content. + """ + p_tag = f"{{{self.namespaces['w']}}}p" + t_tag = f"{{{self.namespaces['w']}}}t" + + paragraphs = [] + for p_elem in root.findall(f".//{p_tag}"): + # Get all text elements within this paragraph + text_parts = [] + for t_elem in p_elem.findall(f".//{t_tag}"): + if t_elem.text: + text_parts.append(t_elem.text) + paragraph_text = "".join(text_parts) + # Skip empty paragraphs - they don't affect content validation + if paragraph_text: + paragraphs.append(paragraph_text) + + return "\n".join(paragraphs) + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/skills/docx/scripts/__init__.py b/skills/docx/scripts/__init__.py new file mode 100644 index 0000000..bf9c562 --- /dev/null +++ b/skills/docx/scripts/__init__.py @@ -0,0 +1 @@ +# Make scripts directory a package for relative imports in tests diff --git a/skills/docx/scripts/document.py b/skills/docx/scripts/document.py new file mode 100644 index 0000000..40bb3fb --- /dev/null +++ b/skills/docx/scripts/document.py @@ -0,0 +1,1292 @@ +#!/usr/bin/env python3 +""" +Library for working with Word documents: comments, tracked changes, and editing. + +Usage: + from skills.docx.scripts.document import Document + + # Initialize + doc = Document('workspace/unpacked') + doc = Document('workspace/unpacked', author="John Doe", initials="JD") + + # Find nodes + node = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "1"}) + node = doc["word/document.xml"].get_node(tag="w:p", line_number=10) + + # Add comments + doc.add_comment(start=node, end=node, text="Comment text") + doc.reply_to_comment(parent_comment_id=0, text="Reply text") + + # Suggest tracked changes + doc["word/document.xml"].suggest_deletion(node) # Delete content + doc["word/document.xml"].revert_insertion(ins_node) # Reject insertion + doc["word/document.xml"].revert_deletion(del_node) # Reject deletion + + # Save + doc.save() +""" + +import html +import random +import shutil +import tempfile +from datetime import datetime, timezone +from pathlib import Path + +from defusedxml import minidom +from ooxml.scripts.pack import pack_document +from ooxml.scripts.validation.docx import DOCXSchemaValidator +from ooxml.scripts.validation.redlining import RedliningValidator + +from .utilities import XMLEditor + +# Path to template files +TEMPLATE_DIR = Path(__file__).parent / "templates" + + +class DocxXMLEditor(XMLEditor): + """XMLEditor that automatically applies RSID, author, and date to new elements. + + Automatically adds attributes to elements that support them when inserting new content: + - w:rsidR, w:rsidRDefault, w:rsidP (for w:p and w:r elements) + - w:author and w:date (for w:ins, w:del, w:comment elements) + - w:id (for w:ins and w:del elements) + + Attributes: + dom (defusedxml.minidom.Document): The DOM document for direct manipulation + """ + + def __init__( + self, xml_path, rsid: str, author: str = "Claude", initials: str = "C" + ): + """Initialize with required RSID and optional author. + + Args: + xml_path: Path to XML file to edit + rsid: RSID to automatically apply to new elements + author: Author name for tracked changes and comments (default: "Claude") + initials: Author initials (default: "C") + """ + super().__init__(xml_path) + self.rsid = rsid + self.author = author + self.initials = initials + + def _get_next_change_id(self): + """Get the next available change ID by checking all tracked change elements.""" + max_id = -1 + for tag in ("w:ins", "w:del"): + elements = self.dom.getElementsByTagName(tag) + for elem in elements: + change_id = elem.getAttribute("w:id") + if change_id: + try: + max_id = max(max_id, int(change_id)) + except ValueError: + pass + return max_id + 1 + + def _ensure_w16du_namespace(self): + """Ensure w16du namespace is declared on the root element.""" + root = self.dom.documentElement + if not root.hasAttribute("xmlns:w16du"): # type: ignore + root.setAttribute( # type: ignore + "xmlns:w16du", + "http://schemas.microsoft.com/office/word/2023/wordml/word16du", + ) + + def _ensure_w16cex_namespace(self): + """Ensure w16cex namespace is declared on the root element.""" + root = self.dom.documentElement + if not root.hasAttribute("xmlns:w16cex"): # type: ignore + root.setAttribute( # type: ignore + "xmlns:w16cex", + "http://schemas.microsoft.com/office/word/2018/wordml/cex", + ) + + def _ensure_w14_namespace(self): + """Ensure w14 namespace is declared on the root element.""" + root = self.dom.documentElement + if not root.hasAttribute("xmlns:w14"): # type: ignore + root.setAttribute( # type: ignore + "xmlns:w14", + "http://schemas.microsoft.com/office/word/2010/wordml", + ) + + def _inject_attributes_to_nodes(self, nodes): + """Inject RSID, author, and date attributes into DOM nodes where applicable. + + Adds attributes to elements that support them: + - w:r: gets w:rsidR (or w:rsidDel if inside w:del) + - w:p: gets w:rsidR, w:rsidRDefault, w:rsidP, w14:paraId, w14:textId + - w:t: gets xml:space="preserve" if text has leading/trailing whitespace + - w:ins, w:del: get w:id, w:author, w:date, w16du:dateUtc + - w:comment: gets w:author, w:date, w:initials + - w16cex:commentExtensible: gets w16cex:dateUtc + + Args: + nodes: List of DOM nodes to process + """ + from datetime import datetime, timezone + + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + def is_inside_deletion(elem): + """Check if element is inside a w:del element.""" + parent = elem.parentNode + while parent: + if parent.nodeType == parent.ELEMENT_NODE and parent.tagName == "w:del": + return True + parent = parent.parentNode + return False + + def add_rsid_to_p(elem): + if not elem.hasAttribute("w:rsidR"): + elem.setAttribute("w:rsidR", self.rsid) + if not elem.hasAttribute("w:rsidRDefault"): + elem.setAttribute("w:rsidRDefault", self.rsid) + if not elem.hasAttribute("w:rsidP"): + elem.setAttribute("w:rsidP", self.rsid) + # Add w14:paraId and w14:textId if not present + if not elem.hasAttribute("w14:paraId"): + self._ensure_w14_namespace() + elem.setAttribute("w14:paraId", _generate_hex_id()) + if not elem.hasAttribute("w14:textId"): + self._ensure_w14_namespace() + elem.setAttribute("w14:textId", _generate_hex_id()) + + def add_rsid_to_r(elem): + # Use w:rsidDel for inside , otherwise w:rsidR + if is_inside_deletion(elem): + if not elem.hasAttribute("w:rsidDel"): + elem.setAttribute("w:rsidDel", self.rsid) + else: + if not elem.hasAttribute("w:rsidR"): + elem.setAttribute("w:rsidR", self.rsid) + + def add_tracked_change_attrs(elem): + # Auto-assign w:id if not present + if not elem.hasAttribute("w:id"): + elem.setAttribute("w:id", str(self._get_next_change_id())) + if not elem.hasAttribute("w:author"): + elem.setAttribute("w:author", self.author) + if not elem.hasAttribute("w:date"): + elem.setAttribute("w:date", timestamp) + # Add w16du:dateUtc for tracked changes (same as w:date since we generate UTC timestamps) + if elem.tagName in ("w:ins", "w:del") and not elem.hasAttribute( + "w16du:dateUtc" + ): + self._ensure_w16du_namespace() + elem.setAttribute("w16du:dateUtc", timestamp) + + def add_comment_attrs(elem): + if not elem.hasAttribute("w:author"): + elem.setAttribute("w:author", self.author) + if not elem.hasAttribute("w:date"): + elem.setAttribute("w:date", timestamp) + if not elem.hasAttribute("w:initials"): + elem.setAttribute("w:initials", self.initials) + + def add_comment_extensible_date(elem): + # Add w16cex:dateUtc for comment extensible elements + if not elem.hasAttribute("w16cex:dateUtc"): + self._ensure_w16cex_namespace() + elem.setAttribute("w16cex:dateUtc", timestamp) + + def add_xml_space_to_t(elem): + # Add xml:space="preserve" to w:t if text has leading/trailing whitespace + if ( + elem.firstChild + and elem.firstChild.nodeType == elem.firstChild.TEXT_NODE + ): + text = elem.firstChild.data + if text and (text[0].isspace() or text[-1].isspace()): + if not elem.hasAttribute("xml:space"): + elem.setAttribute("xml:space", "preserve") + + for node in nodes: + if node.nodeType != node.ELEMENT_NODE: + continue + + # Handle the node itself + if node.tagName == "w:p": + add_rsid_to_p(node) + elif node.tagName == "w:r": + add_rsid_to_r(node) + elif node.tagName == "w:t": + add_xml_space_to_t(node) + elif node.tagName in ("w:ins", "w:del"): + add_tracked_change_attrs(node) + elif node.tagName == "w:comment": + add_comment_attrs(node) + elif node.tagName == "w16cex:commentExtensible": + add_comment_extensible_date(node) + + # Process descendants (getElementsByTagName doesn't return the element itself) + for elem in node.getElementsByTagName("w:p"): + add_rsid_to_p(elem) + for elem in node.getElementsByTagName("w:r"): + add_rsid_to_r(elem) + for elem in node.getElementsByTagName("w:t"): + add_xml_space_to_t(elem) + for tag in ("w:ins", "w:del"): + for elem in node.getElementsByTagName(tag): + add_tracked_change_attrs(elem) + for elem in node.getElementsByTagName("w:comment"): + add_comment_attrs(elem) + for elem in node.getElementsByTagName("w16cex:commentExtensible"): + add_comment_extensible_date(elem) + + def replace_node(self, elem, new_content): + """Replace node with automatic attribute injection.""" + nodes = super().replace_node(elem, new_content) + self._inject_attributes_to_nodes(nodes) + return nodes + + def insert_after(self, elem, xml_content): + """Insert after with automatic attribute injection.""" + nodes = super().insert_after(elem, xml_content) + self._inject_attributes_to_nodes(nodes) + return nodes + + def insert_before(self, elem, xml_content): + """Insert before with automatic attribute injection.""" + nodes = super().insert_before(elem, xml_content) + self._inject_attributes_to_nodes(nodes) + return nodes + + def append_to(self, elem, xml_content): + """Append to with automatic attribute injection.""" + nodes = super().append_to(elem, xml_content) + self._inject_attributes_to_nodes(nodes) + return nodes + + def revert_insertion(self, elem): + """Reject an insertion by wrapping its content in a deletion. + + Wraps all runs inside w:ins in w:del, converting w:t to w:delText. + Can process a single w:ins element or a container element with multiple w:ins. + + Args: + elem: Element to process (w:ins, w:p, w:body, etc.) + + Returns: + list: List containing the processed element(s) + + Raises: + ValueError: If the element contains no w:ins elements + + Example: + # Reject a single insertion + ins = doc["word/document.xml"].get_node(tag="w:ins", attrs={"w:id": "5"}) + doc["word/document.xml"].revert_insertion(ins) + + # Reject all insertions in a paragraph + para = doc["word/document.xml"].get_node(tag="w:p", line_number=42) + doc["word/document.xml"].revert_insertion(para) + """ + # Collect insertions + ins_elements = [] + if elem.tagName == "w:ins": + ins_elements.append(elem) + else: + ins_elements.extend(elem.getElementsByTagName("w:ins")) + + # Validate that there are insertions to reject + if not ins_elements: + raise ValueError( + f"revert_insertion requires w:ins elements. " + f"The provided element <{elem.tagName}> contains no insertions. " + ) + + # Process all insertions - wrap all children in w:del + for ins_elem in ins_elements: + runs = list(ins_elem.getElementsByTagName("w:r")) + if not runs: + continue + + # Create deletion wrapper + del_wrapper = self.dom.createElement("w:del") + + # Process each run + for run in runs: + # Convert w:t → w:delText and w:rsidR → w:rsidDel + if run.hasAttribute("w:rsidR"): + run.setAttribute("w:rsidDel", run.getAttribute("w:rsidR")) + run.removeAttribute("w:rsidR") + elif not run.hasAttribute("w:rsidDel"): + run.setAttribute("w:rsidDel", self.rsid) + + for t_elem in list(run.getElementsByTagName("w:t")): + del_text = self.dom.createElement("w:delText") + # Copy ALL child nodes (not just firstChild) to handle entities + while t_elem.firstChild: + del_text.appendChild(t_elem.firstChild) + for i in range(t_elem.attributes.length): + attr = t_elem.attributes.item(i) + del_text.setAttribute(attr.name, attr.value) + t_elem.parentNode.replaceChild(del_text, t_elem) + + # Move all children from ins to del wrapper + while ins_elem.firstChild: + del_wrapper.appendChild(ins_elem.firstChild) + + # Add del wrapper back to ins + ins_elem.appendChild(del_wrapper) + + # Inject attributes to the deletion wrapper + self._inject_attributes_to_nodes([del_wrapper]) + + return [elem] + + def revert_deletion(self, elem): + """Reject a deletion by re-inserting the deleted content. + + Creates w:ins elements after each w:del, copying deleted content and + converting w:delText back to w:t. + Can process a single w:del element or a container element with multiple w:del. + + Args: + elem: Element to process (w:del, w:p, w:body, etc.) + + Returns: + list: If elem is w:del, returns [elem, new_ins]. Otherwise returns [elem]. + + Raises: + ValueError: If the element contains no w:del elements + + Example: + # Reject a single deletion - returns [w:del, w:ins] + del_elem = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "3"}) + nodes = doc["word/document.xml"].revert_deletion(del_elem) + + # Reject all deletions in a paragraph - returns [para] + para = doc["word/document.xml"].get_node(tag="w:p", line_number=42) + nodes = doc["word/document.xml"].revert_deletion(para) + """ + # Collect deletions FIRST - before we modify the DOM + del_elements = [] + is_single_del = elem.tagName == "w:del" + + if is_single_del: + del_elements.append(elem) + else: + del_elements.extend(elem.getElementsByTagName("w:del")) + + # Validate that there are deletions to reject + if not del_elements: + raise ValueError( + f"revert_deletion requires w:del elements. " + f"The provided element <{elem.tagName}> contains no deletions. " + ) + + # Track created insertion (only relevant if elem is a single w:del) + created_insertion = None + + # Process all deletions - create insertions that copy the deleted content + for del_elem in del_elements: + # Clone the deleted runs and convert them to insertions + runs = list(del_elem.getElementsByTagName("w:r")) + if not runs: + continue + + # Create insertion wrapper + ins_elem = self.dom.createElement("w:ins") + + for run in runs: + # Clone the run + new_run = run.cloneNode(True) + + # Convert w:delText → w:t + for del_text in list(new_run.getElementsByTagName("w:delText")): + t_elem = self.dom.createElement("w:t") + # Copy ALL child nodes (not just firstChild) to handle entities + while del_text.firstChild: + t_elem.appendChild(del_text.firstChild) + for i in range(del_text.attributes.length): + attr = del_text.attributes.item(i) + t_elem.setAttribute(attr.name, attr.value) + del_text.parentNode.replaceChild(t_elem, del_text) + + # Update run attributes: w:rsidDel → w:rsidR + if new_run.hasAttribute("w:rsidDel"): + new_run.setAttribute("w:rsidR", new_run.getAttribute("w:rsidDel")) + new_run.removeAttribute("w:rsidDel") + elif not new_run.hasAttribute("w:rsidR"): + new_run.setAttribute("w:rsidR", self.rsid) + + ins_elem.appendChild(new_run) + + # Insert the new insertion after the deletion + nodes = self.insert_after(del_elem, ins_elem.toxml()) + + # If processing a single w:del, track the created insertion + if is_single_del and nodes: + created_insertion = nodes[0] + + # Return based on input type + if is_single_del and created_insertion: + return [elem, created_insertion] + else: + return [elem] + + @staticmethod + def suggest_paragraph(xml_content: str) -> str: + """Transform paragraph XML to add tracked change wrapping for insertion. + + Wraps runs in and adds to w:rPr in w:pPr for numbered lists. + + Args: + xml_content: XML string containing a element + + Returns: + str: Transformed XML with tracked change wrapping + """ + wrapper = f'{xml_content}' + doc = minidom.parseString(wrapper) + para = doc.getElementsByTagName("w:p")[0] + + # Ensure w:pPr exists + pPr_list = para.getElementsByTagName("w:pPr") + if not pPr_list: + pPr = doc.createElement("w:pPr") + para.insertBefore( + pPr, para.firstChild + ) if para.firstChild else para.appendChild(pPr) + else: + pPr = pPr_list[0] + + # Ensure w:rPr exists in w:pPr + rPr_list = pPr.getElementsByTagName("w:rPr") + if not rPr_list: + rPr = doc.createElement("w:rPr") + pPr.appendChild(rPr) + else: + rPr = rPr_list[0] + + # Add to w:rPr + ins_marker = doc.createElement("w:ins") + rPr.insertBefore( + ins_marker, rPr.firstChild + ) if rPr.firstChild else rPr.appendChild(ins_marker) + + # Wrap all non-pPr children in + ins_wrapper = doc.createElement("w:ins") + for child in [c for c in para.childNodes if c.nodeName != "w:pPr"]: + para.removeChild(child) + ins_wrapper.appendChild(child) + para.appendChild(ins_wrapper) + + return para.toxml() + + def suggest_deletion(self, elem): + """Mark a w:r or w:p element as deleted with tracked changes (in-place DOM manipulation). + + For w:r: wraps in , converts to , preserves w:rPr + For w:p (regular): wraps content in , converts to + For w:p (numbered list): adds to w:rPr in w:pPr, wraps content in + + Args: + elem: A w:r or w:p DOM element without existing tracked changes + + Returns: + Element: The modified element + + Raises: + ValueError: If element has existing tracked changes or invalid structure + """ + if elem.nodeName == "w:r": + # Check for existing w:delText + if elem.getElementsByTagName("w:delText"): + raise ValueError("w:r element already contains w:delText") + + # Convert w:t → w:delText + for t_elem in list(elem.getElementsByTagName("w:t")): + del_text = self.dom.createElement("w:delText") + # Copy ALL child nodes (not just firstChild) to handle entities + while t_elem.firstChild: + del_text.appendChild(t_elem.firstChild) + # Preserve attributes like xml:space + for i in range(t_elem.attributes.length): + attr = t_elem.attributes.item(i) + del_text.setAttribute(attr.name, attr.value) + t_elem.parentNode.replaceChild(del_text, t_elem) + + # Update run attributes: w:rsidR → w:rsidDel + if elem.hasAttribute("w:rsidR"): + elem.setAttribute("w:rsidDel", elem.getAttribute("w:rsidR")) + elem.removeAttribute("w:rsidR") + elif not elem.hasAttribute("w:rsidDel"): + elem.setAttribute("w:rsidDel", self.rsid) + + # Wrap in w:del + del_wrapper = self.dom.createElement("w:del") + parent = elem.parentNode + parent.insertBefore(del_wrapper, elem) + parent.removeChild(elem) + del_wrapper.appendChild(elem) + + # Inject attributes to the deletion wrapper + self._inject_attributes_to_nodes([del_wrapper]) + + return del_wrapper + + elif elem.nodeName == "w:p": + # Check for existing tracked changes + if elem.getElementsByTagName("w:ins") or elem.getElementsByTagName("w:del"): + raise ValueError("w:p element already contains tracked changes") + + # Check if it's a numbered list item + pPr_list = elem.getElementsByTagName("w:pPr") + is_numbered = pPr_list and pPr_list[0].getElementsByTagName("w:numPr") + + if is_numbered: + # Add to w:rPr in w:pPr + pPr = pPr_list[0] + rPr_list = pPr.getElementsByTagName("w:rPr") + + if not rPr_list: + rPr = self.dom.createElement("w:rPr") + pPr.appendChild(rPr) + else: + rPr = rPr_list[0] + + # Add marker + del_marker = self.dom.createElement("w:del") + rPr.insertBefore( + del_marker, rPr.firstChild + ) if rPr.firstChild else rPr.appendChild(del_marker) + + # Inject attributes into the marker + self._inject_attributes_to_nodes([del_marker]) + + # Convert w:t → w:delText in all runs + for t_elem in list(elem.getElementsByTagName("w:t")): + del_text = self.dom.createElement("w:delText") + # Copy ALL child nodes (not just firstChild) to handle entities + while t_elem.firstChild: + del_text.appendChild(t_elem.firstChild) + # Preserve attributes like xml:space + for i in range(t_elem.attributes.length): + attr = t_elem.attributes.item(i) + del_text.setAttribute(attr.name, attr.value) + t_elem.parentNode.replaceChild(del_text, t_elem) + + # Update run attributes: w:rsidR → w:rsidDel + for run in elem.getElementsByTagName("w:r"): + if run.hasAttribute("w:rsidR"): + run.setAttribute("w:rsidDel", run.getAttribute("w:rsidR")) + run.removeAttribute("w:rsidR") + elif not run.hasAttribute("w:rsidDel"): + run.setAttribute("w:rsidDel", self.rsid) + + # Wrap all non-pPr children in + del_wrapper = self.dom.createElement("w:del") + for child in [c for c in elem.childNodes if c.nodeName != "w:pPr"]: + elem.removeChild(child) + del_wrapper.appendChild(child) + elem.appendChild(del_wrapper) + + # Inject attributes to the deletion wrapper + self._inject_attributes_to_nodes([del_wrapper]) + + return elem + + else: + raise ValueError(f"Element must be w:r or w:p, got {elem.nodeName}") + + +def _generate_hex_id() -> str: + """Generate random 8-character hex ID for para/durable IDs. + + Values are constrained to be less than 0x7FFFFFFF per OOXML spec: + - paraId must be < 0x80000000 + - durableId must be < 0x7FFFFFFF + We use the stricter constraint (0x7FFFFFFF) for both. + """ + return f"{random.randint(1, 0x7FFFFFFE):08X}" + + +def _generate_rsid() -> str: + """Generate random 8-character hex RSID.""" + return "".join(random.choices("0123456789ABCDEF", k=8)) + + +class Document: + """Manages comments in unpacked Word documents.""" + + def __init__( + self, + unpacked_dir, + rsid=None, + track_revisions=False, + author="Claude", + initials="C", + ): + """ + Initialize with path to unpacked Word document directory. + Automatically sets up comment infrastructure (people.xml, RSIDs). + + Args: + unpacked_dir: Path to unpacked DOCX directory (must contain word/ subdirectory) + rsid: Optional RSID to use for all comment elements. If not provided, one will be generated. + track_revisions: If True, enables track revisions in settings.xml (default: False) + author: Default author name for comments (default: "Claude") + initials: Default author initials for comments (default: "C") + """ + self.original_path = Path(unpacked_dir) + + if not self.original_path.exists() or not self.original_path.is_dir(): + raise ValueError(f"Directory not found: {unpacked_dir}") + + # Create temporary directory with subdirectories for unpacked content and baseline + self.temp_dir = tempfile.mkdtemp(prefix="docx_") + self.unpacked_path = Path(self.temp_dir) / "unpacked" + shutil.copytree(self.original_path, self.unpacked_path) + + # Pack original directory into temporary .docx for validation baseline (outside unpacked dir) + self.original_docx = Path(self.temp_dir) / "original.docx" + pack_document(self.original_path, self.original_docx, validate=False) + + self.word_path = self.unpacked_path / "word" + + # Generate RSID if not provided + self.rsid = rsid if rsid else _generate_rsid() + print(f"Using RSID: {self.rsid}") + + # Set default author and initials + self.author = author + self.initials = initials + + # Cache for lazy-loaded editors + self._editors = {} + + # Comment file paths + self.comments_path = self.word_path / "comments.xml" + self.comments_extended_path = self.word_path / "commentsExtended.xml" + self.comments_ids_path = self.word_path / "commentsIds.xml" + self.comments_extensible_path = self.word_path / "commentsExtensible.xml" + + # Load existing comments and determine next ID (before setup modifies files) + self.existing_comments = self._load_existing_comments() + self.next_comment_id = self._get_next_comment_id() + + # Convenient access to document.xml editor (semi-private) + self._document = self["word/document.xml"] + + # Setup tracked changes infrastructure + self._setup_tracking(track_revisions=track_revisions) + + # Add author to people.xml + self._add_author_to_people(author) + + def __getitem__(self, xml_path: str) -> DocxXMLEditor: + """ + Get or create a DocxXMLEditor for the specified XML file. + + Enables lazy-loaded editors with bracket notation: + node = doc["word/document.xml"].get_node(tag="w:p", line_number=42) + + Args: + xml_path: Relative path to XML file (e.g., "word/document.xml", "word/comments.xml") + + Returns: + DocxXMLEditor instance for the specified file + + Raises: + ValueError: If the file does not exist + + Example: + # Get node from document.xml + node = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "1"}) + + # Get node from comments.xml + comment = doc["word/comments.xml"].get_node(tag="w:comment", attrs={"w:id": "0"}) + """ + if xml_path not in self._editors: + file_path = self.unpacked_path / xml_path + if not file_path.exists(): + raise ValueError(f"XML file not found: {xml_path}") + # Use DocxXMLEditor with RSID, author, and initials for all editors + self._editors[xml_path] = DocxXMLEditor( + file_path, rsid=self.rsid, author=self.author, initials=self.initials + ) + return self._editors[xml_path] + + def add_comment(self, start, end, text: str) -> int: + """ + Add a comment spanning from one element to another. + + Args: + start: DOM element for the starting point + end: DOM element for the ending point + text: Comment content + + Returns: + The comment ID that was created + + Example: + start_node = cm.get_document_node(tag="w:del", id="1") + end_node = cm.get_document_node(tag="w:ins", id="2") + cm.add_comment(start=start_node, end=end_node, text="Explanation") + """ + comment_id = self.next_comment_id + para_id = _generate_hex_id() + durable_id = _generate_hex_id() + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + # Add comment ranges to document.xml immediately + self._document.insert_before(start, self._comment_range_start_xml(comment_id)) + + # If end node is a paragraph, append comment markup inside it + # Otherwise insert after it (for run-level anchors) + if end.tagName == "w:p": + self._document.append_to(end, self._comment_range_end_xml(comment_id)) + else: + self._document.insert_after(end, self._comment_range_end_xml(comment_id)) + + # Add to comments.xml immediately + self._add_to_comments_xml( + comment_id, para_id, text, self.author, self.initials, timestamp + ) + + # Add to commentsExtended.xml immediately + self._add_to_comments_extended_xml(para_id, parent_para_id=None) + + # Add to commentsIds.xml immediately + self._add_to_comments_ids_xml(para_id, durable_id) + + # Add to commentsExtensible.xml immediately + self._add_to_comments_extensible_xml(durable_id) + + # Update existing_comments so replies work + self.existing_comments[comment_id] = {"para_id": para_id} + + self.next_comment_id += 1 + return comment_id + + def reply_to_comment( + self, + parent_comment_id: int, + text: str, + ) -> int: + """ + Add a reply to an existing comment. + + Args: + parent_comment_id: The w:id of the parent comment to reply to + text: Reply text + + Returns: + The comment ID that was created for the reply + + Example: + cm.reply_to_comment(parent_comment_id=0, text="I agree with this change") + """ + if parent_comment_id not in self.existing_comments: + raise ValueError(f"Parent comment with id={parent_comment_id} not found") + + parent_info = self.existing_comments[parent_comment_id] + comment_id = self.next_comment_id + para_id = _generate_hex_id() + durable_id = _generate_hex_id() + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + # Add comment ranges to document.xml immediately + parent_start_elem = self._document.get_node( + tag="w:commentRangeStart", attrs={"w:id": str(parent_comment_id)} + ) + parent_ref_elem = self._document.get_node( + tag="w:commentReference", attrs={"w:id": str(parent_comment_id)} + ) + + self._document.insert_after( + parent_start_elem, self._comment_range_start_xml(comment_id) + ) + parent_ref_run = parent_ref_elem.parentNode + self._document.insert_after( + parent_ref_run, f'' + ) + self._document.insert_after( + parent_ref_run, self._comment_ref_run_xml(comment_id) + ) + + # Add to comments.xml immediately + self._add_to_comments_xml( + comment_id, para_id, text, self.author, self.initials, timestamp + ) + + # Add to commentsExtended.xml immediately (with parent) + self._add_to_comments_extended_xml( + para_id, parent_para_id=parent_info["para_id"] + ) + + # Add to commentsIds.xml immediately + self._add_to_comments_ids_xml(para_id, durable_id) + + # Add to commentsExtensible.xml immediately + self._add_to_comments_extensible_xml(durable_id) + + # Update existing_comments so replies work + self.existing_comments[comment_id] = {"para_id": para_id} + + self.next_comment_id += 1 + return comment_id + + def suggest_paragraph(self, xml_content: str) -> str: + """Transform paragraph XML to add tracked change wrapping for insertion. + + Wraps runs in and adds to w:rPr in w:pPr for numbered lists. + + Args: + xml_content: XML string containing a element + + Returns: + str: Transformed XML with tracked change wrapping + """ + return DocxXMLEditor.suggest_paragraph(xml_content) + + def __del__(self): + """Clean up temporary directory on deletion.""" + if hasattr(self, "temp_dir") and Path(self.temp_dir).exists(): + shutil.rmtree(self.temp_dir) + + def validate(self) -> None: + """ + Validate the document against XSD schema and redlining rules. + + Raises: + ValueError: If validation fails. + """ + # Create validators with current state + schema_validator = DOCXSchemaValidator( + self.unpacked_path, self.original_docx, verbose=False + ) + redlining_validator = RedliningValidator( + self.unpacked_path, self.original_docx, verbose=False + ) + + # Run validations + if not schema_validator.validate(): + raise ValueError("Schema validation failed") + if not redlining_validator.validate(): + raise ValueError("Redlining validation failed") + + def save(self, destination=None, validate=True) -> None: + """ + Save all modified XML files to disk and copy to destination directory. + + This persists all changes made via add_comment() and reply_to_comment(). + + Args: + destination: Optional path to save to. If None, saves back to original directory. + validate: If True, validates document before saving (default: True). + """ + # Only ensure comment relationships and content types if comment files exist + if self.comments_path.exists(): + self._ensure_comment_relationships() + self._ensure_comment_content_types() + + # Save all modified XML files in temp directory + for editor in self._editors.values(): + editor.save() + + # Validate by default + if validate: + self.validate() + + # Copy contents from temp directory to destination (or original directory) + target_path = Path(destination) if destination else self.original_path + shutil.copytree(self.unpacked_path, target_path, dirs_exist_ok=True) + + # ==================== Private: Initialization ==================== + + def _get_next_comment_id(self): + """Get the next available comment ID.""" + if not self.comments_path.exists(): + return 0 + + editor = self["word/comments.xml"] + max_id = -1 + for comment_elem in editor.dom.getElementsByTagName("w:comment"): + comment_id = comment_elem.getAttribute("w:id") + if comment_id: + try: + max_id = max(max_id, int(comment_id)) + except ValueError: + pass + return max_id + 1 + + def _load_existing_comments(self): + """Load existing comments from files to enable replies.""" + if not self.comments_path.exists(): + return {} + + editor = self["word/comments.xml"] + existing = {} + + for comment_elem in editor.dom.getElementsByTagName("w:comment"): + comment_id = comment_elem.getAttribute("w:id") + if not comment_id: + continue + + # Find para_id from the w:p element within the comment + para_id = None + for p_elem in comment_elem.getElementsByTagName("w:p"): + para_id = p_elem.getAttribute("w14:paraId") + if para_id: + break + + if not para_id: + continue + + existing[int(comment_id)] = {"para_id": para_id} + + return existing + + # ==================== Private: Setup Methods ==================== + + def _setup_tracking(self, track_revisions=False): + """Set up comment infrastructure in unpacked directory. + + Args: + track_revisions: If True, enables track revisions in settings.xml + """ + # Create or update word/people.xml + people_file = self.word_path / "people.xml" + self._update_people_xml(people_file) + + # Update XML files + self._add_content_type_for_people(self.unpacked_path / "[Content_Types].xml") + self._add_relationship_for_people( + self.word_path / "_rels" / "document.xml.rels" + ) + + # Always add RSID to settings.xml, optionally enable trackRevisions + self._update_settings( + self.word_path / "settings.xml", track_revisions=track_revisions + ) + + def _update_people_xml(self, path): + """Create people.xml if it doesn't exist.""" + if not path.exists(): + # Copy from template + shutil.copy(TEMPLATE_DIR / "people.xml", path) + + def _add_content_type_for_people(self, path): + """Add people.xml content type to [Content_Types].xml if not already present.""" + editor = self["[Content_Types].xml"] + + if self._has_override(editor, "/word/people.xml"): + return + + # Add Override element + root = editor.dom.documentElement + override_xml = '' + editor.append_to(root, override_xml) + + def _add_relationship_for_people(self, path): + """Add people.xml relationship to document.xml.rels if not already present.""" + editor = self["word/_rels/document.xml.rels"] + + if self._has_relationship(editor, "people.xml"): + return + + root = editor.dom.documentElement + root_tag = root.tagName # type: ignore + prefix = root_tag.split(":")[0] + ":" if ":" in root_tag else "" + next_rid = editor.get_next_rid() + + # Create the relationship entry + rel_xml = f'<{prefix}Relationship Id="{next_rid}" Type="http://schemas.microsoft.com/office/2011/relationships/people" Target="people.xml"/>' + editor.append_to(root, rel_xml) + + def _update_settings(self, path, track_revisions=False): + """Add RSID and optionally enable track revisions in settings.xml. + + Args: + path: Path to settings.xml + track_revisions: If True, adds trackRevisions element + + Places elements per OOXML schema order: + - trackRevisions: early (before defaultTabStop) + - rsids: late (after compat) + """ + editor = self["word/settings.xml"] + root = editor.get_node(tag="w:settings") + prefix = root.tagName.split(":")[0] if ":" in root.tagName else "w" + + # Conditionally add trackRevisions if requested + if track_revisions: + track_revisions_exists = any( + elem.tagName == f"{prefix}:trackRevisions" + for elem in editor.dom.getElementsByTagName(f"{prefix}:trackRevisions") + ) + + if not track_revisions_exists: + track_rev_xml = f"<{prefix}:trackRevisions/>" + # Try to insert before documentProtection, defaultTabStop, or at start + inserted = False + for tag in [f"{prefix}:documentProtection", f"{prefix}:defaultTabStop"]: + elements = editor.dom.getElementsByTagName(tag) + if elements: + editor.insert_before(elements[0], track_rev_xml) + inserted = True + break + if not inserted: + # Insert as first child of settings + if root.firstChild: + editor.insert_before(root.firstChild, track_rev_xml) + else: + editor.append_to(root, track_rev_xml) + + # Always check if rsids section exists + rsids_elements = editor.dom.getElementsByTagName(f"{prefix}:rsids") + + if not rsids_elements: + # Add new rsids section + rsids_xml = f'''<{prefix}:rsids> + <{prefix}:rsidRoot {prefix}:val="{self.rsid}"/> + <{prefix}:rsid {prefix}:val="{self.rsid}"/> +''' + + # Try to insert after compat, before clrSchemeMapping, or before closing tag + inserted = False + compat_elements = editor.dom.getElementsByTagName(f"{prefix}:compat") + if compat_elements: + editor.insert_after(compat_elements[0], rsids_xml) + inserted = True + + if not inserted: + clr_elements = editor.dom.getElementsByTagName( + f"{prefix}:clrSchemeMapping" + ) + if clr_elements: + editor.insert_before(clr_elements[0], rsids_xml) + inserted = True + + if not inserted: + editor.append_to(root, rsids_xml) + else: + # Check if this rsid already exists + rsids_elem = rsids_elements[0] + rsid_exists = any( + elem.getAttribute(f"{prefix}:val") == self.rsid + for elem in rsids_elem.getElementsByTagName(f"{prefix}:rsid") + ) + + if not rsid_exists: + rsid_xml = f'<{prefix}:rsid {prefix}:val="{self.rsid}"/>' + editor.append_to(rsids_elem, rsid_xml) + + # ==================== Private: XML File Creation ==================== + + def _add_to_comments_xml( + self, comment_id, para_id, text, author, initials, timestamp + ): + """Add a single comment to comments.xml.""" + if not self.comments_path.exists(): + shutil.copy(TEMPLATE_DIR / "comments.xml", self.comments_path) + + editor = self["word/comments.xml"] + root = editor.get_node(tag="w:comments") + + escaped_text = ( + text.replace("&", "&").replace("<", "<").replace(">", ">") + ) + # Note: w:rsidR, w:rsidRDefault, w:rsidP on w:p, w:rsidR on w:r, + # and w:author, w:date, w:initials on w:comment are automatically added by DocxXMLEditor + comment_xml = f''' + + + {escaped_text} + +''' + editor.append_to(root, comment_xml) + + def _add_to_comments_extended_xml(self, para_id, parent_para_id): + """Add a single comment to commentsExtended.xml.""" + if not self.comments_extended_path.exists(): + shutil.copy( + TEMPLATE_DIR / "commentsExtended.xml", self.comments_extended_path + ) + + editor = self["word/commentsExtended.xml"] + root = editor.get_node(tag="w15:commentsEx") + + if parent_para_id: + xml = f'' + else: + xml = f'' + editor.append_to(root, xml) + + def _add_to_comments_ids_xml(self, para_id, durable_id): + """Add a single comment to commentsIds.xml.""" + if not self.comments_ids_path.exists(): + shutil.copy(TEMPLATE_DIR / "commentsIds.xml", self.comments_ids_path) + + editor = self["word/commentsIds.xml"] + root = editor.get_node(tag="w16cid:commentsIds") + + xml = f'' + editor.append_to(root, xml) + + def _add_to_comments_extensible_xml(self, durable_id): + """Add a single comment to commentsExtensible.xml.""" + if not self.comments_extensible_path.exists(): + shutil.copy( + TEMPLATE_DIR / "commentsExtensible.xml", self.comments_extensible_path + ) + + editor = self["word/commentsExtensible.xml"] + root = editor.get_node(tag="w16cex:commentsExtensible") + + xml = f'' + editor.append_to(root, xml) + + # ==================== Private: XML Fragments ==================== + + def _comment_range_start_xml(self, comment_id): + """Generate XML for comment range start.""" + return f'' + + def _comment_range_end_xml(self, comment_id): + """Generate XML for comment range end with reference run. + + Note: w:rsidR is automatically added by DocxXMLEditor. + """ + return f''' + + + +''' + + def _comment_ref_run_xml(self, comment_id): + """Generate XML for comment reference run. + + Note: w:rsidR is automatically added by DocxXMLEditor. + """ + return f''' + + +''' + + # ==================== Private: Metadata Updates ==================== + + def _has_relationship(self, editor, target): + """Check if a relationship with given target exists.""" + for rel_elem in editor.dom.getElementsByTagName("Relationship"): + if rel_elem.getAttribute("Target") == target: + return True + return False + + def _has_override(self, editor, part_name): + """Check if an override with given part name exists.""" + for override_elem in editor.dom.getElementsByTagName("Override"): + if override_elem.getAttribute("PartName") == part_name: + return True + return False + + def _has_author(self, editor, author): + """Check if an author already exists in people.xml.""" + for person_elem in editor.dom.getElementsByTagName("w15:person"): + if person_elem.getAttribute("w15:author") == author: + return True + return False + + def _add_author_to_people(self, author): + """Add author to people.xml (called during initialization).""" + people_path = self.word_path / "people.xml" + + # people.xml should already exist from _setup_tracking + if not people_path.exists(): + raise ValueError("people.xml should exist after _setup_tracking") + + editor = self["word/people.xml"] + root = editor.get_node(tag="w15:people") + + # Check if author already exists + if self._has_author(editor, author): + return + + # Add author with proper XML escaping to prevent injection + escaped_author = html.escape(author, quote=True) + person_xml = f''' + +''' + editor.append_to(root, person_xml) + + def _ensure_comment_relationships(self): + """Ensure word/_rels/document.xml.rels has comment relationships.""" + editor = self["word/_rels/document.xml.rels"] + + if self._has_relationship(editor, "comments.xml"): + return + + root = editor.dom.documentElement + root_tag = root.tagName # type: ignore + prefix = root_tag.split(":")[0] + ":" if ":" in root_tag else "" + next_rid_num = int(editor.get_next_rid()[3:]) + + # Add relationship elements + rels = [ + ( + next_rid_num, + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments", + "comments.xml", + ), + ( + next_rid_num + 1, + "http://schemas.microsoft.com/office/2011/relationships/commentsExtended", + "commentsExtended.xml", + ), + ( + next_rid_num + 2, + "http://schemas.microsoft.com/office/2016/09/relationships/commentsIds", + "commentsIds.xml", + ), + ( + next_rid_num + 3, + "http://schemas.microsoft.com/office/2018/08/relationships/commentsExtensible", + "commentsExtensible.xml", + ), + ] + + for rel_id, rel_type, target in rels: + rel_xml = f'<{prefix}Relationship Id="rId{rel_id}" Type="{rel_type}" Target="{target}"/>' + editor.append_to(root, rel_xml) + + def _ensure_comment_content_types(self): + """Ensure [Content_Types].xml has comment content types.""" + editor = self["[Content_Types].xml"] + + if self._has_override(editor, "/word/comments.xml"): + return + + root = editor.dom.documentElement + + # Add Override elements + overrides = [ + ( + "/word/comments.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml", + ), + ( + "/word/commentsExtended.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.commentsExtended+xml", + ), + ( + "/word/commentsIds.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.commentsIds+xml", + ), + ( + "/word/commentsExtensible.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.commentsExtensible+xml", + ), + ] + + for part_name, content_type in overrides: + override_xml = ( + f'' + ) + editor.append_to(root, override_xml) diff --git a/skills/docx/scripts/templates/comments.xml b/skills/docx/scripts/templates/comments.xml new file mode 100644 index 0000000..b5dace0 --- /dev/null +++ b/skills/docx/scripts/templates/comments.xml @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/skills/docx/scripts/templates/commentsExtended.xml b/skills/docx/scripts/templates/commentsExtended.xml new file mode 100644 index 0000000..b4cf23e --- /dev/null +++ b/skills/docx/scripts/templates/commentsExtended.xml @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/skills/docx/scripts/templates/commentsExtensible.xml b/skills/docx/scripts/templates/commentsExtensible.xml new file mode 100644 index 0000000..e32a05e --- /dev/null +++ b/skills/docx/scripts/templates/commentsExtensible.xml @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/skills/docx/scripts/templates/commentsIds.xml b/skills/docx/scripts/templates/commentsIds.xml new file mode 100644 index 0000000..d04bc8e --- /dev/null +++ b/skills/docx/scripts/templates/commentsIds.xml @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/skills/docx/scripts/templates/people.xml b/skills/docx/scripts/templates/people.xml new file mode 100644 index 0000000..a839caf --- /dev/null +++ b/skills/docx/scripts/templates/people.xml @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/skills/docx/scripts/utilities.py b/skills/docx/scripts/utilities.py new file mode 100644 index 0000000..d92dae6 --- /dev/null +++ b/skills/docx/scripts/utilities.py @@ -0,0 +1,374 @@ +#!/usr/bin/env python3 +""" +Utilities for editing OOXML documents. + +This module provides XMLEditor, a tool for manipulating XML files with support for +line-number-based node finding and DOM manipulation. Each element is automatically +annotated with its original line and column position during parsing. + +Example usage: + editor = XMLEditor("document.xml") + + # Find node by line number or range + elem = editor.get_node(tag="w:r", line_number=519) + elem = editor.get_node(tag="w:p", line_number=range(100, 200)) + + # Find node by text content + elem = editor.get_node(tag="w:p", contains="specific text") + + # Find node by attributes + elem = editor.get_node(tag="w:r", attrs={"w:id": "target"}) + + # Combine filters + elem = editor.get_node(tag="w:p", line_number=range(1, 50), contains="text") + + # Replace, insert, or manipulate + new_elem = editor.replace_node(elem, "new text") + editor.insert_after(new_elem, "more") + + # Save changes + editor.save() +""" + +import html +from pathlib import Path +from typing import Optional, Union + +import defusedxml.minidom +import defusedxml.sax + + +class XMLEditor: + """ + Editor for manipulating OOXML XML files with line-number-based node finding. + + This class parses XML files and tracks the original line and column position + of each element. This enables finding nodes by their line number in the original + file, which is useful when working with Read tool output. + + Attributes: + xml_path: Path to the XML file being edited + encoding: Detected encoding of the XML file ('ascii' or 'utf-8') + dom: Parsed DOM tree with parse_position attributes on elements + """ + + def __init__(self, xml_path): + """ + Initialize with path to XML file and parse with line number tracking. + + Args: + xml_path: Path to XML file to edit (str or Path) + + Raises: + ValueError: If the XML file does not exist + """ + self.xml_path = Path(xml_path) + if not self.xml_path.exists(): + raise ValueError(f"XML file not found: {xml_path}") + + with open(self.xml_path, "rb") as f: + header = f.read(200).decode("utf-8", errors="ignore") + self.encoding = "ascii" if 'encoding="ascii"' in header else "utf-8" + + parser = _create_line_tracking_parser() + self.dom = defusedxml.minidom.parse(str(self.xml_path), parser) + + def get_node( + self, + tag: str, + attrs: Optional[dict[str, str]] = None, + line_number: Optional[Union[int, range]] = None, + contains: Optional[str] = None, + ): + """ + Get a DOM element by tag and identifier. + + Finds an element by either its line number in the original file or by + matching attribute values. Exactly one match must be found. + + Args: + tag: The XML tag name (e.g., "w:del", "w:ins", "w:r") + attrs: Dictionary of attribute name-value pairs to match (e.g., {"w:id": "1"}) + line_number: Line number (int) or line range (range) in original XML file (1-indexed) + contains: Text string that must appear in any text node within the element. + Supports both entity notation (“) and Unicode characters (\u201c). + + Returns: + defusedxml.minidom.Element: The matching DOM element + + Raises: + ValueError: If node not found or multiple matches found + + Example: + elem = editor.get_node(tag="w:r", line_number=519) + elem = editor.get_node(tag="w:r", line_number=range(100, 200)) + elem = editor.get_node(tag="w:del", attrs={"w:id": "1"}) + elem = editor.get_node(tag="w:p", attrs={"w14:paraId": "12345678"}) + elem = editor.get_node(tag="w:commentRangeStart", attrs={"w:id": "0"}) + elem = editor.get_node(tag="w:p", contains="specific text") + elem = editor.get_node(tag="w:t", contains="“Agreement") # Entity notation + elem = editor.get_node(tag="w:t", contains="\u201cAgreement") # Unicode character + """ + matches = [] + for elem in self.dom.getElementsByTagName(tag): + # Check line_number filter + if line_number is not None: + parse_pos = getattr(elem, "parse_position", (None,)) + elem_line = parse_pos[0] + + # Handle both single line number and range + if isinstance(line_number, range): + if elem_line not in line_number: + continue + else: + if elem_line != line_number: + continue + + # Check attrs filter + if attrs is not None: + if not all( + elem.getAttribute(attr_name) == attr_value + for attr_name, attr_value in attrs.items() + ): + continue + + # Check contains filter + if contains is not None: + elem_text = self._get_element_text(elem) + # Normalize the search string: convert HTML entities to Unicode characters + # This allows searching for both "“Rowan" and ""Rowan" + normalized_contains = html.unescape(contains) + if normalized_contains not in elem_text: + continue + + # If all applicable filters passed, this is a match + matches.append(elem) + + if not matches: + # Build descriptive error message + filters = [] + if line_number is not None: + line_str = ( + f"lines {line_number.start}-{line_number.stop - 1}" + if isinstance(line_number, range) + else f"line {line_number}" + ) + filters.append(f"at {line_str}") + if attrs is not None: + filters.append(f"with attributes {attrs}") + if contains is not None: + filters.append(f"containing '{contains}'") + + filter_desc = " ".join(filters) if filters else "" + base_msg = f"Node not found: <{tag}> {filter_desc}".strip() + + # Add helpful hint based on filters used + if contains: + hint = "Text may be split across elements or use different wording." + elif line_number: + hint = "Line numbers may have changed if document was modified." + elif attrs: + hint = "Verify attribute values are correct." + else: + hint = "Try adding filters (attrs, line_number, or contains)." + + raise ValueError(f"{base_msg}. {hint}") + if len(matches) > 1: + raise ValueError( + f"Multiple nodes found: <{tag}>. " + f"Add more filters (attrs, line_number, or contains) to narrow the search." + ) + return matches[0] + + def _get_element_text(self, elem): + """ + Recursively extract all text content from an element. + + Skips text nodes that contain only whitespace (spaces, tabs, newlines), + which typically represent XML formatting rather than document content. + + Args: + elem: defusedxml.minidom.Element to extract text from + + Returns: + str: Concatenated text from all non-whitespace text nodes within the element + """ + text_parts = [] + for node in elem.childNodes: + if node.nodeType == node.TEXT_NODE: + # Skip whitespace-only text nodes (XML formatting) + if node.data.strip(): + text_parts.append(node.data) + elif node.nodeType == node.ELEMENT_NODE: + text_parts.append(self._get_element_text(node)) + return "".join(text_parts) + + def replace_node(self, elem, new_content): + """ + Replace a DOM element with new XML content. + + Args: + elem: defusedxml.minidom.Element to replace + new_content: String containing XML to replace the node with + + Returns: + List[defusedxml.minidom.Node]: All inserted nodes + + Example: + new_nodes = editor.replace_node(old_elem, "text") + """ + parent = elem.parentNode + nodes = self._parse_fragment(new_content) + for node in nodes: + parent.insertBefore(node, elem) + parent.removeChild(elem) + return nodes + + def insert_after(self, elem, xml_content): + """ + Insert XML content after a DOM element. + + Args: + elem: defusedxml.minidom.Element to insert after + xml_content: String containing XML to insert + + Returns: + List[defusedxml.minidom.Node]: All inserted nodes + + Example: + new_nodes = editor.insert_after(elem, "text") + """ + parent = elem.parentNode + next_sibling = elem.nextSibling + nodes = self._parse_fragment(xml_content) + for node in nodes: + if next_sibling: + parent.insertBefore(node, next_sibling) + else: + parent.appendChild(node) + return nodes + + def insert_before(self, elem, xml_content): + """ + Insert XML content before a DOM element. + + Args: + elem: defusedxml.minidom.Element to insert before + xml_content: String containing XML to insert + + Returns: + List[defusedxml.minidom.Node]: All inserted nodes + + Example: + new_nodes = editor.insert_before(elem, "text") + """ + parent = elem.parentNode + nodes = self._parse_fragment(xml_content) + for node in nodes: + parent.insertBefore(node, elem) + return nodes + + def append_to(self, elem, xml_content): + """ + Append XML content as a child of a DOM element. + + Args: + elem: defusedxml.minidom.Element to append to + xml_content: String containing XML to append + + Returns: + List[defusedxml.minidom.Node]: All inserted nodes + + Example: + new_nodes = editor.append_to(elem, "text") + """ + nodes = self._parse_fragment(xml_content) + for node in nodes: + elem.appendChild(node) + return nodes + + def get_next_rid(self): + """Get the next available rId for relationships files.""" + max_id = 0 + for rel_elem in self.dom.getElementsByTagName("Relationship"): + rel_id = rel_elem.getAttribute("Id") + if rel_id.startswith("rId"): + try: + max_id = max(max_id, int(rel_id[3:])) + except ValueError: + pass + return f"rId{max_id + 1}" + + def save(self): + """ + Save the edited XML back to the file. + + Serializes the DOM tree and writes it back to the original file path, + preserving the original encoding (ascii or utf-8). + """ + content = self.dom.toxml(encoding=self.encoding) + self.xml_path.write_bytes(content) + + def _parse_fragment(self, xml_content): + """ + Parse XML fragment and return list of imported nodes. + + Args: + xml_content: String containing XML fragment + + Returns: + List of defusedxml.minidom.Node objects imported into this document + + Raises: + AssertionError: If fragment contains no element nodes + """ + # Extract namespace declarations from the root document element + root_elem = self.dom.documentElement + namespaces = [] + if root_elem and root_elem.attributes: + for i in range(root_elem.attributes.length): + attr = root_elem.attributes.item(i) + if attr.name.startswith("xmlns"): # type: ignore + namespaces.append(f'{attr.name}="{attr.value}"') # type: ignore + + ns_decl = " ".join(namespaces) + wrapper = f"{xml_content}" + fragment_doc = defusedxml.minidom.parseString(wrapper) + nodes = [ + self.dom.importNode(child, deep=True) + for child in fragment_doc.documentElement.childNodes # type: ignore + ] + elements = [n for n in nodes if n.nodeType == n.ELEMENT_NODE] + assert elements, "Fragment must contain at least one element" + return nodes + + +def _create_line_tracking_parser(): + """ + Create a SAX parser that tracks line and column numbers for each element. + + Monkey patches the SAX content handler to store the current line and column + position from the underlying expat parser onto each element as a parse_position + attribute (line, column) tuple. + + Returns: + defusedxml.sax.xmlreader.XMLReader: Configured SAX parser + """ + + def set_content_handler(dom_handler): + def startElementNS(name, tagName, attrs): + orig_start_cb(name, tagName, attrs) + cur_elem = dom_handler.elementStack[-1] + cur_elem.parse_position = ( + parser._parser.CurrentLineNumber, # type: ignore + parser._parser.CurrentColumnNumber, # type: ignore + ) + + orig_start_cb = dom_handler.startElementNS + dom_handler.startElementNS = startElementNS + orig_set_content_handler(dom_handler) + + parser = defusedxml.sax.make_parser() + orig_set_content_handler = parser.setContentHandler + parser.setContentHandler = set_content_handler # type: ignore + return parser diff --git a/skills/pdf/FORMS.md b/skills/pdf/FORMS.md new file mode 100644 index 0000000..4e23450 --- /dev/null +++ b/skills/pdf/FORMS.md @@ -0,0 +1,205 @@ +**CRITICAL: You MUST complete these steps in order. Do not skip ahead to writing code.** + +If you need to fill out a PDF form, first check to see if the PDF has fillable form fields. Run this script from this file's directory: + `python scripts/check_fillable_fields `, and depending on the result go to either the "Fillable fields" or "Non-fillable fields" and follow those instructions. + +# Fillable fields +If the PDF has fillable form fields: +- Run this script from this file's directory: `python scripts/extract_form_field_info.py `. It will create a JSON file with a list of fields in this format: +``` +[ + { + "field_id": (unique ID for the field), + "page": (page number, 1-based), + "rect": ([left, bottom, right, top] bounding box in PDF coordinates, y=0 is the bottom of the page), + "type": ("text", "checkbox", "radio_group", or "choice"), + }, + // Checkboxes have "checked_value" and "unchecked_value" properties: + { + "field_id": (unique ID for the field), + "page": (page number, 1-based), + "type": "checkbox", + "checked_value": (Set the field to this value to check the checkbox), + "unchecked_value": (Set the field to this value to uncheck the checkbox), + }, + // Radio groups have a "radio_options" list with the possible choices. + { + "field_id": (unique ID for the field), + "page": (page number, 1-based), + "type": "radio_group", + "radio_options": [ + { + "value": (set the field to this value to select this radio option), + "rect": (bounding box for the radio button for this option) + }, + // Other radio options + ] + }, + // Multiple choice fields have a "choice_options" list with the possible choices: + { + "field_id": (unique ID for the field), + "page": (page number, 1-based), + "type": "choice", + "choice_options": [ + { + "value": (set the field to this value to select this option), + "text": (display text of the option) + }, + // Other choice options + ], + } +] +``` +- Convert the PDF to PNGs (one image for each page) with this script (run from this file's directory): +`python scripts/convert_pdf_to_images.py ` +Then analyze the images to determine the purpose of each form field (make sure to convert the bounding box PDF coordinates to image coordinates). +- Create a `field_values.json` file in this format with the values to be entered for each field: +``` +[ + { + "field_id": "last_name", // Must match the field_id from `extract_form_field_info.py` + "description": "The user's last name", + "page": 1, // Must match the "page" value in field_info.json + "value": "Simpson" + }, + { + "field_id": "Checkbox12", + "description": "Checkbox to be checked if the user is 18 or over", + "page": 1, + "value": "/On" // If this is a checkbox, use its "checked_value" value to check it. If it's a radio button group, use one of the "value" values in "radio_options". + }, + // more fields +] +``` +- Run the `fill_fillable_fields.py` script from this file's directory to create a filled-in PDF: +`python scripts/fill_fillable_fields.py ` +This script will verify that the field IDs and values you provide are valid; if it prints error messages, correct the appropriate fields and try again. + +# Non-fillable fields +If the PDF doesn't have fillable form fields, you'll need to visually determine where the data should be added and create text annotations. Follow the below steps *exactly*. You MUST perform all of these steps to ensure that the the form is accurately completed. Details for each step are below. +- Convert the PDF to PNG images and determine field bounding boxes. +- Create a JSON file with field information and validation images showing the bounding boxes. +- Validate the the bounding boxes. +- Use the bounding boxes to fill in the form. + +## Step 1: Visual Analysis (REQUIRED) +- Convert the PDF to PNG images. Run this script from this file's directory: +`python scripts/convert_pdf_to_images.py ` +The script will create a PNG image for each page in the PDF. +- Carefully examine each PNG image and identify all form fields and areas where the user should enter data. For each form field where the user should enter text, determine bounding boxes for both the form field label, and the area where the user should enter text. The label and entry bounding boxes MUST NOT INTERSECT; the text entry box should only include the area where data should be entered. Usually this area will be immediately to the side, above, or below its label. Entry bounding boxes must be tall and wide enough to contain their text. + +These are some examples of form structures that you might see: + +*Label inside box* +``` +┌────────────────────────┐ +│ Name: │ +└────────────────────────┘ +``` +The input area should be to the right of the "Name" label and extend to the edge of the box. + +*Label before line* +``` +Email: _______________________ +``` +The input area should be above the line and include its entire width. + +*Label under line* +``` +_________________________ +Name +``` +The input area should be above the line and include the entire width of the line. This is common for signature and date fields. + +*Label above line* +``` +Please enter any special requests: +________________________________________________ +``` +The input area should extend from the bottom of the label to the line, and should include the entire width of the line. + +*Checkboxes* +``` +Are you a US citizen? Yes □ No □ +``` +For checkboxes: +- Look for small square boxes (□) - these are the actual checkboxes to target. They may be to the left or right of their labels. +- Distinguish between label text ("Yes", "No") and the clickable checkbox squares. +- The entry bounding box should cover ONLY the small square, not the text label. + +### Step 2: Create fields.json and validation images (REQUIRED) +- Create a file named `fields.json` with information for the form fields and bounding boxes in this format: +``` +{ + "pages": [ + { + "page_number": 1, + "image_width": (first page image width in pixels), + "image_height": (first page image height in pixels), + }, + { + "page_number": 2, + "image_width": (second page image width in pixels), + "image_height": (second page image height in pixels), + } + // additional pages + ], + "form_fields": [ + // Example for a text field. + { + "page_number": 1, + "description": "The user's last name should be entered here", + // Bounding boxes are [left, top, right, bottom]. The bounding boxes for the label and text entry should not overlap. + "field_label": "Last name", + "label_bounding_box": [30, 125, 95, 142], + "entry_bounding_box": [100, 125, 280, 142], + "entry_text": { + "text": "Johnson", // This text will be added as an annotation at the entry_bounding_box location + "font_size": 14, // optional, defaults to 14 + "font_color": "000000", // optional, RRGGBB format, defaults to 000000 (black) + } + }, + // Example for a checkbox. TARGET THE SQUARE for the entry bounding box, NOT THE TEXT + { + "page_number": 2, + "description": "Checkbox that should be checked if the user is over 18", + "entry_bounding_box": [140, 525, 155, 540], // Small box over checkbox square + "field_label": "Yes", + "label_bounding_box": [100, 525, 132, 540], // Box containing "Yes" text + // Use "X" to check a checkbox. + "entry_text": { + "text": "X", + } + } + // additional form field entries + ] +} +``` + +Create validation images by running this script from this file's directory for each page: +`python scripts/create_validation_image.py + +The validation images will have red rectangles where text should be entered, and blue rectangles covering label text. + +### Step 3: Validate Bounding Boxes (REQUIRED) +#### Automated intersection check +- Verify that none of bounding boxes intersect and that the entry bounding boxes are tall enough by checking the fields.json file with the `check_bounding_boxes.py` script (run from this file's directory): +`python scripts/check_bounding_boxes.py ` + +If there are errors, reanalyze the relevant fields, adjust the bounding boxes, and iterate until there are no remaining errors. Remember: label (blue) bounding boxes should contain text labels, entry (red) boxes should not. + +#### Manual image inspection +**CRITICAL: Do not proceed without visually inspecting validation images** +- Red rectangles must ONLY cover input areas +- Red rectangles MUST NOT contain any text +- Blue rectangles should contain label text +- For checkboxes: + - Red rectangle MUST be centered on the checkbox square + - Blue rectangle should cover the text label for the checkbox + +- If any rectangles look wrong, fix fields.json, regenerate the validation images, and verify again. Repeat this process until the bounding boxes are fully accurate. + + +### Step 4: Add annotations to the PDF +Run this script from this file's directory to create a filled-out PDF using the information in fields.json: +`python scripts/fill_pdf_form_with_annotations.py diff --git a/skills/pdf/LICENSE.txt b/skills/pdf/LICENSE.txt new file mode 100644 index 0000000..c55ab42 --- /dev/null +++ b/skills/pdf/LICENSE.txt @@ -0,0 +1,30 @@ +© 2025 Anthropic, PBC. All rights reserved. + +LICENSE: Use of these materials (including all code, prompts, assets, files, +and other components of this Skill) is governed by your agreement with +Anthropic regarding use of Anthropic's services. If no separate agreement +exists, use is governed by Anthropic's Consumer Terms of Service or +Commercial Terms of Service, as applicable: +https://www.anthropic.com/legal/consumer-terms +https://www.anthropic.com/legal/commercial-terms +Your applicable agreement is referred to as the "Agreement." "Services" are +as defined in the Agreement. + +ADDITIONAL RESTRICTIONS: Notwithstanding anything in the Agreement to the +contrary, users may not: + +- Extract these materials from the Services or retain copies of these + materials outside the Services +- Reproduce or copy these materials, except for temporary copies created + automatically during authorized use of the Services +- Create derivative works based on these materials +- Distribute, sublicense, or transfer these materials to any third party +- Make, offer to sell, sell, or import any inventions embodied in these + materials +- Reverse engineer, decompile, or disassemble these materials + +The receipt, viewing, or possession of these materials does not convey or +imply any license or right beyond those expressly granted above. + +Anthropic retains all right, title, and interest in these materials, +including all copyrights, patents, and other intellectual property rights. diff --git a/skills/pdf/REFERENCE.md b/skills/pdf/REFERENCE.md new file mode 100644 index 0000000..41400bf --- /dev/null +++ b/skills/pdf/REFERENCE.md @@ -0,0 +1,612 @@ +# PDF Processing Advanced Reference + +This document contains advanced PDF processing features, detailed examples, and additional libraries not covered in the main skill instructions. + +## pypdfium2 Library (Apache/BSD License) + +### Overview +pypdfium2 is a Python binding for PDFium (Chromium's PDF library). It's excellent for fast PDF rendering, image generation, and serves as a PyMuPDF replacement. + +### Render PDF to Images +```python +import pypdfium2 as pdfium +from PIL import Image + +# Load PDF +pdf = pdfium.PdfDocument("document.pdf") + +# Render page to image +page = pdf[0] # First page +bitmap = page.render( + scale=2.0, # Higher resolution + rotation=0 # No rotation +) + +# Convert to PIL Image +img = bitmap.to_pil() +img.save("page_1.png", "PNG") + +# Process multiple pages +for i, page in enumerate(pdf): + bitmap = page.render(scale=1.5) + img = bitmap.to_pil() + img.save(f"page_{i+1}.jpg", "JPEG", quality=90) +``` + +### Extract Text with pypdfium2 +```python +import pypdfium2 as pdfium + +pdf = pdfium.PdfDocument("document.pdf") +for i, page in enumerate(pdf): + text = page.get_text() + print(f"Page {i+1} text length: {len(text)} chars") +``` + +## JavaScript Libraries + +### pdf-lib (MIT License) + +pdf-lib is a powerful JavaScript library for creating and modifying PDF documents in any JavaScript environment. + +#### Load and Manipulate Existing PDF +```javascript +import { PDFDocument } from 'pdf-lib'; +import fs from 'fs'; + +async function manipulatePDF() { + // Load existing PDF + const existingPdfBytes = fs.readFileSync('input.pdf'); + const pdfDoc = await PDFDocument.load(existingPdfBytes); + + // Get page count + const pageCount = pdfDoc.getPageCount(); + console.log(`Document has ${pageCount} pages`); + + // Add new page + const newPage = pdfDoc.addPage([600, 400]); + newPage.drawText('Added by pdf-lib', { + x: 100, + y: 300, + size: 16 + }); + + // Save modified PDF + const pdfBytes = await pdfDoc.save(); + fs.writeFileSync('modified.pdf', pdfBytes); +} +``` + +#### Create Complex PDFs from Scratch +```javascript +import { PDFDocument, rgb, StandardFonts } from 'pdf-lib'; +import fs from 'fs'; + +async function createPDF() { + const pdfDoc = await PDFDocument.create(); + + // Add fonts + const helveticaFont = await pdfDoc.embedFont(StandardFonts.Helvetica); + const helveticaBold = await pdfDoc.embedFont(StandardFonts.HelveticaBold); + + // Add page + const page = pdfDoc.addPage([595, 842]); // A4 size + const { width, height } = page.getSize(); + + // Add text with styling + page.drawText('Invoice #12345', { + x: 50, + y: height - 50, + size: 18, + font: helveticaBold, + color: rgb(0.2, 0.2, 0.8) + }); + + // Add rectangle (header background) + page.drawRectangle({ + x: 40, + y: height - 100, + width: width - 80, + height: 30, + color: rgb(0.9, 0.9, 0.9) + }); + + // Add table-like content + const items = [ + ['Item', 'Qty', 'Price', 'Total'], + ['Widget', '2', '$50', '$100'], + ['Gadget', '1', '$75', '$75'] + ]; + + let yPos = height - 150; + items.forEach(row => { + let xPos = 50; + row.forEach(cell => { + page.drawText(cell, { + x: xPos, + y: yPos, + size: 12, + font: helveticaFont + }); + xPos += 120; + }); + yPos -= 25; + }); + + const pdfBytes = await pdfDoc.save(); + fs.writeFileSync('created.pdf', pdfBytes); +} +``` + +#### Advanced Merge and Split Operations +```javascript +import { PDFDocument } from 'pdf-lib'; +import fs from 'fs'; + +async function mergePDFs() { + // Create new document + const mergedPdf = await PDFDocument.create(); + + // Load source PDFs + const pdf1Bytes = fs.readFileSync('doc1.pdf'); + const pdf2Bytes = fs.readFileSync('doc2.pdf'); + + const pdf1 = await PDFDocument.load(pdf1Bytes); + const pdf2 = await PDFDocument.load(pdf2Bytes); + + // Copy pages from first PDF + const pdf1Pages = await mergedPdf.copyPages(pdf1, pdf1.getPageIndices()); + pdf1Pages.forEach(page => mergedPdf.addPage(page)); + + // Copy specific pages from second PDF (pages 0, 2, 4) + const pdf2Pages = await mergedPdf.copyPages(pdf2, [0, 2, 4]); + pdf2Pages.forEach(page => mergedPdf.addPage(page)); + + const mergedPdfBytes = await mergedPdf.save(); + fs.writeFileSync('merged.pdf', mergedPdfBytes); +} +``` + +### pdfjs-dist (Apache License) + +PDF.js is Mozilla's JavaScript library for rendering PDFs in the browser. + +#### Basic PDF Loading and Rendering +```javascript +import * as pdfjsLib from 'pdfjs-dist'; + +// Configure worker (important for performance) +pdfjsLib.GlobalWorkerOptions.workerSrc = './pdf.worker.js'; + +async function renderPDF() { + // Load PDF + const loadingTask = pdfjsLib.getDocument('document.pdf'); + const pdf = await loadingTask.promise; + + console.log(`Loaded PDF with ${pdf.numPages} pages`); + + // Get first page + const page = await pdf.getPage(1); + const viewport = page.getViewport({ scale: 1.5 }); + + // Render to canvas + const canvas = document.createElement('canvas'); + const context = canvas.getContext('2d'); + canvas.height = viewport.height; + canvas.width = viewport.width; + + const renderContext = { + canvasContext: context, + viewport: viewport + }; + + await page.render(renderContext).promise; + document.body.appendChild(canvas); +} +``` + +#### Extract Text with Coordinates +```javascript +import * as pdfjsLib from 'pdfjs-dist'; + +async function extractText() { + const loadingTask = pdfjsLib.getDocument('document.pdf'); + const pdf = await loadingTask.promise; + + let fullText = ''; + + // Extract text from all pages + for (let i = 1; i <= pdf.numPages; i++) { + const page = await pdf.getPage(i); + const textContent = await page.getTextContent(); + + const pageText = textContent.items + .map(item => item.str) + .join(' '); + + fullText += `\n--- Page ${i} ---\n${pageText}`; + + // Get text with coordinates for advanced processing + const textWithCoords = textContent.items.map(item => ({ + text: item.str, + x: item.transform[4], + y: item.transform[5], + width: item.width, + height: item.height + })); + } + + console.log(fullText); + return fullText; +} +``` + +#### Extract Annotations and Forms +```javascript +import * as pdfjsLib from 'pdfjs-dist'; + +async function extractAnnotations() { + const loadingTask = pdfjsLib.getDocument('annotated.pdf'); + const pdf = await loadingTask.promise; + + for (let i = 1; i <= pdf.numPages; i++) { + const page = await pdf.getPage(i); + const annotations = await page.getAnnotations(); + + annotations.forEach(annotation => { + console.log(`Annotation type: ${annotation.subtype}`); + console.log(`Content: ${annotation.contents}`); + console.log(`Coordinates: ${JSON.stringify(annotation.rect)}`); + }); + } +} +``` + +## Advanced Command-Line Operations + +### poppler-utils Advanced Features + +#### Extract Text with Bounding Box Coordinates +```bash +# Extract text with bounding box coordinates (essential for structured data) +pdftotext -bbox-layout document.pdf output.xml + +# The XML output contains precise coordinates for each text element +``` + +#### Advanced Image Conversion +```bash +# Convert to PNG images with specific resolution +pdftoppm -png -r 300 document.pdf output_prefix + +# Convert specific page range with high resolution +pdftoppm -png -r 600 -f 1 -l 3 document.pdf high_res_pages + +# Convert to JPEG with quality setting +pdftoppm -jpeg -jpegopt quality=85 -r 200 document.pdf jpeg_output +``` + +#### Extract Embedded Images +```bash +# Extract all embedded images with metadata +pdfimages -j -p document.pdf page_images + +# List image info without extracting +pdfimages -list document.pdf + +# Extract images in their original format +pdfimages -all document.pdf images/img +``` + +### qpdf Advanced Features + +#### Complex Page Manipulation +```bash +# Split PDF into groups of pages +qpdf --split-pages=3 input.pdf output_group_%02d.pdf + +# Extract specific pages with complex ranges +qpdf input.pdf --pages input.pdf 1,3-5,8,10-end -- extracted.pdf + +# Merge specific pages from multiple PDFs +qpdf --empty --pages doc1.pdf 1-3 doc2.pdf 5-7 doc3.pdf 2,4 -- combined.pdf +``` + +#### PDF Optimization and Repair +```bash +# Optimize PDF for web (linearize for streaming) +qpdf --linearize input.pdf optimized.pdf + +# Remove unused objects and compress +qpdf --optimize-level=all input.pdf compressed.pdf + +# Attempt to repair corrupted PDF structure +qpdf --check input.pdf +qpdf --fix-qdf damaged.pdf repaired.pdf + +# Show detailed PDF structure for debugging +qpdf --show-all-pages input.pdf > structure.txt +``` + +#### Advanced Encryption +```bash +# Add password protection with specific permissions +qpdf --encrypt user_pass owner_pass 256 --print=none --modify=none -- input.pdf encrypted.pdf + +# Check encryption status +qpdf --show-encryption encrypted.pdf + +# Remove password protection (requires password) +qpdf --password=secret123 --decrypt encrypted.pdf decrypted.pdf +``` + +## Advanced Python Techniques + +### pdfplumber Advanced Features + +#### Extract Text with Precise Coordinates +```python +import pdfplumber + +with pdfplumber.open("document.pdf") as pdf: + page = pdf.pages[0] + + # Extract all text with coordinates + chars = page.chars + for char in chars[:10]: # First 10 characters + print(f"Char: '{char['text']}' at x:{char['x0']:.1f} y:{char['y0']:.1f}") + + # Extract text by bounding box (left, top, right, bottom) + bbox_text = page.within_bbox((100, 100, 400, 200)).extract_text() +``` + +#### Advanced Table Extraction with Custom Settings +```python +import pdfplumber +import pandas as pd + +with pdfplumber.open("complex_table.pdf") as pdf: + page = pdf.pages[0] + + # Extract tables with custom settings for complex layouts + table_settings = { + "vertical_strategy": "lines", + "horizontal_strategy": "lines", + "snap_tolerance": 3, + "intersection_tolerance": 15 + } + tables = page.extract_tables(table_settings) + + # Visual debugging for table extraction + img = page.to_image(resolution=150) + img.save("debug_layout.png") +``` + +### reportlab Advanced Features + +#### Create Professional Reports with Tables +```python +from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph +from reportlab.lib.styles import getSampleStyleSheet +from reportlab.lib import colors + +# Sample data +data = [ + ['Product', 'Q1', 'Q2', 'Q3', 'Q4'], + ['Widgets', '120', '135', '142', '158'], + ['Gadgets', '85', '92', '98', '105'] +] + +# Create PDF with table +doc = SimpleDocTemplate("report.pdf") +elements = [] + +# Add title +styles = getSampleStyleSheet() +title = Paragraph("Quarterly Sales Report", styles['Title']) +elements.append(title) + +# Add table with advanced styling +table = Table(data) +table.setStyle(TableStyle([ + ('BACKGROUND', (0, 0), (-1, 0), colors.grey), + ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), + ('ALIGN', (0, 0), (-1, -1), 'CENTER'), + ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'), + ('FONTSIZE', (0, 0), (-1, 0), 14), + ('BOTTOMPADDING', (0, 0), (-1, 0), 12), + ('BACKGROUND', (0, 1), (-1, -1), colors.beige), + ('GRID', (0, 0), (-1, -1), 1, colors.black) +])) +elements.append(table) + +doc.build(elements) +``` + +## Complex Workflows + +### Extract Figures/Images from PDF + +#### Method 1: Using pdfimages (fastest) +```bash +# Extract all images with original quality +pdfimages -all document.pdf images/img +``` + +#### Method 2: Using pypdfium2 + Image Processing +```python +import pypdfium2 as pdfium +from PIL import Image +import numpy as np + +def extract_figures(pdf_path, output_dir): + pdf = pdfium.PdfDocument(pdf_path) + + for page_num, page in enumerate(pdf): + # Render high-resolution page + bitmap = page.render(scale=3.0) + img = bitmap.to_pil() + + # Convert to numpy for processing + img_array = np.array(img) + + # Simple figure detection (non-white regions) + mask = np.any(img_array != [255, 255, 255], axis=2) + + # Find contours and extract bounding boxes + # (This is simplified - real implementation would need more sophisticated detection) + + # Save detected figures + # ... implementation depends on specific needs +``` + +### Batch PDF Processing with Error Handling +```python +import os +import glob +from pypdf import PdfReader, PdfWriter +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def batch_process_pdfs(input_dir, operation='merge'): + pdf_files = glob.glob(os.path.join(input_dir, "*.pdf")) + + if operation == 'merge': + writer = PdfWriter() + for pdf_file in pdf_files: + try: + reader = PdfReader(pdf_file) + for page in reader.pages: + writer.add_page(page) + logger.info(f"Processed: {pdf_file}") + except Exception as e: + logger.error(f"Failed to process {pdf_file}: {e}") + continue + + with open("batch_merged.pdf", "wb") as output: + writer.write(output) + + elif operation == 'extract_text': + for pdf_file in pdf_files: + try: + reader = PdfReader(pdf_file) + text = "" + for page in reader.pages: + text += page.extract_text() + + output_file = pdf_file.replace('.pdf', '.txt') + with open(output_file, 'w', encoding='utf-8') as f: + f.write(text) + logger.info(f"Extracted text from: {pdf_file}") + + except Exception as e: + logger.error(f"Failed to extract text from {pdf_file}: {e}") + continue +``` + +### Advanced PDF Cropping +```python +from pypdf import PdfWriter, PdfReader + +reader = PdfReader("input.pdf") +writer = PdfWriter() + +# Crop page (left, bottom, right, top in points) +page = reader.pages[0] +page.mediabox.left = 50 +page.mediabox.bottom = 50 +page.mediabox.right = 550 +page.mediabox.top = 750 + +writer.add_page(page) +with open("cropped.pdf", "wb") as output: + writer.write(output) +``` + +## Performance Optimization Tips + +### 1. For Large PDFs +- Use streaming approaches instead of loading entire PDF in memory +- Use `qpdf --split-pages` for splitting large files +- Process pages individually with pypdfium2 + +### 2. For Text Extraction +- `pdftotext -bbox-layout` is fastest for plain text extraction +- Use pdfplumber for structured data and tables +- Avoid `pypdf.extract_text()` for very large documents + +### 3. For Image Extraction +- `pdfimages` is much faster than rendering pages +- Use low resolution for previews, high resolution for final output + +### 4. For Form Filling +- pdf-lib maintains form structure better than most alternatives +- Pre-validate form fields before processing + +### 5. Memory Management +```python +# Process PDFs in chunks +def process_large_pdf(pdf_path, chunk_size=10): + reader = PdfReader(pdf_path) + total_pages = len(reader.pages) + + for start_idx in range(0, total_pages, chunk_size): + end_idx = min(start_idx + chunk_size, total_pages) + writer = PdfWriter() + + for i in range(start_idx, end_idx): + writer.add_page(reader.pages[i]) + + # Process chunk + with open(f"chunk_{start_idx//chunk_size}.pdf", "wb") as output: + writer.write(output) +``` + +## Troubleshooting Common Issues + +### Encrypted PDFs +```python +# Handle password-protected PDFs +from pypdf import PdfReader + +try: + reader = PdfReader("encrypted.pdf") + if reader.is_encrypted: + reader.decrypt("password") +except Exception as e: + print(f"Failed to decrypt: {e}") +``` + +### Corrupted PDFs +```bash +# Use qpdf to repair +qpdf --check corrupted.pdf +qpdf --replace-input corrupted.pdf +``` + +### Text Extraction Issues +```python +# Fallback to OCR for scanned PDFs +import pytesseract +from pdf2image import convert_from_path + +def extract_text_with_ocr(pdf_path): + images = convert_from_path(pdf_path) + text = "" + for i, image in enumerate(images): + text += pytesseract.image_to_string(image) + return text +``` + +## License Information + +- **pypdf**: BSD License +- **pdfplumber**: MIT License +- **pypdfium2**: Apache/BSD License +- **reportlab**: BSD License +- **poppler-utils**: GPL-2 License +- **qpdf**: Apache License +- **pdf-lib**: MIT License +- **pdfjs-dist**: Apache License \ No newline at end of file diff --git a/skills/pdf/SKILL.md b/skills/pdf/SKILL.md new file mode 100644 index 0000000..607252c --- /dev/null +++ b/skills/pdf/SKILL.md @@ -0,0 +1,294 @@ +--- +name: pdf +description: Comprehensive PDF manipulation toolkit for extracting text and tables, creating new PDFs, merging/splitting documents, and handling forms. When Claude needs to fill in a PDF form or programmatically process, generate, or analyze PDF documents at scale. +license: Proprietary. LICENSE.txt has complete terms +--- + +# PDF Processing Guide + +## Overview + +This guide covers essential PDF processing operations using Python libraries and command-line tools. For advanced features, JavaScript libraries, and detailed examples, see REFERENCE.md. If you need to fill out a PDF form, read FORMS.md and follow its instructions. + +## Quick Start + +```python +from pypdf import PdfReader, PdfWriter + +# Read a PDF +reader = PdfReader("document.pdf") +print(f"Pages: {len(reader.pages)}") + +# Extract text +text = "" +for page in reader.pages: + text += page.extract_text() +``` + +## Python Libraries + +### pypdf - Basic Operations + +#### Merge PDFs +```python +from pypdf import PdfWriter, PdfReader + +writer = PdfWriter() +for pdf_file in ["doc1.pdf", "doc2.pdf", "doc3.pdf"]: + reader = PdfReader(pdf_file) + for page in reader.pages: + writer.add_page(page) + +with open("merged.pdf", "wb") as output: + writer.write(output) +``` + +#### Split PDF +```python +reader = PdfReader("input.pdf") +for i, page in enumerate(reader.pages): + writer = PdfWriter() + writer.add_page(page) + with open(f"page_{i+1}.pdf", "wb") as output: + writer.write(output) +``` + +#### Extract Metadata +```python +reader = PdfReader("document.pdf") +meta = reader.metadata +print(f"Title: {meta.title}") +print(f"Author: {meta.author}") +print(f"Subject: {meta.subject}") +print(f"Creator: {meta.creator}") +``` + +#### Rotate Pages +```python +reader = PdfReader("input.pdf") +writer = PdfWriter() + +page = reader.pages[0] +page.rotate(90) # Rotate 90 degrees clockwise +writer.add_page(page) + +with open("rotated.pdf", "wb") as output: + writer.write(output) +``` + +### pdfplumber - Text and Table Extraction + +#### Extract Text with Layout +```python +import pdfplumber + +with pdfplumber.open("document.pdf") as pdf: + for page in pdf.pages: + text = page.extract_text() + print(text) +``` + +#### Extract Tables +```python +with pdfplumber.open("document.pdf") as pdf: + for i, page in enumerate(pdf.pages): + tables = page.extract_tables() + for j, table in enumerate(tables): + print(f"Table {j+1} on page {i+1}:") + for row in table: + print(row) +``` + +#### Advanced Table Extraction +```python +import pandas as pd + +with pdfplumber.open("document.pdf") as pdf: + all_tables = [] + for page in pdf.pages: + tables = page.extract_tables() + for table in tables: + if table: # Check if table is not empty + df = pd.DataFrame(table[1:], columns=table[0]) + all_tables.append(df) + +# Combine all tables +if all_tables: + combined_df = pd.concat(all_tables, ignore_index=True) + combined_df.to_excel("extracted_tables.xlsx", index=False) +``` + +### reportlab - Create PDFs + +#### Basic PDF Creation +```python +from reportlab.lib.pagesizes import letter +from reportlab.pdfgen import canvas + +c = canvas.Canvas("hello.pdf", pagesize=letter) +width, height = letter + +# Add text +c.drawString(100, height - 100, "Hello World!") +c.drawString(100, height - 120, "This is a PDF created with reportlab") + +# Add a line +c.line(100, height - 140, 400, height - 140) + +# Save +c.save() +``` + +#### Create PDF with Multiple Pages +```python +from reportlab.lib.pagesizes import letter +from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak +from reportlab.lib.styles import getSampleStyleSheet + +doc = SimpleDocTemplate("report.pdf", pagesize=letter) +styles = getSampleStyleSheet() +story = [] + +# Add content +title = Paragraph("Report Title", styles['Title']) +story.append(title) +story.append(Spacer(1, 12)) + +body = Paragraph("This is the body of the report. " * 20, styles['Normal']) +story.append(body) +story.append(PageBreak()) + +# Page 2 +story.append(Paragraph("Page 2", styles['Heading1'])) +story.append(Paragraph("Content for page 2", styles['Normal'])) + +# Build PDF +doc.build(story) +``` + +## Command-Line Tools + +### pdftotext (poppler-utils) +```bash +# Extract text +pdftotext input.pdf output.txt + +# Extract text preserving layout +pdftotext -layout input.pdf output.txt + +# Extract specific pages +pdftotext -f 1 -l 5 input.pdf output.txt # Pages 1-5 +``` + +### qpdf +```bash +# Merge PDFs +qpdf --empty --pages file1.pdf file2.pdf -- merged.pdf + +# Split pages +qpdf input.pdf --pages . 1-5 -- pages1-5.pdf +qpdf input.pdf --pages . 6-10 -- pages6-10.pdf + +# Rotate pages +qpdf input.pdf output.pdf --rotate=+90:1 # Rotate page 1 by 90 degrees + +# Remove password +qpdf --password=mypassword --decrypt encrypted.pdf decrypted.pdf +``` + +### pdftk (if available) +```bash +# Merge +pdftk file1.pdf file2.pdf cat output merged.pdf + +# Split +pdftk input.pdf burst + +# Rotate +pdftk input.pdf rotate 1east output rotated.pdf +``` + +## Common Tasks + +### Extract Text from Scanned PDFs +```python +# Requires: pip install pytesseract pdf2image +import pytesseract +from pdf2image import convert_from_path + +# Convert PDF to images +images = convert_from_path('scanned.pdf') + +# OCR each page +text = "" +for i, image in enumerate(images): + text += f"Page {i+1}:\n" + text += pytesseract.image_to_string(image) + text += "\n\n" + +print(text) +``` + +### Add Watermark +```python +from pypdf import PdfReader, PdfWriter + +# Create watermark (or load existing) +watermark = PdfReader("watermark.pdf").pages[0] + +# Apply to all pages +reader = PdfReader("document.pdf") +writer = PdfWriter() + +for page in reader.pages: + page.merge_page(watermark) + writer.add_page(page) + +with open("watermarked.pdf", "wb") as output: + writer.write(output) +``` + +### Extract Images +```bash +# Using pdfimages (poppler-utils) +pdfimages -j input.pdf output_prefix + +# This extracts all images as output_prefix-000.jpg, output_prefix-001.jpg, etc. +``` + +### Password Protection +```python +from pypdf import PdfReader, PdfWriter + +reader = PdfReader("input.pdf") +writer = PdfWriter() + +for page in reader.pages: + writer.add_page(page) + +# Add password +writer.encrypt("userpassword", "ownerpassword") + +with open("encrypted.pdf", "wb") as output: + writer.write(output) +``` + +## Quick Reference + +| Task | Best Tool | Command/Code | +|------|-----------|--------------| +| Merge PDFs | pypdf | `writer.add_page(page)` | +| Split PDFs | pypdf | One page per file | +| Extract text | pdfplumber | `page.extract_text()` | +| Extract tables | pdfplumber | `page.extract_tables()` | +| Create PDFs | reportlab | Canvas or Platypus | +| Command line merge | qpdf | `qpdf --empty --pages ...` | +| OCR scanned PDFs | pytesseract | Convert to image first | +| Fill PDF forms | pdf-lib or pypdf (see FORMS.md) | See FORMS.md | + +## Next Steps + +- For advanced pypdfium2 usage, see REFERENCE.md +- For JavaScript libraries (pdf-lib), see REFERENCE.md +- If you need to fill out a PDF form, follow the instructions in FORMS.md +- For troubleshooting guides, see REFERENCE.md diff --git a/skills/pdf/scripts/check_bounding_boxes.py b/skills/pdf/scripts/check_bounding_boxes.py new file mode 100644 index 0000000..2a1ee2c --- /dev/null +++ b/skills/pdf/scripts/check_bounding_boxes.py @@ -0,0 +1,70 @@ +from dataclasses import dataclass +import json +import sys + + +# Script to check that the `fields.json` file that Claude creates when analyzing PDFs +# does not have overlapping bounding boxes. See FORMS.md. + + +@dataclass +class RectAndField: + rect: list[float] + rect_type: str + field: dict + + +# Returns a list of messages that are printed to stdout for Claude to read. +def get_bounding_box_messages(fields_json_stream) -> list[str]: + messages = [] + fields = json.load(fields_json_stream) + messages.append(f"Read {len(fields['form_fields'])} fields") + + def rects_intersect(r1, r2): + disjoint_horizontal = r1[0] >= r2[2] or r1[2] <= r2[0] + disjoint_vertical = r1[1] >= r2[3] or r1[3] <= r2[1] + return not (disjoint_horizontal or disjoint_vertical) + + rects_and_fields = [] + for f in fields["form_fields"]: + rects_and_fields.append(RectAndField(f["label_bounding_box"], "label", f)) + rects_and_fields.append(RectAndField(f["entry_bounding_box"], "entry", f)) + + has_error = False + for i, ri in enumerate(rects_and_fields): + # This is O(N^2); we can optimize if it becomes a problem. + for j in range(i + 1, len(rects_and_fields)): + rj = rects_and_fields[j] + if ri.field["page_number"] == rj.field["page_number"] and rects_intersect(ri.rect, rj.rect): + has_error = True + if ri.field is rj.field: + messages.append(f"FAILURE: intersection between label and entry bounding boxes for `{ri.field['description']}` ({ri.rect}, {rj.rect})") + else: + messages.append(f"FAILURE: intersection between {ri.rect_type} bounding box for `{ri.field['description']}` ({ri.rect}) and {rj.rect_type} bounding box for `{rj.field['description']}` ({rj.rect})") + if len(messages) >= 20: + messages.append("Aborting further checks; fix bounding boxes and try again") + return messages + if ri.rect_type == "entry": + if "entry_text" in ri.field: + font_size = ri.field["entry_text"].get("font_size", 14) + entry_height = ri.rect[3] - ri.rect[1] + if entry_height < font_size: + has_error = True + messages.append(f"FAILURE: entry bounding box height ({entry_height}) for `{ri.field['description']}` is too short for the text content (font size: {font_size}). Increase the box height or decrease the font size.") + if len(messages) >= 20: + messages.append("Aborting further checks; fix bounding boxes and try again") + return messages + + if not has_error: + messages.append("SUCCESS: All bounding boxes are valid") + return messages + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: check_bounding_boxes.py [fields.json]") + sys.exit(1) + # Input file should be in the `fields.json` format described in FORMS.md. + with open(sys.argv[1]) as f: + messages = get_bounding_box_messages(f) + for msg in messages: + print(msg) diff --git a/skills/pdf/scripts/check_bounding_boxes_test.py b/skills/pdf/scripts/check_bounding_boxes_test.py new file mode 100644 index 0000000..1dbb463 --- /dev/null +++ b/skills/pdf/scripts/check_bounding_boxes_test.py @@ -0,0 +1,226 @@ +import unittest +import json +import io +from check_bounding_boxes import get_bounding_box_messages + + +# Currently this is not run automatically in CI; it's just for documentation and manual checking. +class TestGetBoundingBoxMessages(unittest.TestCase): + + def create_json_stream(self, data): + """Helper to create a JSON stream from data""" + return io.StringIO(json.dumps(data)) + + def test_no_intersections(self): + """Test case with no bounding box intersections""" + data = { + "form_fields": [ + { + "description": "Name", + "page_number": 1, + "label_bounding_box": [10, 10, 50, 30], + "entry_bounding_box": [60, 10, 150, 30] + }, + { + "description": "Email", + "page_number": 1, + "label_bounding_box": [10, 40, 50, 60], + "entry_bounding_box": [60, 40, 150, 60] + } + ] + } + + stream = self.create_json_stream(data) + messages = get_bounding_box_messages(stream) + self.assertTrue(any("SUCCESS" in msg for msg in messages)) + self.assertFalse(any("FAILURE" in msg for msg in messages)) + + def test_label_entry_intersection_same_field(self): + """Test intersection between label and entry of the same field""" + data = { + "form_fields": [ + { + "description": "Name", + "page_number": 1, + "label_bounding_box": [10, 10, 60, 30], + "entry_bounding_box": [50, 10, 150, 30] # Overlaps with label + } + ] + } + + stream = self.create_json_stream(data) + messages = get_bounding_box_messages(stream) + self.assertTrue(any("FAILURE" in msg and "intersection" in msg for msg in messages)) + self.assertFalse(any("SUCCESS" in msg for msg in messages)) + + def test_intersection_between_different_fields(self): + """Test intersection between bounding boxes of different fields""" + data = { + "form_fields": [ + { + "description": "Name", + "page_number": 1, + "label_bounding_box": [10, 10, 50, 30], + "entry_bounding_box": [60, 10, 150, 30] + }, + { + "description": "Email", + "page_number": 1, + "label_bounding_box": [40, 20, 80, 40], # Overlaps with Name's boxes + "entry_bounding_box": [160, 10, 250, 30] + } + ] + } + + stream = self.create_json_stream(data) + messages = get_bounding_box_messages(stream) + self.assertTrue(any("FAILURE" in msg and "intersection" in msg for msg in messages)) + self.assertFalse(any("SUCCESS" in msg for msg in messages)) + + def test_different_pages_no_intersection(self): + """Test that boxes on different pages don't count as intersecting""" + data = { + "form_fields": [ + { + "description": "Name", + "page_number": 1, + "label_bounding_box": [10, 10, 50, 30], + "entry_bounding_box": [60, 10, 150, 30] + }, + { + "description": "Email", + "page_number": 2, + "label_bounding_box": [10, 10, 50, 30], # Same coordinates but different page + "entry_bounding_box": [60, 10, 150, 30] + } + ] + } + + stream = self.create_json_stream(data) + messages = get_bounding_box_messages(stream) + self.assertTrue(any("SUCCESS" in msg for msg in messages)) + self.assertFalse(any("FAILURE" in msg for msg in messages)) + + def test_entry_height_too_small(self): + """Test that entry box height is checked against font size""" + data = { + "form_fields": [ + { + "description": "Name", + "page_number": 1, + "label_bounding_box": [10, 10, 50, 30], + "entry_bounding_box": [60, 10, 150, 20], # Height is 10 + "entry_text": { + "font_size": 14 # Font size larger than height + } + } + ] + } + + stream = self.create_json_stream(data) + messages = get_bounding_box_messages(stream) + self.assertTrue(any("FAILURE" in msg and "height" in msg for msg in messages)) + self.assertFalse(any("SUCCESS" in msg for msg in messages)) + + def test_entry_height_adequate(self): + """Test that adequate entry box height passes""" + data = { + "form_fields": [ + { + "description": "Name", + "page_number": 1, + "label_bounding_box": [10, 10, 50, 30], + "entry_bounding_box": [60, 10, 150, 30], # Height is 20 + "entry_text": { + "font_size": 14 # Font size smaller than height + } + } + ] + } + + stream = self.create_json_stream(data) + messages = get_bounding_box_messages(stream) + self.assertTrue(any("SUCCESS" in msg for msg in messages)) + self.assertFalse(any("FAILURE" in msg for msg in messages)) + + def test_default_font_size(self): + """Test that default font size is used when not specified""" + data = { + "form_fields": [ + { + "description": "Name", + "page_number": 1, + "label_bounding_box": [10, 10, 50, 30], + "entry_bounding_box": [60, 10, 150, 20], # Height is 10 + "entry_text": {} # No font_size specified, should use default 14 + } + ] + } + + stream = self.create_json_stream(data) + messages = get_bounding_box_messages(stream) + self.assertTrue(any("FAILURE" in msg and "height" in msg for msg in messages)) + self.assertFalse(any("SUCCESS" in msg for msg in messages)) + + def test_no_entry_text(self): + """Test that missing entry_text doesn't cause height check""" + data = { + "form_fields": [ + { + "description": "Name", + "page_number": 1, + "label_bounding_box": [10, 10, 50, 30], + "entry_bounding_box": [60, 10, 150, 20] # Small height but no entry_text + } + ] + } + + stream = self.create_json_stream(data) + messages = get_bounding_box_messages(stream) + self.assertTrue(any("SUCCESS" in msg for msg in messages)) + self.assertFalse(any("FAILURE" in msg for msg in messages)) + + def test_multiple_errors_limit(self): + """Test that error messages are limited to prevent excessive output""" + fields = [] + # Create many overlapping fields + for i in range(25): + fields.append({ + "description": f"Field{i}", + "page_number": 1, + "label_bounding_box": [10, 10, 50, 30], # All overlap + "entry_bounding_box": [20, 15, 60, 35] # All overlap + }) + + data = {"form_fields": fields} + + stream = self.create_json_stream(data) + messages = get_bounding_box_messages(stream) + # Should abort after ~20 messages + self.assertTrue(any("Aborting" in msg for msg in messages)) + # Should have some FAILURE messages but not hundreds + failure_count = sum(1 for msg in messages if "FAILURE" in msg) + self.assertGreater(failure_count, 0) + self.assertLess(len(messages), 30) # Should be limited + + def test_edge_touching_boxes(self): + """Test that boxes touching at edges don't count as intersecting""" + data = { + "form_fields": [ + { + "description": "Name", + "page_number": 1, + "label_bounding_box": [10, 10, 50, 30], + "entry_bounding_box": [50, 10, 150, 30] # Touches at x=50 + } + ] + } + + stream = self.create_json_stream(data) + messages = get_bounding_box_messages(stream) + self.assertTrue(any("SUCCESS" in msg for msg in messages)) + self.assertFalse(any("FAILURE" in msg for msg in messages)) + + +if __name__ == '__main__': + unittest.main() diff --git a/skills/pdf/scripts/check_fillable_fields.py b/skills/pdf/scripts/check_fillable_fields.py new file mode 100644 index 0000000..95e434a --- /dev/null +++ b/skills/pdf/scripts/check_fillable_fields.py @@ -0,0 +1,12 @@ +import sys +from pypdf import PdfReader + + +# Script for Claude to run to determine whether a PDF has fillable form fields. See FORMS.md. + + +reader = PdfReader(sys.argv[1]) +if (reader.get_fields()): + print("This PDF has fillable form fields") +else: + print("This PDF does not have fillable form fields; you will need to visually determine where to enter data") diff --git a/skills/pdf/scripts/convert_pdf_to_images.py b/skills/pdf/scripts/convert_pdf_to_images.py new file mode 100644 index 0000000..f8a4ec5 --- /dev/null +++ b/skills/pdf/scripts/convert_pdf_to_images.py @@ -0,0 +1,35 @@ +import os +import sys + +from pdf2image import convert_from_path + + +# Converts each page of a PDF to a PNG image. + + +def convert(pdf_path, output_dir, max_dim=1000): + images = convert_from_path(pdf_path, dpi=200) + + for i, image in enumerate(images): + # Scale image if needed to keep width/height under `max_dim` + width, height = image.size + if width > max_dim or height > max_dim: + scale_factor = min(max_dim / width, max_dim / height) + new_width = int(width * scale_factor) + new_height = int(height * scale_factor) + image = image.resize((new_width, new_height)) + + image_path = os.path.join(output_dir, f"page_{i+1}.png") + image.save(image_path) + print(f"Saved page {i+1} as {image_path} (size: {image.size})") + + print(f"Converted {len(images)} pages to PNG images") + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: convert_pdf_to_images.py [input pdf] [output directory]") + sys.exit(1) + pdf_path = sys.argv[1] + output_directory = sys.argv[2] + convert(pdf_path, output_directory) diff --git a/skills/pdf/scripts/create_validation_image.py b/skills/pdf/scripts/create_validation_image.py new file mode 100644 index 0000000..154d71e --- /dev/null +++ b/skills/pdf/scripts/create_validation_image.py @@ -0,0 +1,41 @@ +import json +import sys + +from PIL import Image, ImageDraw + + +# Creates "validation" images with rectangles for the bounding box information that +# Claude creates when determining where to add text annotations in PDFs. See FORMS.md. + + +def create_validation_image(page_number, fields_json_path, input_path, output_path): + # Input file should be in the `fields.json` format described in FORMS.md. + with open(fields_json_path, 'r') as f: + data = json.load(f) + + img = Image.open(input_path) + draw = ImageDraw.Draw(img) + num_boxes = 0 + + for field in data["form_fields"]: + if field["page_number"] == page_number: + entry_box = field['entry_bounding_box'] + label_box = field['label_bounding_box'] + # Draw red rectangle over entry bounding box and blue rectangle over the label. + draw.rectangle(entry_box, outline='red', width=2) + draw.rectangle(label_box, outline='blue', width=2) + num_boxes += 2 + + img.save(output_path) + print(f"Created validation image at {output_path} with {num_boxes} bounding boxes") + + +if __name__ == "__main__": + if len(sys.argv) != 5: + print("Usage: create_validation_image.py [page number] [fields.json file] [input image path] [output image path]") + sys.exit(1) + page_number = int(sys.argv[1]) + fields_json_path = sys.argv[2] + input_image_path = sys.argv[3] + output_image_path = sys.argv[4] + create_validation_image(page_number, fields_json_path, input_image_path, output_image_path) diff --git a/skills/pdf/scripts/extract_form_field_info.py b/skills/pdf/scripts/extract_form_field_info.py new file mode 100644 index 0000000..58f5ae7 --- /dev/null +++ b/skills/pdf/scripts/extract_form_field_info.py @@ -0,0 +1,152 @@ +import json +import sys + +from pypdf import PdfReader + + +# Extracts data for the fillable form fields in a PDF and outputs JSON that +# Claude uses to fill the fields. See FORMS.md. + + +# This matches the format used by PdfReader `get_fields` and `update_page_form_field_values` methods. +def get_full_annotation_field_id(annotation): + components = [] + while annotation: + field_name = annotation.get('/T') + if field_name: + components.append(field_name) + annotation = annotation.get('/Parent') + return ".".join(reversed(components)) if components else None + + +def make_field_dict(field, field_id): + field_dict = {"field_id": field_id} + ft = field.get('/FT') + if ft == "/Tx": + field_dict["type"] = "text" + elif ft == "/Btn": + field_dict["type"] = "checkbox" # radio groups handled separately + states = field.get("/_States_", []) + if len(states) == 2: + # "/Off" seems to always be the unchecked value, as suggested by + # https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf#page=448 + # It can be either first or second in the "/_States_" list. + if "/Off" in states: + field_dict["checked_value"] = states[0] if states[0] != "/Off" else states[1] + field_dict["unchecked_value"] = "/Off" + else: + print(f"Unexpected state values for checkbox `${field_id}`. Its checked and unchecked values may not be correct; if you're trying to check it, visually verify the results.") + field_dict["checked_value"] = states[0] + field_dict["unchecked_value"] = states[1] + elif ft == "/Ch": + field_dict["type"] = "choice" + states = field.get("/_States_", []) + field_dict["choice_options"] = [{ + "value": state[0], + "text": state[1], + } for state in states] + else: + field_dict["type"] = f"unknown ({ft})" + return field_dict + + +# Returns a list of fillable PDF fields: +# [ +# { +# "field_id": "name", +# "page": 1, +# "type": ("text", "checkbox", "radio_group", or "choice") +# // Per-type additional fields described in FORMS.md +# }, +# ] +def get_field_info(reader: PdfReader): + fields = reader.get_fields() + + field_info_by_id = {} + possible_radio_names = set() + + for field_id, field in fields.items(): + # Skip if this is a container field with children, except that it might be + # a parent group for radio button options. + if field.get("/Kids"): + if field.get("/FT") == "/Btn": + possible_radio_names.add(field_id) + continue + field_info_by_id[field_id] = make_field_dict(field, field_id) + + # Bounding rects are stored in annotations in page objects. + + # Radio button options have a separate annotation for each choice; + # all choices have the same field name. + # See https://westhealth.github.io/exploring-fillable-forms-with-pdfrw.html + radio_fields_by_id = {} + + for page_index, page in enumerate(reader.pages): + annotations = page.get('/Annots', []) + for ann in annotations: + field_id = get_full_annotation_field_id(ann) + if field_id in field_info_by_id: + field_info_by_id[field_id]["page"] = page_index + 1 + field_info_by_id[field_id]["rect"] = ann.get('/Rect') + elif field_id in possible_radio_names: + try: + # ann['/AP']['/N'] should have two items. One of them is '/Off', + # the other is the active value. + on_values = [v for v in ann["/AP"]["/N"] if v != "/Off"] + except KeyError: + continue + if len(on_values) == 1: + rect = ann.get("/Rect") + if field_id not in radio_fields_by_id: + radio_fields_by_id[field_id] = { + "field_id": field_id, + "type": "radio_group", + "page": page_index + 1, + "radio_options": [], + } + # Note: at least on macOS 15.7, Preview.app doesn't show selected + # radio buttons correctly. (It does if you remove the leading slash + # from the value, but that causes them not to appear correctly in + # Chrome/Firefox/Acrobat/etc). + radio_fields_by_id[field_id]["radio_options"].append({ + "value": on_values[0], + "rect": rect, + }) + + # Some PDFs have form field definitions without corresponding annotations, + # so we can't tell where they are. Ignore these fields for now. + fields_with_location = [] + for field_info in field_info_by_id.values(): + if "page" in field_info: + fields_with_location.append(field_info) + else: + print(f"Unable to determine location for field id: {field_info.get('field_id')}, ignoring") + + # Sort by page number, then Y position (flipped in PDF coordinate system), then X. + def sort_key(f): + if "radio_options" in f: + rect = f["radio_options"][0]["rect"] or [0, 0, 0, 0] + else: + rect = f.get("rect") or [0, 0, 0, 0] + adjusted_position = [-rect[1], rect[0]] + return [f.get("page"), adjusted_position] + + sorted_fields = fields_with_location + list(radio_fields_by_id.values()) + sorted_fields.sort(key=sort_key) + + return sorted_fields + + +def write_field_info(pdf_path: str, json_output_path: str): + reader = PdfReader(pdf_path) + field_info = get_field_info(reader) + with open(json_output_path, "w") as f: + json.dump(field_info, f, indent=2) + print(f"Wrote {len(field_info)} fields to {json_output_path}") + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: extract_form_field_info.py [input pdf] [output json]") + sys.exit(1) + write_field_info(sys.argv[1], sys.argv[2]) diff --git a/skills/pdf/scripts/fill_fillable_fields.py b/skills/pdf/scripts/fill_fillable_fields.py new file mode 100644 index 0000000..2158d36 --- /dev/null +++ b/skills/pdf/scripts/fill_fillable_fields.py @@ -0,0 +1,114 @@ +import json +import sys + +from pypdf import PdfReader, PdfWriter + +from extract_form_field_info import get_field_info + + +# Fills fillable form fields in a PDF. See FORMS.md. + + +def fill_pdf_fields(input_pdf_path: str, fields_json_path: str, output_pdf_path: str): + with open(fields_json_path) as f: + fields = json.load(f) + # Group by page number. + fields_by_page = {} + for field in fields: + if "value" in field: + field_id = field["field_id"] + page = field["page"] + if page not in fields_by_page: + fields_by_page[page] = {} + fields_by_page[page][field_id] = field["value"] + + reader = PdfReader(input_pdf_path) + + has_error = False + field_info = get_field_info(reader) + fields_by_ids = {f["field_id"]: f for f in field_info} + for field in fields: + existing_field = fields_by_ids.get(field["field_id"]) + if not existing_field: + has_error = True + print(f"ERROR: `{field['field_id']}` is not a valid field ID") + elif field["page"] != existing_field["page"]: + has_error = True + print(f"ERROR: Incorrect page number for `{field['field_id']}` (got {field['page']}, expected {existing_field['page']})") + else: + if "value" in field: + err = validation_error_for_field_value(existing_field, field["value"]) + if err: + print(err) + has_error = True + if has_error: + sys.exit(1) + + writer = PdfWriter(clone_from=reader) + for page, field_values in fields_by_page.items(): + writer.update_page_form_field_values(writer.pages[page - 1], field_values, auto_regenerate=False) + + # This seems to be necessary for many PDF viewers to format the form values correctly. + # It may cause the viewer to show a "save changes" dialog even if the user doesn't make any changes. + writer.set_need_appearances_writer(True) + + with open(output_pdf_path, "wb") as f: + writer.write(f) + + +def validation_error_for_field_value(field_info, field_value): + field_type = field_info["type"] + field_id = field_info["field_id"] + if field_type == "checkbox": + checked_val = field_info["checked_value"] + unchecked_val = field_info["unchecked_value"] + if field_value != checked_val and field_value != unchecked_val: + return f'ERROR: Invalid value "{field_value}" for checkbox field "{field_id}". The checked value is "{checked_val}" and the unchecked value is "{unchecked_val}"' + elif field_type == "radio_group": + option_values = [opt["value"] for opt in field_info["radio_options"]] + if field_value not in option_values: + return f'ERROR: Invalid value "{field_value}" for radio group field "{field_id}". Valid values are: {option_values}' + elif field_type == "choice": + choice_values = [opt["value"] for opt in field_info["choice_options"]] + if field_value not in choice_values: + return f'ERROR: Invalid value "{field_value}" for choice field "{field_id}". Valid values are: {choice_values}' + return None + + +# pypdf (at least version 5.7.0) has a bug when setting the value for a selection list field. +# In _writer.py around line 966: +# +# if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0: +# txt = "\n".join(annotation.get_inherited(FA.Opt, [])) +# +# The problem is that for selection lists, `get_inherited` returns a list of two-element lists like +# [["value1", "Text 1"], ["value2", "Text 2"], ...] +# This causes `join` to throw a TypeError because it expects an iterable of strings. +# The horrible workaround is to patch `get_inherited` to return a list of the value strings. +# We call the original method and adjust the return value only if the argument to `get_inherited` +# is `FA.Opt` and if the return value is a list of two-element lists. +def monkeypatch_pydpf_method(): + from pypdf.generic import DictionaryObject + from pypdf.constants import FieldDictionaryAttributes + + original_get_inherited = DictionaryObject.get_inherited + + def patched_get_inherited(self, key: str, default = None): + result = original_get_inherited(self, key, default) + if key == FieldDictionaryAttributes.Opt: + if isinstance(result, list) and all(isinstance(v, list) and len(v) == 2 for v in result): + result = [r[0] for r in result] + return result + + DictionaryObject.get_inherited = patched_get_inherited + + +if __name__ == "__main__": + if len(sys.argv) != 4: + print("Usage: fill_fillable_fields.py [input pdf] [field_values.json] [output pdf]") + sys.exit(1) + monkeypatch_pydpf_method() + input_pdf = sys.argv[1] + fields_json = sys.argv[2] + output_pdf = sys.argv[3] + fill_pdf_fields(input_pdf, fields_json, output_pdf) diff --git a/skills/pdf/scripts/fill_pdf_form_with_annotations.py b/skills/pdf/scripts/fill_pdf_form_with_annotations.py new file mode 100644 index 0000000..59b6194 --- /dev/null +++ b/skills/pdf/scripts/fill_pdf_form_with_annotations.py @@ -0,0 +1,108 @@ +import json +import sys + +from pypdf import PdfReader, PdfWriter +from pypdf.annotations import FreeText + + +# Fills a PDF by adding text annotations defined in `fields.json`. See FORMS.md. + + +def transform_coordinates(bbox, image_width, image_height, pdf_width, pdf_height): + """Transform bounding box from image coordinates to PDF coordinates""" + # Image coordinates: origin at top-left, y increases downward + # PDF coordinates: origin at bottom-left, y increases upward + x_scale = pdf_width / image_width + y_scale = pdf_height / image_height + + left = bbox[0] * x_scale + right = bbox[2] * x_scale + + # Flip Y coordinates for PDF + top = pdf_height - (bbox[1] * y_scale) + bottom = pdf_height - (bbox[3] * y_scale) + + return left, bottom, right, top + + +def fill_pdf_form(input_pdf_path, fields_json_path, output_pdf_path): + """Fill the PDF form with data from fields.json""" + + # `fields.json` format described in FORMS.md. + with open(fields_json_path, "r") as f: + fields_data = json.load(f) + + # Open the PDF + reader = PdfReader(input_pdf_path) + writer = PdfWriter() + + # Copy all pages to writer + writer.append(reader) + + # Get PDF dimensions for each page + pdf_dimensions = {} + for i, page in enumerate(reader.pages): + mediabox = page.mediabox + pdf_dimensions[i + 1] = [mediabox.width, mediabox.height] + + # Process each form field + annotations = [] + for field in fields_data["form_fields"]: + page_num = field["page_number"] + + # Get page dimensions and transform coordinates. + page_info = next(p for p in fields_data["pages"] if p["page_number"] == page_num) + image_width = page_info["image_width"] + image_height = page_info["image_height"] + pdf_width, pdf_height = pdf_dimensions[page_num] + + transformed_entry_box = transform_coordinates( + field["entry_bounding_box"], + image_width, image_height, + pdf_width, pdf_height + ) + + # Skip empty fields + if "entry_text" not in field or "text" not in field["entry_text"]: + continue + entry_text = field["entry_text"] + text = entry_text["text"] + if not text: + continue + + font_name = entry_text.get("font", "Arial") + font_size = str(entry_text.get("font_size", 14)) + "pt" + font_color = entry_text.get("font_color", "000000") + + # Font size/color seems to not work reliably across viewers: + # https://github.com/py-pdf/pypdf/issues/2084 + annotation = FreeText( + text=text, + rect=transformed_entry_box, + font=font_name, + font_size=font_size, + font_color=font_color, + border_color=None, + background_color=None, + ) + annotations.append(annotation) + # page_number is 0-based for pypdf + writer.add_annotation(page_number=page_num - 1, annotation=annotation) + + # Save the filled PDF + with open(output_pdf_path, "wb") as output: + writer.write(output) + + print(f"Successfully filled PDF form and saved to {output_pdf_path}") + print(f"Added {len(annotations)} text annotations") + + +if __name__ == "__main__": + if len(sys.argv) != 4: + print("Usage: fill_pdf_form_with_annotations.py [input pdf] [fields.json] [output pdf]") + sys.exit(1) + input_pdf = sys.argv[1] + fields_json = sys.argv[2] + output_pdf = sys.argv[3] + + fill_pdf_form(input_pdf, fields_json, output_pdf) \ No newline at end of file diff --git a/skills/pptx/LICENSE.txt b/skills/pptx/LICENSE.txt new file mode 100644 index 0000000..c55ab42 --- /dev/null +++ b/skills/pptx/LICENSE.txt @@ -0,0 +1,30 @@ +© 2025 Anthropic, PBC. All rights reserved. + +LICENSE: Use of these materials (including all code, prompts, assets, files, +and other components of this Skill) is governed by your agreement with +Anthropic regarding use of Anthropic's services. If no separate agreement +exists, use is governed by Anthropic's Consumer Terms of Service or +Commercial Terms of Service, as applicable: +https://www.anthropic.com/legal/consumer-terms +https://www.anthropic.com/legal/commercial-terms +Your applicable agreement is referred to as the "Agreement." "Services" are +as defined in the Agreement. + +ADDITIONAL RESTRICTIONS: Notwithstanding anything in the Agreement to the +contrary, users may not: + +- Extract these materials from the Services or retain copies of these + materials outside the Services +- Reproduce or copy these materials, except for temporary copies created + automatically during authorized use of the Services +- Create derivative works based on these materials +- Distribute, sublicense, or transfer these materials to any third party +- Make, offer to sell, sell, or import any inventions embodied in these + materials +- Reverse engineer, decompile, or disassemble these materials + +The receipt, viewing, or possession of these materials does not convey or +imply any license or right beyond those expressly granted above. + +Anthropic retains all right, title, and interest in these materials, +including all copyrights, patents, and other intellectual property rights. diff --git a/skills/pptx/SKILL.md b/skills/pptx/SKILL.md new file mode 100644 index 0000000..89ea4db --- /dev/null +++ b/skills/pptx/SKILL.md @@ -0,0 +1,460 @@ +--- +name: pptx +description: "Presentation creation, editing, and analysis. When Claude needs to work with presentations (.pptx files) for: (1) Creating new presentations, (2) Modifying or editing content, (3) Working with layouts, (4) Adding comments or speaker notes, or any other presentation tasks" +license: Proprietary. LICENSE.txt has complete terms +--- + +# PPTX creation, editing, and analysis + +## Overview + +Create, edit, or analyze the contents of .pptx files when requested. A .pptx file is essentially a ZIP archive containing XML files and other resources. Different tools and workflows are available for different tasks. + +## Reading and analyzing content + +### Text extraction + +To read just the text content of a presentation, convert the document to markdown: + +```bash +# Convert document to markdown +python -m markitdown path-to-file.pptx +``` + +### Raw XML access + +Use raw XML access for: comments, speaker notes, slide layouts, animations, design elements, and complex formatting. To access these features, unpack a presentation and read its raw XML contents. + +#### Unpacking a file + +`python ooxml/scripts/unpack.py ` + +**Note**: The unpack.py script is located at `skills/pptx/ooxml/scripts/unpack.py` relative to the project root. If the script doesn't exist at this path, use `find . -name "unpack.py"` to locate it. + +#### Key file structures + +- `ppt/presentation.xml` - Main presentation metadata and slide references +- `ppt/slides/slide{N}.xml` - Individual slide contents (slide1.xml, slide2.xml, etc.) +- `ppt/notesSlides/notesSlide{N}.xml` - Speaker notes for each slide +- `ppt/comments/modernComment_*.xml` - Comments for specific slides +- `ppt/slideLayouts/` - Layout templates for slides +- `ppt/slideMasters/` - Master slide templates +- `ppt/theme/` - Theme and styling information +- `ppt/media/` - Images and other media files + +#### Typography and color extraction + +**To emulate example designs**, analyze the presentation's typography and colors first using the methods below: + +1. **Read theme file**: Check `ppt/theme/theme1.xml` for colors (``) and fonts (``) +2. **Sample slide content**: Examine `ppt/slides/slide1.xml` for actual font usage (``) and colors +3. **Search for patterns**: Use grep to find color (``, ``) and font references across all XML files + +## Creating a new PowerPoint presentation **without a template** + +When creating a new PowerPoint presentation from scratch, use the **html2pptx** workflow to convert HTML slides to PowerPoint with accurate positioning. + +### Workflow + +1. **MANDATORY - READ ENTIRE FILE NOW**: Read [`html2pptx.md`](html2pptx.md) completely from start to finish. **NEVER set any range limits when reading this file.** Read the full file content for detailed syntax, critical formatting rules, and best practices before proceeding with presentation creation. +2. **PREREQUISITE - Install html2pptx library**: + - Check and install if needed: `npm list -g @ant/html2pptx || npm install -g skills/pptx/html2pptx.tgz` + - **Note**: If you see "Cannot find module '@ant/html2pptx'" error later, the package isn't installed +3. **CRITICAL**: Plan the presentation + - Plan the shared aspects of the presentation. Describe the tone of the presentation's content and the colors and typography that should be used in the presentation. + - Write a DETAILED outline of the presentation + - For each slide, describe the slide's layout and contents + - For each slide, write presenter notes (1 to 3 sentences per slide) +4. **CRITICAL**: Set CSS variables + - In a shared `.css` file, override CSS variables to use on each slide for colors, typography, and spacing. DO NOT create classes in this file. +5. Create an HTML file for each slide with proper dimensions (e.g., 960px × 540px for 16:9) + - Recall the outline, layout/content description, and speaker notes you wrote for this slide in Step 3. Think out loud how to best apply them to this slide. + - Embed the contents of the shared `.css` file in a ` + + +
+

Slide title

+

Subtitle or context

+
+
+ + +``` + +## Using the @ant/html2pptx Library + +### Installation & Setup + +**Important**: Install the @ant/html2pptx package globally before using this library. See the **Prerequisites Check** section at the top of this document. + +**When running scripts, always set NODE_PATH:** + +```sh +NODE_PATH="$(npm root -g)" node your-script.js 2>&1 +``` + +**If you get "Cannot find module" errors**, see the Prerequisites Check section or verify that NODE_PATH is correctly pointing to the global node_modules directory. + +### Dependencies + +These libraries have been globally installed and are available to use: + +- `pptxgenjs` +- `playwright` + +### ⚠️ IMPORTANT: How To Use html2pptx + +Common errors: + +- DO NOT call `pptx.addSlide()` directly, `html2pptx` creates a slide for you +- `html2pptx` accepts an `htmlFilePath` and a `pptx` presentation object + - If you pass the wrong arguments, your script will throw errors or time out + +**Your script MUST follow the following example.** + +```javascript +const pptxgen = require("pptxgenjs"); +const { html2pptx } = require("@ant/html2pptx"); + +// Create a new pptx presentation +const pptx = new pptxgen(); +pptx.layout = "LAYOUT_16x9"; // Must match HTML body dimensions + +// Add an HTML-only slide +await html2pptx("slide1.html", pptx); + +// Add a slide with a chart placeholder +const { slide, placeholders } = await html2pptx("slide2.html", pptx); +slide.addChart(pptx.charts.LINE, chartData, placeholders[0]); + +// Save the presentation +await pptx.writeFile("output.pptx"); +``` + +### API Reference + +#### Function Signature + +```javascript +await html2pptx(htmlFilePath, pptxPresentation, options); +``` + +#### Parameters + +- `htmlFilePath` (string): Path to HTML file (absolute or relative) +- `pptxPresentation` (pptxgen): PptxGenJS presentation instance with layout already set +- `options` (object, optional): + - `tmpDir` (string): Temporary directory for generated files (default: `process.env.TMPDIR || '/tmp'`) + +#### Returns + +```javascript +{ + slide: pptxgenSlide, // The created/updated slide + placeholders: [ // Array of placeholder positions + { id: string, x: number, y: number, w: number, h: number }, + ... + ] +} +``` + +### Validation + +The library automatically validates and collects all errors before throwing: + +1. **HTML dimensions must match presentation layout** - Reports dimension mismatches +2. **Content must not overflow body** - Reports overflow with exact measurements +3. **Text element styling** - Reports backgrounds/borders/shadows on text elements (only allowed on block elements) + +**All validation errors are collected and reported together** in a single error message, allowing you to fix all issues at once instead of one at a time. + +### Working with Placeholders + +```javascript +const { slide, placeholders } = await html2pptx("slide.html", pptx); + +// Use first placeholder +slide.addChart(pptx.charts.BAR, data, placeholders[0]); + +// Find by ID +const chartArea = placeholders.find((p) => p.id === "chart-area"); +slide.addChart(pptx.charts.LINE, data, chartArea); +``` + +### Complete Example + +```javascript +const pptxgen = require("pptxgenjs"); +const { html2pptx } = require("@ant/html2pptx"); + +async function createPresentation() { + const pptx = new pptxgen(); + pptx.layout = "LAYOUT_16x9"; + pptx.author = "Your Name"; + pptx.title = "My Presentation"; + + // Slide 1: Title + const { slide: slide1 } = await html2pptx("slides/title.html", pptx); + + // Slide 2: Content with chart + const { slide: slide2, placeholders } = await html2pptx( + "slides/data.html", + pptx + ); + + const chartData = [ + { + name: "Sales", + labels: ["Q1", "Q2", "Q3", "Q4"], + values: [4500, 5500, 6200, 7100], + }, + ]; + + slide2.addChart(pptx.charts.BAR, chartData, { + ...placeholders[0], + showTitle: true, + title: "Quarterly Sales", + showCatAxisTitle: true, + catAxisTitle: "Quarter", + showValAxisTitle: true, + valAxisTitle: "Sales ($000s)", + }); + + // Save + await pptx.writeFile({ fileName: "presentation.pptx" }); + console.log("Presentation created successfully!"); +} + +createPresentation().catch(console.error); +``` + +**Run with:** + +```sh +NODE_PATH="$(npm root -g)" node create-presentation.js +``` + +## Using PptxGenJS + +After converting HTML to slides with `html2pptx`, you'll use PptxGenJS to add dynamic content like charts, images, and additional elements. + +### ⚠️ Critical Rules + +#### Colors + +- **NEVER use `#` prefix** with hex colors in PptxGenJS - causes file corruption +- ✅ Correct: `color: "FF0000"`, `fill: { color: "0066CC" }` +- ❌ Wrong: `color: "#FF0000"` (breaks document) + +### Adding Images + +Always calculate aspect ratios from actual image dimensions: + +```javascript +// Get image dimensions: identify image.png | grep -o '[0-9]* x [0-9]*' +const imgWidth = 1860, + imgHeight = 1519; // From actual file +const aspectRatio = imgWidth / imgHeight; + +const h = 3; // Max height +const w = h * aspectRatio; +const x = (10 - w) / 2; // Center on 16:9 slide + +slide.addImage({ path: "chart.png", x, y: 1.5, w, h }); +``` + +### Adding Text + +```javascript +// Rich text with formatting +slide.addText( + [ + { text: "Bold ", options: { bold: true } }, + { text: "Italic ", options: { italic: true } }, + { text: "Normal" }, + ], + { + x: 1, + y: 2, + w: 8, + h: 1, + } +); +``` + +### Adding Shapes + +```javascript +// Rectangle +slide.addShape(pptx.shapes.RECTANGLE, { + x: 1, + y: 1, + w: 3, + h: 2, + fill: { color: "4472C4" }, + line: { color: "000000", width: 2 }, +}); + +// Circle +slide.addShape(pptx.shapes.OVAL, { + x: 5, + y: 1, + w: 2, + h: 2, + fill: { color: "ED7D31" }, +}); + +// Rounded rectangle +slide.addShape(pptx.shapes.ROUNDED_RECTANGLE, { + x: 1, + y: 4, + w: 3, + h: 1.5, + fill: { color: "70AD47" }, + rectRadius: 0.2, +}); +``` + +### Adding Charts + +**Required for most charts:** Axis labels using `catAxisTitle` (category) and `valAxisTitle` (value). + +**Chart Data Format:** + +- Use **single series with all labels** for simple bar/line charts +- Each series creates a separate legend entry +- Labels array defines X-axis values + +**Time Series Data - Choose Correct Granularity:** + +- **< 30 days**: Use daily grouping (e.g., "10-01", "10-02") - avoid monthly aggregation that creates single-point charts +- **30-365 days**: Use monthly grouping (e.g., "2024-01", "2024-02") +- **> 365 days**: Use yearly grouping (e.g., "2023", "2024") +- **Validate**: Charts with only 1 data point likely indicate incorrect aggregation for the time period + +```javascript +const { slide, placeholders } = await html2pptx("slide.html", pptx); + +// CORRECT: Single series with all labels +slide.addChart( + pptx.charts.BAR, + [ + { + name: "Sales 2024", + labels: ["Q1", "Q2", "Q3", "Q4"], + values: [4500, 5500, 6200, 7100], + }, + ], + { + ...placeholders[0], // Use placeholder position + barDir: "col", // 'col' = vertical bars, 'bar' = horizontal + showTitle: true, + title: "Quarterly Sales", + showLegend: false, // No legend needed for single series + // Required axis labels + showCatAxisTitle: true, + catAxisTitle: "Quarter", + showValAxisTitle: true, + valAxisTitle: "Sales ($000s)", + // Optional: Control scaling (adjust min based on data range for better visualization) + valAxisMaxVal: 8000, + valAxisMinVal: 0, // Use 0 for counts/amounts; for clustered data (e.g., 4500-7100), consider starting closer to min value + valAxisMajorUnit: 2000, // Control y-axis label spacing to prevent crowding + catAxisLabelRotate: 45, // Rotate labels if crowded + dataLabelPosition: "outEnd", + dataLabelColor: "000000", + // Use single color for single-series charts + chartColors: ["4472C4"], // All bars same color + } +); +``` + +#### Scatter Chart + +**IMPORTANT**: Scatter chart data format is unusual - first series contains X-axis values, subsequent series contain Y-values: + +```javascript +// Prepare data +const data1 = [ + { x: 10, y: 20 }, + { x: 15, y: 25 }, + { x: 20, y: 30 }, +]; +const data2 = [ + { x: 12, y: 18 }, + { x: 18, y: 22 }, +]; + +const allXValues = [...data1.map((d) => d.x), ...data2.map((d) => d.x)]; + +slide.addChart( + pptx.charts.SCATTER, + [ + { name: "X-Axis", values: allXValues }, // First series = X values + { name: "Series 1", values: data1.map((d) => d.y) }, // Y values only + { name: "Series 2", values: data2.map((d) => d.y) }, // Y values only + ], + { + x: 1, + y: 1, + w: 8, + h: 4, + lineSize: 0, // 0 = no connecting lines + lineDataSymbol: "circle", + lineDataSymbolSize: 6, + showCatAxisTitle: true, + catAxisTitle: "X Axis", + showValAxisTitle: true, + valAxisTitle: "Y Axis", + chartColors: ["4472C4", "ED7D31"], + } +); +``` + +#### Line Chart + +```javascript +slide.addChart( + pptx.charts.LINE, + [ + { + name: "Temperature", + labels: ["Jan", "Feb", "Mar", "Apr"], + values: [32, 35, 42, 55], + }, + ], + { + x: 1, + y: 1, + w: 8, + h: 4, + lineSize: 4, + lineSmooth: true, + // Required axis labels + showCatAxisTitle: true, + catAxisTitle: "Month", + showValAxisTitle: true, + valAxisTitle: "Temperature (°F)", + // Optional: Y-axis range (set min based on data range for better visualization) + valAxisMinVal: 0, // For ranges starting at 0 (counts, percentages, etc.) + valAxisMaxVal: 60, + valAxisMajorUnit: 20, // Control y-axis label spacing to prevent crowding (e.g., 10, 20, 25) + // valAxisMinVal: 30, // PREFERRED: For data clustered in a range (e.g., 32-55 or ratings 3-5), start axis closer to min value to show variation + // Optional: Chart colors + chartColors: ["4472C4", "ED7D31", "A5A5A5"], + } +); +``` + +#### Pie Chart (No Axis Labels Required) + +**CRITICAL**: Pie charts require a **single data series** with all categories in the `labels` array and corresponding values in the `values` array. + +```javascript +slide.addChart( + pptx.charts.PIE, + [ + { + name: "Market Share", + labels: ["Product A", "Product B", "Other"], // All categories in one array + values: [35, 45, 20], // All values in one array + }, + ], + { + x: 2, + y: 1, + w: 6, + h: 4, + showPercent: true, + showLegend: true, + legendPos: "r", // right + chartColors: ["4472C4", "ED7D31", "A5A5A5"], + } +); +``` + +#### Multiple Data Series + +```javascript +slide.addChart( + pptx.charts.LINE, + [ + { + name: "Product A", + labels: ["Q1", "Q2", "Q3", "Q4"], + values: [10, 20, 30, 40], + }, + { + name: "Product B", + labels: ["Q1", "Q2", "Q3", "Q4"], + values: [15, 25, 20, 35], + }, + ], + { + x: 1, + y: 1, + w: 8, + h: 4, + showCatAxisTitle: true, + catAxisTitle: "Quarter", + showValAxisTitle: true, + valAxisTitle: "Revenue ($M)", + } +); +``` + +### Chart Colors + +**CRITICAL**: Use hex colors **without** the `#` prefix - including `#` causes file corruption. + +**Align chart colors with your chosen design palette**, ensuring sufficient contrast and distinctiveness for data visualization. Adjust colors for: + +- Strong contrast between adjacent series +- Readability against slide backgrounds +- Accessibility (avoid red-green only combinations) + +```javascript +// Example: Ocean palette-inspired chart colors (adjusted for contrast) +const chartColors = ["16A085", "FF6B9D", "2C3E50", "F39C12", "9B59B6"]; + +// Single-series chart: Use one color for all bars/points +slide.addChart( + pptx.charts.BAR, + [ + { + name: "Sales", + labels: ["Q1", "Q2", "Q3", "Q4"], + values: [4500, 5500, 6200, 7100], + }, + ], + { + ...placeholders[0], + chartColors: ["16A085"], // All bars same color + showLegend: false, + } +); + +// Multi-series chart: Each series gets a different color +slide.addChart( + pptx.charts.LINE, + [ + { name: "Product A", labels: ["Q1", "Q2", "Q3"], values: [10, 20, 30] }, + { name: "Product B", labels: ["Q1", "Q2", "Q3"], values: [15, 25, 20] }, + ], + { + ...placeholders[0], + chartColors: ["16A085", "FF6B9D"], // One color per series + } +); +``` + +### Adding Tables + +Tables can be added with basic or advanced formatting: + +#### Basic Table + +```javascript +slide.addTable( + [ + ["Header 1", "Header 2", "Header 3"], + ["Row 1, Col 1", "Row 1, Col 2", "Row 1, Col 3"], + ["Row 2, Col 1", "Row 2, Col 2", "Row 2, Col 3"], + ], + { + x: 0.5, + y: 1, + w: 9, + h: 3, + border: { pt: 1, color: "999999" }, + fill: { color: "F1F1F1" }, + } +); +``` + +#### Table with Custom Formatting + +```javascript +const tableData = [ + // Header row with custom styling + [ + { + text: "Product", + options: { fill: { color: "4472C4" }, color: "FFFFFF", bold: true }, + }, + { + text: "Revenue", + options: { fill: { color: "4472C4" }, color: "FFFFFF", bold: true }, + }, + { + text: "Growth", + options: { fill: { color: "4472C4" }, color: "FFFFFF", bold: true }, + }, + ], + // Data rows + ["Product A", "$50M", "+15%"], + ["Product B", "$35M", "+22%"], + ["Product C", "$28M", "+8%"], +]; + +slide.addTable(tableData, { + x: 1, + y: 1.5, + w: 8, + h: 3, + colW: [3, 2.5, 2.5], // Column widths + rowH: [0.5, 0.6, 0.6, 0.6], // Row heights + border: { pt: 1, color: "CCCCCC" }, + align: "center", + valign: "middle", + fontSize: 14, +}); +``` + +#### Table with Merged Cells + +```javascript +const mergedTableData = [ + [ + { + text: "Q1 Results", + options: { + colspan: 3, + fill: { color: "4472C4" }, + color: "FFFFFF", + bold: true, + }, + }, + ], + ["Product", "Sales", "Market Share"], + ["Product A", "$25M", "35%"], + ["Product B", "$18M", "25%"], +]; + +slide.addTable(mergedTableData, { + x: 1, + y: 1, + w: 8, + h: 2.5, + colW: [3, 2.5, 2.5], + border: { pt: 1, color: "DDDDDD" }, +}); +``` + +### Table Options + +Common table options: + +- `x, y, w, h` - Position and size +- `colW` - Array of column widths (in inches) +- `rowH` - Array of row heights (in inches) +- `border` - Border style: `{ pt: 1, color: "999999" }` +- `fill` - Background color (no # prefix) +- `align` - Text alignment: "left", "center", "right" +- `valign` - Vertical alignment: "top", "middle", "bottom" +- `fontSize` - Text size +- `autoPage` - Auto-create new slides if content overflows diff --git a/skills/pptx/html2pptx.tgz b/skills/pptx/html2pptx.tgz new file mode 100644 index 0000000..099e080 Binary files /dev/null and b/skills/pptx/html2pptx.tgz differ diff --git a/skills/pptx/ooxml.md b/skills/pptx/ooxml.md new file mode 100644 index 0000000..951b3cf --- /dev/null +++ b/skills/pptx/ooxml.md @@ -0,0 +1,427 @@ +# Office Open XML Technical Reference for PowerPoint + +**Important: Read this entire document before starting.** Critical XML schema rules and formatting requirements are covered throughout. Incorrect implementation can create invalid PPTX files that PowerPoint cannot open. + +## Technical Guidelines + +### Schema Compliance +- **Element ordering in ``**: ``, ``, `` +- **Whitespace**: Add `xml:space='preserve'` to `` elements with leading/trailing spaces +- **Unicode**: Escape characters in ASCII content: `"` becomes `“` +- **Images**: Add to `ppt/media/`, reference in slide XML, set dimensions to fit slide bounds +- **Relationships**: Update `ppt/slides/_rels/slideN.xml.rels` for each slide's resources +- **Dirty attribute**: Add `dirty="0"` to `` and `` elements to indicate clean state + +## Presentation Structure + +### Basic Slide Structure +```xml + + + + + ... + ... + + + + +``` + +### Text Box / Shape with Text +```xml + + + + + + + + + + + + + + + + + + + + + + Slide Title + + + + +``` + +### Text Formatting +```xml + + + + Bold Text + + + + + + Italic Text + + + + + + Underlined + + + + + + + + + + Highlighted Text + + + + + + + + + + Colored Arial 24pt + + + + + + + + + + Formatted text + +``` + +### Lists +```xml + + + + + + + First bullet point + + + + + + + + + + First numbered item + + + + + + + + + + Indented bullet + + +``` + +### Shapes +```xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +``` + +### Images +```xml + + + + + + + + + + + + + + + + + + + + + + + + + + +``` + +### Tables +```xml + + + + + + + + + + + + + + + + + + + + + + + + + + + Cell 1 + + + + + + + + + + + Cell 2 + + + + + + + + + +``` + +### Slide Layouts + +```xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +``` + +## File Updates + +When adding content, update these files: + +**`ppt/_rels/presentation.xml.rels`:** +```xml + + +``` + +**`ppt/slides/_rels/slide1.xml.rels`:** +```xml + + +``` + +**`[Content_Types].xml`:** +```xml + + + +``` + +**`ppt/presentation.xml`:** +```xml + + + + +``` + +**`docProps/app.xml`:** Update slide count and statistics +```xml +2 +10 +50 +``` + +## Slide Operations + +### Adding a New Slide +When adding a slide to the end of the presentation: + +1. **Create the slide file** (`ppt/slides/slideN.xml`) +2. **Update `[Content_Types].xml`**: Add Override for the new slide +3. **Update `ppt/_rels/presentation.xml.rels`**: Add relationship for the new slide +4. **Update `ppt/presentation.xml`**: Add slide ID to `` +5. **Create slide relationships** (`ppt/slides/_rels/slideN.xml.rels`) if needed +6. **Update `docProps/app.xml`**: Increment slide count and update statistics (if present) + +### Duplicating a Slide +1. Copy the source slide XML file with a new name +2. Update all IDs in the new slide to be unique +3. Follow the "Adding a New Slide" steps above +4. **CRITICAL**: Remove or update any notes slide references in `_rels` files +5. Remove references to unused media files + +### Reordering Slides +1. **Update `ppt/presentation.xml`**: Reorder `` elements in `` +2. The order of `` elements determines slide order +3. Keep slide IDs and relationship IDs unchanged + +Example: +```xml + + + + + + + + + + + + + +``` + +### Deleting a Slide +1. **Remove from `ppt/presentation.xml`**: Delete the `` entry +2. **Remove from `ppt/_rels/presentation.xml.rels`**: Delete the relationship +3. **Remove from `[Content_Types].xml`**: Delete the Override entry +4. **Delete files**: Remove `ppt/slides/slideN.xml` and `ppt/slides/_rels/slideN.xml.rels` +5. **Update `docProps/app.xml`**: Decrement slide count and update statistics +6. **Clean up unused media**: Remove orphaned images from `ppt/media/` + +Note: Don't renumber remaining slides - keep their original IDs and filenames. + + +## Common Errors to Avoid + +- **Encodings**: Escape unicode characters in ASCII content: `"` becomes `“` +- **Images**: Add to `ppt/media/` and update relationship files +- **Lists**: Omit bullets from list headers +- **IDs**: Use valid hexadecimal values for UUIDs +- **Themes**: Check all themes in `theme` directory for colors + +## Validation Checklist for Template-Based Presentations + +### Before Packing, Always: +- **Clean unused resources**: Remove unreferenced media, fonts, and notes directories +- **Fix Content_Types.xml**: Declare ALL slides, layouts, and themes present in the package +- **Fix relationship IDs**: + - Remove font embed references if not using embedded fonts +- **Remove broken references**: Check all `_rels` files for references to deleted resources + +### Common Template Duplication Pitfalls: +- Multiple slides referencing the same notes slide after duplication +- Image/media references from template slides that no longer exist +- Font embedding references when fonts aren't included +- Missing slideLayout declarations for layouts 12-25 +- docProps directory may not unpack - this is optional \ No newline at end of file diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd new file mode 100644 index 0000000..6454ef9 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd @@ -0,0 +1,1499 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd new file mode 100644 index 0000000..afa4f46 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd @@ -0,0 +1,146 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd new file mode 100644 index 0000000..64e66b8 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd @@ -0,0 +1,1085 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd new file mode 100644 index 0000000..687eea8 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd @@ -0,0 +1,11 @@ + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd new file mode 100644 index 0000000..6ac81b0 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd @@ -0,0 +1,3081 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd new file mode 100644 index 0000000..1dbf051 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd new file mode 100644 index 0000000..f1af17d --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd @@ -0,0 +1,185 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd new file mode 100644 index 0000000..0a185ab --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd @@ -0,0 +1,287 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd new file mode 100644 index 0000000..14ef488 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd @@ -0,0 +1,1676 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd new file mode 100644 index 0000000..c20f3bf --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd new file mode 100644 index 0000000..ac60252 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd @@ -0,0 +1,144 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd new file mode 100644 index 0000000..424b8ba --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd @@ -0,0 +1,174 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd new file mode 100644 index 0000000..2bddce2 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd new file mode 100644 index 0000000..8a8c18b --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd new file mode 100644 index 0000000..5c42706 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd @@ -0,0 +1,59 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd new file mode 100644 index 0000000..853c341 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd @@ -0,0 +1,56 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd new file mode 100644 index 0000000..da835ee --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd @@ -0,0 +1,195 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd new file mode 100644 index 0000000..87ad265 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd @@ -0,0 +1,582 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd new file mode 100644 index 0000000..9e86f1b --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd new file mode 100644 index 0000000..d0be42e --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd @@ -0,0 +1,4439 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd new file mode 100644 index 0000000..8821dd1 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd @@ -0,0 +1,570 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd new file mode 100644 index 0000000..ca2575c --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd @@ -0,0 +1,509 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd new file mode 100644 index 0000000..dd079e6 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd @@ -0,0 +1,12 @@ + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd new file mode 100644 index 0000000..3dd6cf6 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd @@ -0,0 +1,108 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd new file mode 100644 index 0000000..f1041e3 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd @@ -0,0 +1,96 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd new file mode 100644 index 0000000..9c5b7a6 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd @@ -0,0 +1,3646 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd new file mode 100644 index 0000000..0f13678 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd @@ -0,0 +1,116 @@ + + + + + + See http://www.w3.org/XML/1998/namespace.html and + http://www.w3.org/TR/REC-xml for information about this namespace. + + This schema document describes the XML namespace, in a form + suitable for import by other schema documents. + + Note that local names in this namespace are intended to be defined + only by the World Wide Web Consortium or its subgroups. The + following names are currently defined in this namespace and should + not be used with conflicting semantics by any Working Group, + specification, or document instance: + + base (as an attribute name): denotes an attribute whose value + provides a URI to be used as the base for interpreting any + relative URIs in the scope of the element on which it + appears; its value is inherited. This name is reserved + by virtue of its definition in the XML Base specification. + + lang (as an attribute name): denotes an attribute whose value + is a language code for the natural language of the content of + any element; its value is inherited. This name is reserved + by virtue of its definition in the XML specification. + + space (as an attribute name): denotes an attribute whose + value is a keyword indicating what whitespace processing + discipline is intended for the content of the element; its + value is inherited. This name is reserved by virtue of its + definition in the XML specification. + + Father (in any context at all): denotes Jon Bosak, the chair of + the original XML Working Group. This name is reserved by + the following decision of the W3C XML Plenary and + XML Coordination groups: + + In appreciation for his vision, leadership and dedication + the W3C XML Plenary on this 10th day of February, 2000 + reserves for Jon Bosak in perpetuity the XML name + xml:Father + + + + + This schema defines attributes and an attribute group + suitable for use by + schemas wishing to allow xml:base, xml:lang or xml:space attributes + on elements they define. + + To enable this, such a schema must import this schema + for the XML namespace, e.g. as follows: + <schema . . .> + . . . + <import namespace="http://www.w3.org/XML/1998/namespace" + schemaLocation="http://www.w3.org/2001/03/xml.xsd"/> + + Subsequently, qualified reference to any of the attributes + or the group defined below will have the desired effect, e.g. + + <type . . .> + . . . + <attributeGroup ref="xml:specialAttrs"/> + + will define a type which will schema-validate an instance + element with any of those attributes + + + + In keeping with the XML Schema WG's standard versioning + policy, this schema document will persist at + http://www.w3.org/2001/03/xml.xsd. + At the date of issue it can also be found at + http://www.w3.org/2001/xml.xsd. + The schema document at that URI may however change in the future, + in order to remain compatible with the latest version of XML Schema + itself. In other words, if the XML Schema namespace changes, the version + of this document at + http://www.w3.org/2001/xml.xsd will change + accordingly; the version at + http://www.w3.org/2001/03/xml.xsd will not change. + + + + + + In due course, we should install the relevant ISO 2- and 3-letter + codes as the enumerated possible values . . . + + + + + + + + + + + + + + + See http://www.w3.org/TR/xmlbase/ for + information about this attribute. + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd b/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd new file mode 100644 index 0000000..a6de9d2 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd @@ -0,0 +1,42 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd b/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd new file mode 100644 index 0000000..10e978b --- /dev/null +++ b/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd @@ -0,0 +1,50 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd b/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd new file mode 100644 index 0000000..4248bf7 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd b/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd new file mode 100644 index 0000000..5649746 --- /dev/null +++ b/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/mce/mc.xsd b/skills/pptx/ooxml/schemas/mce/mc.xsd new file mode 100644 index 0000000..ef72545 --- /dev/null +++ b/skills/pptx/ooxml/schemas/mce/mc.xsd @@ -0,0 +1,75 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/microsoft/wml-2010.xsd b/skills/pptx/ooxml/schemas/microsoft/wml-2010.xsd new file mode 100644 index 0000000..f65f777 --- /dev/null +++ b/skills/pptx/ooxml/schemas/microsoft/wml-2010.xsd @@ -0,0 +1,560 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/microsoft/wml-2012.xsd b/skills/pptx/ooxml/schemas/microsoft/wml-2012.xsd new file mode 100644 index 0000000..6b00755 --- /dev/null +++ b/skills/pptx/ooxml/schemas/microsoft/wml-2012.xsd @@ -0,0 +1,67 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/microsoft/wml-2018.xsd b/skills/pptx/ooxml/schemas/microsoft/wml-2018.xsd new file mode 100644 index 0000000..f321d33 --- /dev/null +++ b/skills/pptx/ooxml/schemas/microsoft/wml-2018.xsd @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/microsoft/wml-cex-2018.xsd b/skills/pptx/ooxml/schemas/microsoft/wml-cex-2018.xsd new file mode 100644 index 0000000..364c6a9 --- /dev/null +++ b/skills/pptx/ooxml/schemas/microsoft/wml-cex-2018.xsd @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/microsoft/wml-cid-2016.xsd b/skills/pptx/ooxml/schemas/microsoft/wml-cid-2016.xsd new file mode 100644 index 0000000..fed9d15 --- /dev/null +++ b/skills/pptx/ooxml/schemas/microsoft/wml-cid-2016.xsd @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/skills/pptx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd b/skills/pptx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd new file mode 100644 index 0000000..680cf15 --- /dev/null +++ b/skills/pptx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd @@ -0,0 +1,4 @@ + + + + diff --git a/skills/pptx/ooxml/schemas/microsoft/wml-symex-2015.xsd b/skills/pptx/ooxml/schemas/microsoft/wml-symex-2015.xsd new file mode 100644 index 0000000..89ada90 --- /dev/null +++ b/skills/pptx/ooxml/schemas/microsoft/wml-symex-2015.xsd @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/skills/pptx/ooxml/scripts/pack.py b/skills/pptx/ooxml/scripts/pack.py new file mode 100644 index 0000000..68bc088 --- /dev/null +++ b/skills/pptx/ooxml/scripts/pack.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +""" +Tool to pack a directory into a .docx, .pptx, or .xlsx file with XML formatting undone. + +Example usage: + python pack.py [--force] +""" + +import argparse +import shutil +import subprocess +import sys +import tempfile +import defusedxml.minidom +import zipfile +from pathlib import Path + + +def main(): + parser = argparse.ArgumentParser(description="Pack a directory into an Office file") + parser.add_argument("input_directory", help="Unpacked Office document directory") + parser.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)") + parser.add_argument("--force", action="store_true", help="Skip validation") + args = parser.parse_args() + + try: + success = pack_document( + args.input_directory, args.output_file, validate=not args.force + ) + + # Show warning if validation was skipped + if args.force: + print("Warning: Skipped validation, file may be corrupt", file=sys.stderr) + # Exit with error if validation failed + elif not success: + print("Contents would produce a corrupt file.", file=sys.stderr) + print("Please validate XML before repacking.", file=sys.stderr) + print("Use --force to skip validation and pack anyway.", file=sys.stderr) + sys.exit(1) + + except ValueError as e: + sys.exit(f"Error: {e}") + + +def pack_document(input_dir, output_file, validate=False): + """Pack a directory into an Office file (.docx/.pptx/.xlsx). + + Args: + input_dir: Path to unpacked Office document directory + output_file: Path to output Office file + validate: If True, validates with soffice (default: False) + + Returns: + bool: True if successful, False if validation failed + """ + input_dir = Path(input_dir) + output_file = Path(output_file) + + if not input_dir.is_dir(): + raise ValueError(f"{input_dir} is not a directory") + if output_file.suffix.lower() not in {".docx", ".pptx", ".xlsx"}: + raise ValueError(f"{output_file} must be a .docx, .pptx, or .xlsx file") + + # Work in temporary directory to avoid modifying original + with tempfile.TemporaryDirectory() as temp_dir: + temp_content_dir = Path(temp_dir) / "content" + shutil.copytree(input_dir, temp_content_dir) + + # Process XML files to remove pretty-printing whitespace + for pattern in ["*.xml", "*.rels"]: + for xml_file in temp_content_dir.rglob(pattern): + condense_xml(xml_file) + + # Create final Office file as zip archive + output_file.parent.mkdir(parents=True, exist_ok=True) + with zipfile.ZipFile(output_file, "w", zipfile.ZIP_DEFLATED) as zf: + for f in temp_content_dir.rglob("*"): + if f.is_file(): + zf.write(f, f.relative_to(temp_content_dir)) + + # Validate if requested + if validate: + if not validate_document(output_file): + output_file.unlink() # Delete the corrupt file + return False + + return True + + +def validate_document(doc_path): + """Validate document by converting to HTML with soffice.""" + # Determine the correct filter based on file extension + match doc_path.suffix.lower(): + case ".docx": + filter_name = "html:HTML" + case ".pptx": + filter_name = "html:impress_html_Export" + case ".xlsx": + filter_name = "html:HTML (StarCalc)" + + with tempfile.TemporaryDirectory() as temp_dir: + try: + result = subprocess.run( + [ + "soffice", + "--headless", + "--convert-to", + filter_name, + "--outdir", + temp_dir, + str(doc_path), + ], + capture_output=True, + timeout=10, + text=True, + ) + if not (Path(temp_dir) / f"{doc_path.stem}.html").exists(): + error_msg = result.stderr.strip() or "Document validation failed" + print(f"Validation error: {error_msg}", file=sys.stderr) + return False + return True + except FileNotFoundError: + print("Warning: soffice not found. Skipping validation.", file=sys.stderr) + return True + except subprocess.TimeoutExpired: + print("Validation error: Timeout during conversion", file=sys.stderr) + return False + except Exception as e: + print(f"Validation error: {e}", file=sys.stderr) + return False + + +def condense_xml(xml_file): + """Strip unnecessary whitespace and remove comments.""" + with open(xml_file, "r", encoding="utf-8") as f: + dom = defusedxml.minidom.parse(f) + + # Process each element to remove whitespace and comments + for element in dom.getElementsByTagName("*"): + # Skip w:t elements and their processing + if element.tagName.endswith(":t"): + continue + + # Remove whitespace-only text nodes and comment nodes + for child in list(element.childNodes): + if ( + child.nodeType == child.TEXT_NODE + and child.nodeValue + and child.nodeValue.strip() == "" + ) or child.nodeType == child.COMMENT_NODE: + element.removeChild(child) + + # Write back the condensed XML + with open(xml_file, "wb") as f: + f.write(dom.toxml(encoding="UTF-8")) + + +if __name__ == "__main__": + main() diff --git a/skills/pptx/ooxml/scripts/unpack.py b/skills/pptx/ooxml/scripts/unpack.py new file mode 100644 index 0000000..4938798 --- /dev/null +++ b/skills/pptx/ooxml/scripts/unpack.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +"""Unpack and format XML contents of Office files (.docx, .pptx, .xlsx)""" + +import random +import sys +import defusedxml.minidom +import zipfile +from pathlib import Path + +# Get command line arguments +assert len(sys.argv) == 3, "Usage: python unpack.py " +input_file, output_dir = sys.argv[1], sys.argv[2] + +# Extract and format +output_path = Path(output_dir) +output_path.mkdir(parents=True, exist_ok=True) +zipfile.ZipFile(input_file).extractall(output_path) + +# Pretty print all XML files +xml_files = list(output_path.rglob("*.xml")) + list(output_path.rglob("*.rels")) +for xml_file in xml_files: + content = xml_file.read_text(encoding="utf-8") + dom = defusedxml.minidom.parseString(content) + xml_file.write_bytes(dom.toprettyxml(indent=" ", encoding="ascii")) + +# For .docx files, suggest an RSID for tracked changes +if input_file.endswith(".docx"): + suggested_rsid = "".join(random.choices("0123456789ABCDEF", k=8)) + print(f"Suggested RSID for edit session: {suggested_rsid}") diff --git a/skills/pptx/ooxml/scripts/validate.py b/skills/pptx/ooxml/scripts/validate.py new file mode 100644 index 0000000..508c589 --- /dev/null +++ b/skills/pptx/ooxml/scripts/validate.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +""" +Command line tool to validate Office document XML files against XSD schemas and tracked changes. + +Usage: + python validate.py --original +""" + +import argparse +import sys +from pathlib import Path + +from validation import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator + + +def main(): + parser = argparse.ArgumentParser(description="Validate Office document XML files") + parser.add_argument( + "unpacked_dir", + help="Path to unpacked Office document directory", + ) + parser.add_argument( + "--original", + required=True, + help="Path to original file (.docx/.pptx/.xlsx)", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Enable verbose output", + ) + args = parser.parse_args() + + # Validate paths + unpacked_dir = Path(args.unpacked_dir) + original_file = Path(args.original) + file_extension = original_file.suffix.lower() + assert unpacked_dir.is_dir(), f"Error: {unpacked_dir} is not a directory" + assert original_file.is_file(), f"Error: {original_file} is not a file" + assert file_extension in [".docx", ".pptx", ".xlsx"], ( + f"Error: {original_file} must be a .docx, .pptx, or .xlsx file" + ) + + # Run validations + match file_extension: + case ".docx": + validators = [DOCXSchemaValidator, RedliningValidator] + case ".pptx": + validators = [PPTXSchemaValidator] + case _: + print(f"Error: Validation not supported for file type {file_extension}") + sys.exit(1) + + # Run validators + success = True + for V in validators: + validator = V(unpacked_dir, original_file, verbose=args.verbose) + if not validator.validate(): + success = False + + if success: + print("All validations PASSED!") + + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/skills/pptx/ooxml/scripts/validation/__init__.py b/skills/pptx/ooxml/scripts/validation/__init__.py new file mode 100644 index 0000000..db092ec --- /dev/null +++ b/skills/pptx/ooxml/scripts/validation/__init__.py @@ -0,0 +1,15 @@ +""" +Validation modules for Word document processing. +""" + +from .base import BaseSchemaValidator +from .docx import DOCXSchemaValidator +from .pptx import PPTXSchemaValidator +from .redlining import RedliningValidator + +__all__ = [ + "BaseSchemaValidator", + "DOCXSchemaValidator", + "PPTXSchemaValidator", + "RedliningValidator", +] diff --git a/skills/pptx/ooxml/scripts/validation/base.py b/skills/pptx/ooxml/scripts/validation/base.py new file mode 100644 index 0000000..0681b19 --- /dev/null +++ b/skills/pptx/ooxml/scripts/validation/base.py @@ -0,0 +1,951 @@ +""" +Base validator with common validation logic for document files. +""" + +import re +from pathlib import Path + +import lxml.etree + + +class BaseSchemaValidator: + """Base validator with common validation logic for document files.""" + + # Elements whose 'id' attributes must be unique within their file + # Format: element_name -> (attribute_name, scope) + # scope can be 'file' (unique within file) or 'global' (unique across all files) + UNIQUE_ID_REQUIREMENTS = { + # Word elements + "comment": ("id", "file"), # Comment IDs in comments.xml + "commentrangestart": ("id", "file"), # Must match comment IDs + "commentrangeend": ("id", "file"), # Must match comment IDs + "bookmarkstart": ("id", "file"), # Bookmark start IDs + "bookmarkend": ("id", "file"), # Bookmark end IDs + # Note: ins and del (track changes) can share IDs when part of same revision + # PowerPoint elements + "sldid": ("id", "file"), # Slide IDs in presentation.xml + "sldmasterid": ("id", "global"), # Slide master IDs must be globally unique + "sldlayoutid": ("id", "global"), # Slide layout IDs must be globally unique + "cm": ("authorid", "file"), # Comment author IDs + # Excel elements + "sheet": ("sheetid", "file"), # Sheet IDs in workbook.xml + "definedname": ("id", "file"), # Named range IDs + # Drawing/Shape elements (all formats) + "cxnsp": ("id", "file"), # Connection shape IDs + "sp": ("id", "file"), # Shape IDs + "pic": ("id", "file"), # Picture IDs + "grpsp": ("id", "file"), # Group shape IDs + } + + # Mapping of element names to expected relationship types + # Subclasses should override this with format-specific mappings + ELEMENT_RELATIONSHIP_TYPES = {} + + # Unified schema mappings for all Office document types + SCHEMA_MAPPINGS = { + # Document type specific schemas + "word": "ISO-IEC29500-4_2016/wml.xsd", # Word documents + "ppt": "ISO-IEC29500-4_2016/pml.xsd", # PowerPoint presentations + "xl": "ISO-IEC29500-4_2016/sml.xsd", # Excel spreadsheets + # Common file types + "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd", + "app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd", + "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd", + "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd", + ".rels": "ecma/fouth-edition/opc-relationships.xsd", + # Word-specific files + "people.xml": "microsoft/wml-2012.xsd", + "commentsIds.xml": "microsoft/wml-cid-2016.xsd", + "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd", + "commentsExtended.xml": "microsoft/wml-2012.xsd", + # Chart files (common across document types) + "chart": "ISO-IEC29500-4_2016/dml-chart.xsd", + # Theme files (common across document types) + "theme": "ISO-IEC29500-4_2016/dml-main.xsd", + # Drawing and media files + "drawing": "ISO-IEC29500-4_2016/dml-main.xsd", + } + + # Unified namespace constants + MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006" + XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace" + + # Common OOXML namespaces used across validators + PACKAGE_RELATIONSHIPS_NAMESPACE = ( + "http://schemas.openxmlformats.org/package/2006/relationships" + ) + OFFICE_RELATIONSHIPS_NAMESPACE = ( + "http://schemas.openxmlformats.org/officeDocument/2006/relationships" + ) + CONTENT_TYPES_NAMESPACE = ( + "http://schemas.openxmlformats.org/package/2006/content-types" + ) + + # Folders where we should clean ignorable namespaces + MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"} + + # All allowed OOXML namespaces (superset of all document types) + OOXML_NAMESPACES = { + "http://schemas.openxmlformats.org/officeDocument/2006/math", + "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + "http://schemas.openxmlformats.org/schemaLibrary/2006/main", + "http://schemas.openxmlformats.org/drawingml/2006/main", + "http://schemas.openxmlformats.org/drawingml/2006/chart", + "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/diagram", + "http://schemas.openxmlformats.org/drawingml/2006/picture", + "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", + "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + "http://schemas.openxmlformats.org/presentationml/2006/main", + "http://schemas.openxmlformats.org/spreadsheetml/2006/main", + "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes", + "http://www.w3.org/XML/1998/namespace", + } + + def __init__(self, unpacked_dir, original_file, verbose=False): + self.unpacked_dir = Path(unpacked_dir).resolve() + self.original_file = Path(original_file) + self.verbose = verbose + + # Set schemas directory + self.schemas_dir = Path(__file__).parent.parent.parent / "schemas" + + # Get all XML and .rels files + patterns = ["*.xml", "*.rels"] + self.xml_files = [ + f for pattern in patterns for f in self.unpacked_dir.rglob(pattern) + ] + + if not self.xml_files: + print(f"Warning: No XML files found in {self.unpacked_dir}") + + def validate(self): + """Run all validation checks and return True if all pass.""" + raise NotImplementedError("Subclasses must implement the validate method") + + def validate_xml(self): + """Validate that all XML files are well-formed.""" + errors = [] + + for xml_file in self.xml_files: + try: + # Try to parse the XML file + lxml.etree.parse(str(xml_file)) + except lxml.etree.XMLSyntaxError as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {e.lineno}: {e.msg}" + ) + except Exception as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Unexpected error: {str(e)}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} XML violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All XML files are well-formed") + return True + + def validate_namespaces(self): + """Validate that namespace prefixes in Ignorable attributes are declared.""" + errors = [] + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + declared = set(root.nsmap.keys()) - {None} # Exclude default namespace + + for attr_val in [ + v for k, v in root.attrib.items() if k.endswith("Ignorable") + ]: + undeclared = set(attr_val.split()) - declared + errors.extend( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Namespace '{ns}' in Ignorable but not declared" + for ns in undeclared + ) + except lxml.etree.XMLSyntaxError: + continue + + if errors: + print(f"FAILED - {len(errors)} namespace issues:") + for error in errors: + print(error) + return False + if self.verbose: + print("PASSED - All namespace prefixes properly declared") + return True + + def validate_unique_ids(self): + """Validate that specific IDs are unique according to OOXML requirements.""" + errors = [] + global_ids = {} # Track globally unique IDs across all files + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + file_ids = {} # Track IDs that must be unique within this file + + # Remove all mc:AlternateContent elements from the tree + mc_elements = root.xpath( + ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE} + ) + for elem in mc_elements: + elem.getparent().remove(elem) + + # Now check IDs in the cleaned tree + for elem in root.iter(): + # Get the element name without namespace + tag = ( + elem.tag.split("}")[-1].lower() + if "}" in elem.tag + else elem.tag.lower() + ) + + # Check if this element type has ID uniqueness requirements + if tag in self.UNIQUE_ID_REQUIREMENTS: + attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag] + + # Look for the specified attribute + id_value = None + for attr, value in elem.attrib.items(): + attr_local = ( + attr.split("}")[-1].lower() + if "}" in attr + else attr.lower() + ) + if attr_local == attr_name: + id_value = value + break + + if id_value is not None: + if scope == "global": + # Check global uniqueness + if id_value in global_ids: + prev_file, prev_line, prev_tag = global_ids[ + id_value + ] + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: Global ID '{id_value}' in <{tag}> " + f"already used in {prev_file} at line {prev_line} in <{prev_tag}>" + ) + else: + global_ids[id_value] = ( + xml_file.relative_to(self.unpacked_dir), + elem.sourceline, + tag, + ) + elif scope == "file": + # Check file-level uniqueness + key = (tag, attr_name) + if key not in file_ids: + file_ids[key] = {} + + if id_value in file_ids[key]: + prev_line = file_ids[key][id_value] + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> " + f"(first occurrence at line {prev_line})" + ) + else: + file_ids[key][id_value] = elem.sourceline + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} ID uniqueness violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All required IDs are unique") + return True + + def validate_file_references(self): + """ + Validate that all .rels files properly reference files and that all files are referenced. + """ + errors = [] + + # Find all .rels files + rels_files = list(self.unpacked_dir.rglob("*.rels")) + + if not rels_files: + if self.verbose: + print("PASSED - No .rels files found") + return True + + # Get all files in the unpacked directory (excluding reference files) + all_files = [] + for file_path in self.unpacked_dir.rglob("*"): + if ( + file_path.is_file() + and file_path.name != "[Content_Types].xml" + and not file_path.name.endswith(".rels") + ): # This file is not referenced by .rels + all_files.append(file_path.resolve()) + + # Track all files that are referenced by any .rels file + all_referenced_files = set() + + if self.verbose: + print( + f"Found {len(rels_files)} .rels files and {len(all_files)} target files" + ) + + # Check each .rels file + for rels_file in rels_files: + try: + # Parse relationships file + rels_root = lxml.etree.parse(str(rels_file)).getroot() + + # Get the directory where this .rels file is located + rels_dir = rels_file.parent + + # Find all relationships and their targets + referenced_files = set() + broken_refs = [] + + for rel in rels_root.findall( + ".//ns:Relationship", + namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE}, + ): + target = rel.get("Target") + if target and not target.startswith( + ("http", "mailto:") + ): # Skip external URLs + # Resolve the target path relative to the .rels file location + if rels_file.name == ".rels": + # Root .rels file - targets are relative to unpacked_dir + target_path = self.unpacked_dir / target + else: + # Other .rels files - targets are relative to their parent's parent + # e.g., word/_rels/document.xml.rels -> targets relative to word/ + base_dir = rels_dir.parent + target_path = base_dir / target + + # Normalize the path and check if it exists + try: + target_path = target_path.resolve() + if target_path.exists() and target_path.is_file(): + referenced_files.add(target_path) + all_referenced_files.add(target_path) + else: + broken_refs.append((target, rel.sourceline)) + except (OSError, ValueError): + broken_refs.append((target, rel.sourceline)) + + # Report broken references + if broken_refs: + rel_path = rels_file.relative_to(self.unpacked_dir) + for broken_ref, line_num in broken_refs: + errors.append( + f" {rel_path}: Line {line_num}: Broken reference to {broken_ref}" + ) + + except Exception as e: + rel_path = rels_file.relative_to(self.unpacked_dir) + errors.append(f" Error parsing {rel_path}: {e}") + + # Check for unreferenced files (files that exist but are not referenced anywhere) + unreferenced_files = set(all_files) - all_referenced_files + + if unreferenced_files: + for unref_file in sorted(unreferenced_files): + unref_rel_path = unref_file.relative_to(self.unpacked_dir) + errors.append(f" Unreferenced file: {unref_rel_path}") + + if errors: + print(f"FAILED - Found {len(errors)} relationship validation errors:") + for error in errors: + print(error) + print( + "CRITICAL: These errors will cause the document to appear corrupt. " + + "Broken references MUST be fixed, " + + "and unreferenced files MUST be referenced or removed." + ) + return False + else: + if self.verbose: + print( + "PASSED - All references are valid and all files are properly referenced" + ) + return True + + def validate_all_relationship_ids(self): + """ + Validate that all r:id attributes in XML files reference existing IDs + in their corresponding .rels files, and optionally validate relationship types. + """ + import lxml.etree + + errors = [] + + # Process each XML file that might contain r:id references + for xml_file in self.xml_files: + # Skip .rels files themselves + if xml_file.suffix == ".rels": + continue + + # Determine the corresponding .rels file + # For dir/file.xml, it's dir/_rels/file.xml.rels + rels_dir = xml_file.parent / "_rels" + rels_file = rels_dir / f"{xml_file.name}.rels" + + # Skip if there's no corresponding .rels file (that's okay) + if not rels_file.exists(): + continue + + try: + # Parse the .rels file to get valid relationship IDs and their types + rels_root = lxml.etree.parse(str(rels_file)).getroot() + rid_to_type = {} + + for rel in rels_root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rid = rel.get("Id") + rel_type = rel.get("Type", "") + if rid: + # Check for duplicate rIds + if rid in rid_to_type: + rels_rel_path = rels_file.relative_to(self.unpacked_dir) + errors.append( + f" {rels_rel_path}: Line {rel.sourceline}: " + f"Duplicate relationship ID '{rid}' (IDs must be unique)" + ) + # Extract just the type name from the full URL + type_name = ( + rel_type.split("/")[-1] if "/" in rel_type else rel_type + ) + rid_to_type[rid] = type_name + + # Parse the XML file to find all r:id references + xml_root = lxml.etree.parse(str(xml_file)).getroot() + + # Find all elements with r:id attributes + for elem in xml_root.iter(): + # Check for r:id attribute (relationship ID) + rid_attr = elem.get(f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id") + if rid_attr: + xml_rel_path = xml_file.relative_to(self.unpacked_dir) + elem_name = ( + elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag + ) + + # Check if the ID exists + if rid_attr not in rid_to_type: + errors.append( + f" {xml_rel_path}: Line {elem.sourceline}: " + f"<{elem_name}> references non-existent relationship '{rid_attr}' " + f"(valid IDs: {', '.join(sorted(rid_to_type.keys())[:5])}{'...' if len(rid_to_type) > 5 else ''})" + ) + # Check if we have type expectations for this element + elif self.ELEMENT_RELATIONSHIP_TYPES: + expected_type = self._get_expected_relationship_type( + elem_name + ) + if expected_type: + actual_type = rid_to_type[rid_attr] + # Check if the actual type matches or contains the expected type + if expected_type not in actual_type.lower(): + errors.append( + f" {xml_rel_path}: Line {elem.sourceline}: " + f"<{elem_name}> references '{rid_attr}' which points to '{actual_type}' " + f"but should point to a '{expected_type}' relationship" + ) + + except Exception as e: + xml_rel_path = xml_file.relative_to(self.unpacked_dir) + errors.append(f" Error processing {xml_rel_path}: {e}") + + if errors: + print(f"FAILED - Found {len(errors)} relationship ID reference errors:") + for error in errors: + print(error) + print("\nThese ID mismatches will cause the document to appear corrupt!") + return False + else: + if self.verbose: + print("PASSED - All relationship ID references are valid") + return True + + def _get_expected_relationship_type(self, element_name): + """ + Get the expected relationship type for an element. + First checks the explicit mapping, then tries pattern detection. + """ + # Normalize element name to lowercase + elem_lower = element_name.lower() + + # Check explicit mapping first + if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES: + return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower] + + # Try pattern detection for common patterns + # Pattern 1: Elements ending in "Id" often expect a relationship of the prefix type + if elem_lower.endswith("id") and len(elem_lower) > 2: + # e.g., "sldId" -> "sld", "sldMasterId" -> "sldMaster" + prefix = elem_lower[:-2] # Remove "id" + # Check if this might be a compound like "sldMasterId" + if prefix.endswith("master"): + return prefix.lower() + elif prefix.endswith("layout"): + return prefix.lower() + else: + # Simple case like "sldId" -> "slide" + # Common transformations + if prefix == "sld": + return "slide" + return prefix.lower() + + # Pattern 2: Elements ending in "Reference" expect a relationship of the prefix type + if elem_lower.endswith("reference") and len(elem_lower) > 9: + prefix = elem_lower[:-9] # Remove "reference" + return prefix.lower() + + return None + + def validate_content_types(self): + """Validate that all content files are properly declared in [Content_Types].xml.""" + errors = [] + + # Find [Content_Types].xml file + content_types_file = self.unpacked_dir / "[Content_Types].xml" + if not content_types_file.exists(): + print("FAILED - [Content_Types].xml file not found") + return False + + try: + # Parse and get all declared parts and extensions + root = lxml.etree.parse(str(content_types_file)).getroot() + declared_parts = set() + declared_extensions = set() + + # Get Override declarations (specific files) + for override in root.findall( + f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override" + ): + part_name = override.get("PartName") + if part_name is not None: + declared_parts.add(part_name.lstrip("/")) + + # Get Default declarations (by extension) + for default in root.findall( + f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default" + ): + extension = default.get("Extension") + if extension is not None: + declared_extensions.add(extension.lower()) + + # Root elements that require content type declaration + declarable_roots = { + "sld", + "sldLayout", + "sldMaster", + "presentation", # PowerPoint + "document", # Word + "workbook", + "worksheet", # Excel + "theme", # Common + } + + # Common media file extensions that should be declared + media_extensions = { + "png": "image/png", + "jpg": "image/jpeg", + "jpeg": "image/jpeg", + "gif": "image/gif", + "bmp": "image/bmp", + "tiff": "image/tiff", + "wmf": "image/x-wmf", + "emf": "image/x-emf", + } + + # Get all files in the unpacked directory + all_files = list(self.unpacked_dir.rglob("*")) + all_files = [f for f in all_files if f.is_file()] + + # Check all XML files for Override declarations + for xml_file in self.xml_files: + path_str = str(xml_file.relative_to(self.unpacked_dir)).replace( + "\\", "/" + ) + + # Skip non-content files + if any( + skip in path_str + for skip in [".rels", "[Content_Types]", "docProps/", "_rels/"] + ): + continue + + try: + root_tag = lxml.etree.parse(str(xml_file)).getroot().tag + root_name = root_tag.split("}")[-1] if "}" in root_tag else root_tag + + if root_name in declarable_roots and path_str not in declared_parts: + errors.append( + f" {path_str}: File with <{root_name}> root not declared in [Content_Types].xml" + ) + + except Exception: + continue # Skip unparseable files + + # Check all non-XML files for Default extension declarations + for file_path in all_files: + # Skip XML files and metadata files (already checked above) + if file_path.suffix.lower() in {".xml", ".rels"}: + continue + if file_path.name == "[Content_Types].xml": + continue + if "_rels" in file_path.parts or "docProps" in file_path.parts: + continue + + extension = file_path.suffix.lstrip(".").lower() + if extension and extension not in declared_extensions: + # Check if it's a known media extension that should be declared + if extension in media_extensions: + relative_path = file_path.relative_to(self.unpacked_dir) + errors.append( + f' {relative_path}: File with extension \'{extension}\' not declared in [Content_Types].xml - should add: ' + ) + + except Exception as e: + errors.append(f" Error parsing [Content_Types].xml: {e}") + + if errors: + print(f"FAILED - Found {len(errors)} content type declaration errors:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print( + "PASSED - All content files are properly declared in [Content_Types].xml" + ) + return True + + def validate_file_against_xsd(self, xml_file, verbose=False): + """Validate a single XML file against XSD schema, comparing with original. + + Args: + xml_file: Path to XML file to validate + verbose: Enable verbose output + + Returns: + tuple: (is_valid, new_errors_set) where is_valid is True/False/None (skipped) + """ + # Resolve both paths to handle symlinks + xml_file = Path(xml_file).resolve() + unpacked_dir = self.unpacked_dir.resolve() + + # Validate current file + is_valid, current_errors = self._validate_single_file_xsd( + xml_file, unpacked_dir + ) + + if is_valid is None: + return None, set() # Skipped + elif is_valid: + return True, set() # Valid, no errors + + # Get errors from original file for this specific file + original_errors = self._get_original_file_errors(xml_file) + + # Compare with original (both are guaranteed to be sets here) + assert current_errors is not None + new_errors = current_errors - original_errors + + if new_errors: + if verbose: + relative_path = xml_file.relative_to(unpacked_dir) + print(f"FAILED - {relative_path}: {len(new_errors)} new error(s)") + for error in list(new_errors)[:3]: + truncated = error[:250] + "..." if len(error) > 250 else error + print(f" - {truncated}") + return False, new_errors + else: + # All errors existed in original + if verbose: + print( + f"PASSED - No new errors (original had {len(current_errors)} errors)" + ) + return True, set() + + def validate_against_xsd(self): + """Validate XML files against XSD schemas, showing only new errors compared to original.""" + new_errors = [] + original_error_count = 0 + valid_count = 0 + skipped_count = 0 + + for xml_file in self.xml_files: + relative_path = str(xml_file.relative_to(self.unpacked_dir)) + is_valid, new_file_errors = self.validate_file_against_xsd( + xml_file, verbose=False + ) + + if is_valid is None: + skipped_count += 1 + continue + elif is_valid and not new_file_errors: + valid_count += 1 + continue + elif is_valid: + # Had errors but all existed in original + original_error_count += 1 + valid_count += 1 + continue + + # Has new errors + new_errors.append(f" {relative_path}: {len(new_file_errors)} new error(s)") + for error in list(new_file_errors)[:3]: # Show first 3 errors + new_errors.append( + f" - {error[:250]}..." if len(error) > 250 else f" - {error}" + ) + + # Print summary + if self.verbose: + print(f"Validated {len(self.xml_files)} files:") + print(f" - Valid: {valid_count}") + print(f" - Skipped (no schema): {skipped_count}") + if original_error_count: + print(f" - With original errors (ignored): {original_error_count}") + print( + f" - With NEW errors: {len(new_errors) > 0 and len([e for e in new_errors if not e.startswith(' ')]) or 0}" + ) + + if new_errors: + print("\nFAILED - Found NEW validation errors:") + for error in new_errors: + print(error) + return False + else: + if self.verbose: + print("\nPASSED - No new XSD validation errors introduced") + return True + + def _get_schema_path(self, xml_file): + """Determine the appropriate schema path for an XML file.""" + # Check exact filename match + if xml_file.name in self.SCHEMA_MAPPINGS: + return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name] + + # Check .rels files + if xml_file.suffix == ".rels": + return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"] + + # Check chart files + if "charts/" in str(xml_file) and xml_file.name.startswith("chart"): + return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"] + + # Check theme files + if "theme/" in str(xml_file) and xml_file.name.startswith("theme"): + return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"] + + # Check if file is in a main content folder and use appropriate schema + if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS: + return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name] + + return None + + def _clean_ignorable_namespaces(self, xml_doc): + """Remove attributes and elements not in allowed namespaces.""" + # Create a clean copy + xml_string = lxml.etree.tostring(xml_doc, encoding="unicode") + xml_copy = lxml.etree.fromstring(xml_string) + + # Remove attributes not in allowed namespaces + for elem in xml_copy.iter(): + attrs_to_remove = [] + + for attr in elem.attrib: + # Check if attribute is from a namespace other than allowed ones + if "{" in attr: + ns = attr.split("}")[0][1:] + if ns not in self.OOXML_NAMESPACES: + attrs_to_remove.append(attr) + + # Remove collected attributes + for attr in attrs_to_remove: + del elem.attrib[attr] + + # Remove elements not in allowed namespaces + self._remove_ignorable_elements(xml_copy) + + return lxml.etree.ElementTree(xml_copy) + + def _remove_ignorable_elements(self, root): + """Recursively remove all elements not in allowed namespaces.""" + elements_to_remove = [] + + # Find elements to remove + for elem in list(root): + # Skip non-element nodes (comments, processing instructions, etc.) + if not hasattr(elem, "tag") or callable(elem.tag): + continue + + tag_str = str(elem.tag) + if tag_str.startswith("{"): + ns = tag_str.split("}")[0][1:] + if ns not in self.OOXML_NAMESPACES: + elements_to_remove.append(elem) + continue + + # Recursively clean child elements + self._remove_ignorable_elements(elem) + + # Remove collected elements + for elem in elements_to_remove: + root.remove(elem) + + def _preprocess_for_mc_ignorable(self, xml_doc): + """Preprocess XML to handle mc:Ignorable attribute properly.""" + # Remove mc:Ignorable attributes before validation + root = xml_doc.getroot() + + # Remove mc:Ignorable attribute from root + if f"{{{self.MC_NAMESPACE}}}Ignorable" in root.attrib: + del root.attrib[f"{{{self.MC_NAMESPACE}}}Ignorable"] + + return xml_doc + + def _validate_single_file_xsd(self, xml_file, base_path): + """Validate a single XML file against XSD schema. Returns (is_valid, errors_set).""" + schema_path = self._get_schema_path(xml_file) + if not schema_path: + return None, None # Skip file + + try: + # Load schema + with open(schema_path, "rb") as xsd_file: + parser = lxml.etree.XMLParser() + xsd_doc = lxml.etree.parse( + xsd_file, parser=parser, base_url=str(schema_path) + ) + schema = lxml.etree.XMLSchema(xsd_doc) + + # Load and preprocess XML + with open(xml_file, "r") as f: + xml_doc = lxml.etree.parse(f) + + xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc) + xml_doc = self._preprocess_for_mc_ignorable(xml_doc) + + # Clean ignorable namespaces if needed + relative_path = xml_file.relative_to(base_path) + if ( + relative_path.parts + and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS + ): + xml_doc = self._clean_ignorable_namespaces(xml_doc) + + # Validate + if schema.validate(xml_doc): + return True, set() + else: + errors = set() + for error in schema.error_log: + # Store normalized error message (without line numbers for comparison) + errors.add(error.message) + return False, errors + + except Exception as e: + return False, {str(e)} + + def _get_original_file_errors(self, xml_file): + """Get XSD validation errors from a single file in the original document. + + Args: + xml_file: Path to the XML file in unpacked_dir to check + + Returns: + set: Set of error messages from the original file + """ + import tempfile + import zipfile + + # Resolve both paths to handle symlinks (e.g., /var vs /private/var on macOS) + xml_file = Path(xml_file).resolve() + unpacked_dir = self.unpacked_dir.resolve() + relative_path = xml_file.relative_to(unpacked_dir) + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Extract original file + with zipfile.ZipFile(self.original_file, "r") as zip_ref: + zip_ref.extractall(temp_path) + + # Find corresponding file in original + original_xml_file = temp_path / relative_path + + if not original_xml_file.exists(): + # File didn't exist in original, so no original errors + return set() + + # Validate the specific file in original + is_valid, errors = self._validate_single_file_xsd( + original_xml_file, temp_path + ) + return errors if errors else set() + + def _remove_template_tags_from_text_nodes(self, xml_doc): + """Remove template tags from XML text nodes and collect warnings. + + Template tags follow the pattern {{ ... }} and are used as placeholders + for content replacement. They should be removed from text content before + XSD validation while preserving XML structure. + + Returns: + tuple: (cleaned_xml_doc, warnings_list) + """ + warnings = [] + template_pattern = re.compile(r"\{\{[^}]*\}\}") + + # Create a copy of the document to avoid modifying the original + xml_string = lxml.etree.tostring(xml_doc, encoding="unicode") + xml_copy = lxml.etree.fromstring(xml_string) + + def process_text_content(text, content_type): + if not text: + return text + matches = list(template_pattern.finditer(text)) + if matches: + for match in matches: + warnings.append( + f"Found template tag in {content_type}: {match.group()}" + ) + return template_pattern.sub("", text) + return text + + # Process all text nodes in the document + for elem in xml_copy.iter(): + # Skip processing if this is a w:t element + if not hasattr(elem, "tag") or callable(elem.tag): + continue + tag_str = str(elem.tag) + if tag_str.endswith("}t") or tag_str == "t": + continue + + elem.text = process_text_content(elem.text, "text content") + elem.tail = process_text_content(elem.tail, "tail content") + + return lxml.etree.ElementTree(xml_copy), warnings + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/skills/pptx/ooxml/scripts/validation/docx.py b/skills/pptx/ooxml/scripts/validation/docx.py new file mode 100644 index 0000000..602c470 --- /dev/null +++ b/skills/pptx/ooxml/scripts/validation/docx.py @@ -0,0 +1,274 @@ +""" +Validator for Word document XML files against XSD schemas. +""" + +import re +import tempfile +import zipfile + +import lxml.etree + +from .base import BaseSchemaValidator + + +class DOCXSchemaValidator(BaseSchemaValidator): + """Validator for Word document XML files against XSD schemas.""" + + # Word-specific namespace + WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + + # Word-specific element to relationship type mappings + # Start with empty mapping - add specific cases as we discover them + ELEMENT_RELATIONSHIP_TYPES = {} + + def validate(self): + """Run all validation checks and return True if all pass.""" + # Test 0: XML well-formedness + if not self.validate_xml(): + return False + + # Test 1: Namespace declarations + all_valid = True + if not self.validate_namespaces(): + all_valid = False + + # Test 2: Unique IDs + if not self.validate_unique_ids(): + all_valid = False + + # Test 3: Relationship and file reference validation + if not self.validate_file_references(): + all_valid = False + + # Test 4: Content type declarations + if not self.validate_content_types(): + all_valid = False + + # Test 5: XSD schema validation + if not self.validate_against_xsd(): + all_valid = False + + # Test 6: Whitespace preservation + if not self.validate_whitespace_preservation(): + all_valid = False + + # Test 7: Deletion validation + if not self.validate_deletions(): + all_valid = False + + # Test 8: Insertion validation + if not self.validate_insertions(): + all_valid = False + + # Test 9: Relationship ID reference validation + if not self.validate_all_relationship_ids(): + all_valid = False + + # Count and compare paragraphs + self.compare_paragraph_counts() + + return all_valid + + def validate_whitespace_preservation(self): + """ + Validate that w:t elements with whitespace have xml:space='preserve'. + """ + errors = [] + + for xml_file in self.xml_files: + # Only check document.xml files + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + # Find all w:t elements + for elem in root.iter(f"{{{self.WORD_2006_NAMESPACE}}}t"): + if elem.text: + text = elem.text + # Check if text starts or ends with whitespace + if re.match(r"^\s.*", text) or re.match(r".*\s$", text): + # Check if xml:space="preserve" attribute exists + xml_space_attr = f"{{{self.XML_NAMESPACE}}}space" + if ( + xml_space_attr not in elem.attrib + or elem.attrib[xml_space_attr] != "preserve" + ): + # Show a preview of the text + text_preview = ( + repr(text)[:50] + "..." + if len(repr(text)) > 50 + else repr(text) + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: w:t element with whitespace missing xml:space='preserve': {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} whitespace preservation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All whitespace is properly preserved") + return True + + def validate_deletions(self): + """ + Validate that w:t elements are not within w:del elements. + For some reason, XSD validation does not catch this, so we do it manually. + """ + errors = [] + + for xml_file in self.xml_files: + # Only check document.xml files + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + # Find all w:t elements that are descendants of w:del elements + namespaces = {"w": self.WORD_2006_NAMESPACE} + xpath_expression = ".//w:del//w:t" + problematic_t_elements = root.xpath( + xpath_expression, namespaces=namespaces + ) + for t_elem in problematic_t_elements: + if t_elem.text: + # Show a preview of the text + text_preview = ( + repr(t_elem.text)[:50] + "..." + if len(repr(t_elem.text)) > 50 + else repr(t_elem.text) + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {t_elem.sourceline}: found within : {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} deletion validation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - No w:t elements found within w:del elements") + return True + + def count_paragraphs_in_unpacked(self): + """Count the number of paragraphs in the unpacked document.""" + count = 0 + + for xml_file in self.xml_files: + # Only check document.xml files + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + # Count all w:p elements + paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p") + count = len(paragraphs) + except Exception as e: + print(f"Error counting paragraphs in unpacked document: {e}") + + return count + + def count_paragraphs_in_original(self): + """Count the number of paragraphs in the original docx file.""" + count = 0 + + try: + # Create temporary directory to unpack original + with tempfile.TemporaryDirectory() as temp_dir: + # Unpack original docx + with zipfile.ZipFile(self.original_file, "r") as zip_ref: + zip_ref.extractall(temp_dir) + + # Parse document.xml + doc_xml_path = temp_dir + "/word/document.xml" + root = lxml.etree.parse(doc_xml_path).getroot() + + # Count all w:p elements + paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p") + count = len(paragraphs) + + except Exception as e: + print(f"Error counting paragraphs in original document: {e}") + + return count + + def validate_insertions(self): + """ + Validate that w:delText elements are not within w:ins elements. + w:delText is only allowed in w:ins if nested within a w:del. + """ + errors = [] + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + namespaces = {"w": self.WORD_2006_NAMESPACE} + + # Find w:delText in w:ins that are NOT within w:del + invalid_elements = root.xpath( + ".//w:ins//w:delText[not(ancestor::w:del)]", + namespaces=namespaces + ) + + for elem in invalid_elements: + text_preview = ( + repr(elem.text or "")[:50] + "..." + if len(repr(elem.text or "")) > 50 + else repr(elem.text or "") + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: within : {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} insertion validation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - No w:delText elements within w:ins elements") + return True + + def compare_paragraph_counts(self): + """Compare paragraph counts between original and new document.""" + original_count = self.count_paragraphs_in_original() + new_count = self.count_paragraphs_in_unpacked() + + diff = new_count - original_count + diff_str = f"+{diff}" if diff > 0 else str(diff) + print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})") + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/skills/pptx/ooxml/scripts/validation/pptx.py b/skills/pptx/ooxml/scripts/validation/pptx.py new file mode 100644 index 0000000..66d5b1e --- /dev/null +++ b/skills/pptx/ooxml/scripts/validation/pptx.py @@ -0,0 +1,315 @@ +""" +Validator for PowerPoint presentation XML files against XSD schemas. +""" + +import re + +from .base import BaseSchemaValidator + + +class PPTXSchemaValidator(BaseSchemaValidator): + """Validator for PowerPoint presentation XML files against XSD schemas.""" + + # PowerPoint presentation namespace + PRESENTATIONML_NAMESPACE = ( + "http://schemas.openxmlformats.org/presentationml/2006/main" + ) + + # PowerPoint-specific element to relationship type mappings + ELEMENT_RELATIONSHIP_TYPES = { + "sldid": "slide", + "sldmasterid": "slidemaster", + "notesmasterid": "notesmaster", + "sldlayoutid": "slidelayout", + "themeid": "theme", + "tablestyleid": "tablestyles", + } + + def validate(self): + """Run all validation checks and return True if all pass.""" + # Test 0: XML well-formedness + if not self.validate_xml(): + return False + + # Test 1: Namespace declarations + all_valid = True + if not self.validate_namespaces(): + all_valid = False + + # Test 2: Unique IDs + if not self.validate_unique_ids(): + all_valid = False + + # Test 3: UUID ID validation + if not self.validate_uuid_ids(): + all_valid = False + + # Test 4: Relationship and file reference validation + if not self.validate_file_references(): + all_valid = False + + # Test 5: Slide layout ID validation + if not self.validate_slide_layout_ids(): + all_valid = False + + # Test 6: Content type declarations + if not self.validate_content_types(): + all_valid = False + + # Test 7: XSD schema validation + if not self.validate_against_xsd(): + all_valid = False + + # Test 8: Notes slide reference validation + if not self.validate_notes_slide_references(): + all_valid = False + + # Test 9: Relationship ID reference validation + if not self.validate_all_relationship_ids(): + all_valid = False + + # Test 10: Duplicate slide layout references validation + if not self.validate_no_duplicate_slide_layouts(): + all_valid = False + + return all_valid + + def validate_uuid_ids(self): + """Validate that ID attributes that look like UUIDs contain only hex values.""" + import lxml.etree + + errors = [] + # UUID pattern: 8-4-4-4-12 hex digits with optional braces/hyphens + uuid_pattern = re.compile( + r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$" + ) + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + # Check all elements for ID attributes + for elem in root.iter(): + for attr, value in elem.attrib.items(): + # Check if this is an ID attribute + attr_name = attr.split("}")[-1].lower() + if attr_name == "id" or attr_name.endswith("id"): + # Check if value looks like a UUID (has the right length and pattern structure) + if self._looks_like_uuid(value): + # Validate that it contains only hex characters in the right positions + if not uuid_pattern.match(value): + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: ID '{value}' appears to be a UUID but contains invalid hex characters" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} UUID ID validation errors:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All UUID-like IDs contain valid hex values") + return True + + def _looks_like_uuid(self, value): + """Check if a value has the general structure of a UUID.""" + # Remove common UUID delimiters + clean_value = value.strip("{}()").replace("-", "") + # Check if it's 32 hex-like characters (could include invalid hex chars) + return len(clean_value) == 32 and all(c.isalnum() for c in clean_value) + + def validate_slide_layout_ids(self): + """Validate that sldLayoutId elements in slide masters reference valid slide layouts.""" + import lxml.etree + + errors = [] + + # Find all slide master files + slide_masters = list(self.unpacked_dir.glob("ppt/slideMasters/*.xml")) + + if not slide_masters: + if self.verbose: + print("PASSED - No slide masters found") + return True + + for slide_master in slide_masters: + try: + # Parse the slide master file + root = lxml.etree.parse(str(slide_master)).getroot() + + # Find the corresponding _rels file for this slide master + rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels" + + if not rels_file.exists(): + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: " + f"Missing relationships file: {rels_file.relative_to(self.unpacked_dir)}" + ) + continue + + # Parse the relationships file + rels_root = lxml.etree.parse(str(rels_file)).getroot() + + # Build a set of valid relationship IDs that point to slide layouts + valid_layout_rids = set() + for rel in rels_root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rel_type = rel.get("Type", "") + if "slideLayout" in rel_type: + valid_layout_rids.add(rel.get("Id")) + + # Find all sldLayoutId elements in the slide master + for sld_layout_id in root.findall( + f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId" + ): + r_id = sld_layout_id.get( + f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id" + ) + layout_id = sld_layout_id.get("id") + + if r_id and r_id not in valid_layout_rids: + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: " + f"Line {sld_layout_id.sourceline}: sldLayoutId with id='{layout_id}' " + f"references r:id='{r_id}' which is not found in slide layout relationships" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} slide layout ID validation errors:") + for error in errors: + print(error) + print( + "Remove invalid references or add missing slide layouts to the relationships file." + ) + return False + else: + if self.verbose: + print("PASSED - All slide layout IDs reference valid slide layouts") + return True + + def validate_no_duplicate_slide_layouts(self): + """Validate that each slide has exactly one slideLayout reference.""" + import lxml.etree + + errors = [] + slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels")) + + for rels_file in slide_rels_files: + try: + root = lxml.etree.parse(str(rels_file)).getroot() + + # Find all slideLayout relationships + layout_rels = [ + rel + for rel in root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ) + if "slideLayout" in rel.get("Type", "") + ] + + if len(layout_rels) > 1: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: has {len(layout_rels)} slideLayout references" + ) + + except Exception as e: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print("FAILED - Found slides with duplicate slideLayout references:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All slides have exactly one slideLayout reference") + return True + + def validate_notes_slide_references(self): + """Validate that each notesSlide file is referenced by only one slide.""" + import lxml.etree + + errors = [] + notes_slide_references = {} # Track which slides reference each notesSlide + + # Find all slide relationship files + slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels")) + + if not slide_rels_files: + if self.verbose: + print("PASSED - No slide relationship files found") + return True + + for rels_file in slide_rels_files: + try: + # Parse the relationships file + root = lxml.etree.parse(str(rels_file)).getroot() + + # Find all notesSlide relationships + for rel in root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rel_type = rel.get("Type", "") + if "notesSlide" in rel_type: + target = rel.get("Target", "") + if target: + # Normalize the target path to handle relative paths + normalized_target = target.replace("../", "") + + # Track which slide references this notesSlide + slide_name = rels_file.stem.replace( + ".xml", "" + ) # e.g., "slide1" + + if normalized_target not in notes_slide_references: + notes_slide_references[normalized_target] = [] + notes_slide_references[normalized_target].append( + (slide_name, rels_file) + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + # Check for duplicate references + for target, references in notes_slide_references.items(): + if len(references) > 1: + slide_names = [ref[0] for ref in references] + errors.append( + f" Notes slide '{target}' is referenced by multiple slides: {', '.join(slide_names)}" + ) + for slide_name, rels_file in references: + errors.append(f" - {rels_file.relative_to(self.unpacked_dir)}") + + if errors: + print( + f"FAILED - Found {len([e for e in errors if not e.startswith(' ')])} notes slide reference validation errors:" + ) + for error in errors: + print(error) + print("Each slide may optionally have its own slide file.") + return False + else: + if self.verbose: + print("PASSED - All notes slide references are unique") + return True + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/skills/pptx/ooxml/scripts/validation/redlining.py b/skills/pptx/ooxml/scripts/validation/redlining.py new file mode 100644 index 0000000..7ed425e --- /dev/null +++ b/skills/pptx/ooxml/scripts/validation/redlining.py @@ -0,0 +1,279 @@ +""" +Validator for tracked changes in Word documents. +""" + +import subprocess +import tempfile +import zipfile +from pathlib import Path + + +class RedliningValidator: + """Validator for tracked changes in Word documents.""" + + def __init__(self, unpacked_dir, original_docx, verbose=False): + self.unpacked_dir = Path(unpacked_dir) + self.original_docx = Path(original_docx) + self.verbose = verbose + self.namespaces = { + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + } + + def validate(self): + """Main validation method that returns True if valid, False otherwise.""" + # Verify unpacked directory exists and has correct structure + modified_file = self.unpacked_dir / "word" / "document.xml" + if not modified_file.exists(): + print(f"FAILED - Modified document.xml not found at {modified_file}") + return False + + # First, check if there are any tracked changes by Claude to validate + try: + import xml.etree.ElementTree as ET + + tree = ET.parse(modified_file) + root = tree.getroot() + + # Check for w:del or w:ins tags authored by Claude + del_elements = root.findall(".//w:del", self.namespaces) + ins_elements = root.findall(".//w:ins", self.namespaces) + + # Filter to only include changes by Claude + claude_del_elements = [ + elem + for elem in del_elements + if elem.get(f"{{{self.namespaces['w']}}}author") == "Claude" + ] + claude_ins_elements = [ + elem + for elem in ins_elements + if elem.get(f"{{{self.namespaces['w']}}}author") == "Claude" + ] + + # Redlining validation is only needed if tracked changes by Claude have been used. + if not claude_del_elements and not claude_ins_elements: + if self.verbose: + print("PASSED - No tracked changes by Claude found.") + return True + + except Exception: + # If we can't parse the XML, continue with full validation + pass + + # Create temporary directory for unpacking original docx + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Unpack original docx + try: + with zipfile.ZipFile(self.original_docx, "r") as zip_ref: + zip_ref.extractall(temp_path) + except Exception as e: + print(f"FAILED - Error unpacking original docx: {e}") + return False + + original_file = temp_path / "word" / "document.xml" + if not original_file.exists(): + print( + f"FAILED - Original document.xml not found in {self.original_docx}" + ) + return False + + # Parse both XML files using xml.etree.ElementTree for redlining validation + try: + import xml.etree.ElementTree as ET + + modified_tree = ET.parse(modified_file) + modified_root = modified_tree.getroot() + original_tree = ET.parse(original_file) + original_root = original_tree.getroot() + except ET.ParseError as e: + print(f"FAILED - Error parsing XML files: {e}") + return False + + # Remove Claude's tracked changes from both documents + self._remove_claude_tracked_changes(original_root) + self._remove_claude_tracked_changes(modified_root) + + # Extract and compare text content + modified_text = self._extract_text_content(modified_root) + original_text = self._extract_text_content(original_root) + + if modified_text != original_text: + # Show detailed character-level differences for each paragraph + error_message = self._generate_detailed_diff( + original_text, modified_text + ) + print(error_message) + return False + + if self.verbose: + print("PASSED - All changes by Claude are properly tracked") + return True + + def _generate_detailed_diff(self, original_text, modified_text): + """Generate detailed word-level differences using git word diff.""" + error_parts = [ + "FAILED - Document text doesn't match after removing Claude's tracked changes", + "", + "Likely causes:", + " 1. Modified text inside another author's or tags", + " 2. Made edits without proper tracked changes", + " 3. Didn't nest inside when deleting another's insertion", + "", + "For pre-redlined documents, use correct patterns:", + " - To reject another's INSERTION: Nest inside their ", + " - To restore another's DELETION: Add new AFTER their ", + "", + ] + + # Show git word diff + git_diff = self._get_git_word_diff(original_text, modified_text) + if git_diff: + error_parts.extend(["Differences:", "============", git_diff]) + else: + error_parts.append("Unable to generate word diff (git not available)") + + return "\n".join(error_parts) + + def _get_git_word_diff(self, original_text, modified_text): + """Generate word diff using git with character-level precision.""" + try: + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Create two files + original_file = temp_path / "original.txt" + modified_file = temp_path / "modified.txt" + + original_file.write_text(original_text, encoding="utf-8") + modified_file.write_text(modified_text, encoding="utf-8") + + # Try character-level diff first for precise differences + result = subprocess.run( + [ + "git", + "diff", + "--word-diff=plain", + "--word-diff-regex=.", # Character-by-character diff + "-U0", # Zero lines of context - show only changed lines + "--no-index", + str(original_file), + str(modified_file), + ], + capture_output=True, + text=True, + ) + + if result.stdout.strip(): + # Clean up the output - remove git diff header lines + lines = result.stdout.split("\n") + # Skip the header lines (diff --git, index, +++, ---, @@) + content_lines = [] + in_content = False + for line in lines: + if line.startswith("@@"): + in_content = True + continue + if in_content and line.strip(): + content_lines.append(line) + + if content_lines: + return "\n".join(content_lines) + + # Fallback to word-level diff if character-level is too verbose + result = subprocess.run( + [ + "git", + "diff", + "--word-diff=plain", + "-U0", # Zero lines of context + "--no-index", + str(original_file), + str(modified_file), + ], + capture_output=True, + text=True, + ) + + if result.stdout.strip(): + lines = result.stdout.split("\n") + content_lines = [] + in_content = False + for line in lines: + if line.startswith("@@"): + in_content = True + continue + if in_content and line.strip(): + content_lines.append(line) + return "\n".join(content_lines) + + except (subprocess.CalledProcessError, FileNotFoundError, Exception): + # Git not available or other error, return None to use fallback + pass + + return None + + def _remove_claude_tracked_changes(self, root): + """Remove tracked changes authored by Claude from the XML root.""" + ins_tag = f"{{{self.namespaces['w']}}}ins" + del_tag = f"{{{self.namespaces['w']}}}del" + author_attr = f"{{{self.namespaces['w']}}}author" + + # Remove w:ins elements + for parent in root.iter(): + to_remove = [] + for child in parent: + if child.tag == ins_tag and child.get(author_attr) == "Claude": + to_remove.append(child) + for elem in to_remove: + parent.remove(elem) + + # Unwrap content in w:del elements where author is "Claude" + deltext_tag = f"{{{self.namespaces['w']}}}delText" + t_tag = f"{{{self.namespaces['w']}}}t" + + for parent in root.iter(): + to_process = [] + for child in parent: + if child.tag == del_tag and child.get(author_attr) == "Claude": + to_process.append((child, list(parent).index(child))) + + # Process in reverse order to maintain indices + for del_elem, del_index in reversed(to_process): + # Convert w:delText to w:t before moving + for elem in del_elem.iter(): + if elem.tag == deltext_tag: + elem.tag = t_tag + + # Move all children of w:del to its parent before removing w:del + for child in reversed(list(del_elem)): + parent.insert(del_index, child) + parent.remove(del_elem) + + def _extract_text_content(self, root): + """Extract text content from Word XML, preserving paragraph structure. + + Empty paragraphs are skipped to avoid false positives when tracked + insertions add only structural elements without text content. + """ + p_tag = f"{{{self.namespaces['w']}}}p" + t_tag = f"{{{self.namespaces['w']}}}t" + + paragraphs = [] + for p_elem in root.findall(f".//{p_tag}"): + # Get all text elements within this paragraph + text_parts = [] + for t_elem in p_elem.findall(f".//{t_tag}"): + if t_elem.text: + text_parts.append(t_elem.text) + paragraph_text = "".join(text_parts) + # Skip empty paragraphs - they don't affect content validation + if paragraph_text: + paragraphs.append(paragraph_text) + + return "\n".join(paragraphs) + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/skills/pptx/scripts/inventory.py b/skills/pptx/scripts/inventory.py new file mode 100644 index 0000000..edda390 --- /dev/null +++ b/skills/pptx/scripts/inventory.py @@ -0,0 +1,1020 @@ +#!/usr/bin/env python3 +""" +Extract structured text content from PowerPoint presentations. + +This module provides functionality to: +- Extract all text content from PowerPoint shapes +- Preserve paragraph formatting (alignment, bullets, fonts, spacing) +- Handle nested GroupShapes recursively with correct absolute positions +- Sort shapes by visual position on slides +- Filter out slide numbers and non-content placeholders +- Export to JSON with clean, structured data + +Classes: + ParagraphData: Represents a text paragraph with formatting + ShapeData: Represents a shape with position and text content + +Main Functions: + extract_text_inventory: Extract all text from a presentation + save_inventory: Save extracted data to JSON + +Usage: + python inventory.py input.pptx output.json +""" + +import argparse +import json +import platform +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +from PIL import Image, ImageDraw, ImageFont +from pptx import Presentation +from pptx.enum.text import PP_ALIGN +from pptx.shapes.base import BaseShape + +# Type aliases for cleaner signatures +JsonValue = Union[str, int, float, bool, None] +ParagraphDict = Dict[str, JsonValue] +ShapeDict = Dict[ + str, Union[str, float, bool, List[ParagraphDict], List[str], Dict[str, Any], None] +] +InventoryData = Dict[ + str, Dict[str, "ShapeData"] +] # Dict of slide_id -> {shape_id -> ShapeData} +InventoryDict = Dict[str, Dict[str, ShapeDict]] # JSON-serializable inventory + + +def main(): + """Main entry point for command-line usage.""" + parser = argparse.ArgumentParser( + description="Extract text inventory from PowerPoint with proper GroupShape support.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python inventory.py presentation.pptx inventory.json + Extracts text inventory with correct absolute positions for grouped shapes + + python inventory.py presentation.pptx inventory.json --issues-only + Extracts only text shapes that have overflow or overlap issues + +The output JSON includes: + - All text content organized by slide and shape + - Correct absolute positions for shapes in groups + - Visual position and size in inches + - Paragraph properties and formatting + - Issue detection: text overflow and shape overlaps + """, + ) + + parser.add_argument("input", help="Input PowerPoint file (.pptx)") + parser.add_argument("output", help="Output JSON file for inventory") + parser.add_argument( + "--issues-only", + action="store_true", + help="Include only text shapes that have overflow or overlap issues", + ) + + args = parser.parse_args() + + input_path = Path(args.input) + if not input_path.exists(): + print(f"Error: Input file not found: {args.input}") + sys.exit(1) + + if not input_path.suffix.lower() == ".pptx": + print("Error: Input must be a PowerPoint file (.pptx)") + sys.exit(1) + + try: + print(f"Extracting text inventory from: {args.input}") + if args.issues_only: + print( + "Filtering to include only text shapes with issues (overflow/overlap)" + ) + inventory = extract_text_inventory(input_path, issues_only=args.issues_only) + + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + save_inventory(inventory, output_path) + + print(f"Output saved to: {args.output}") + + # Report statistics + total_slides = len(inventory) + total_shapes = sum(len(shapes) for shapes in inventory.values()) + if args.issues_only: + if total_shapes > 0: + print( + f"Found {total_shapes} text elements with issues in {total_slides} slides" + ) + else: + print("No issues discovered") + else: + print( + f"Found text in {total_slides} slides with {total_shapes} text elements" + ) + + except Exception as e: + print(f"Error processing presentation: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +@dataclass +class ShapeWithPosition: + """A shape with its absolute position on the slide.""" + + shape: BaseShape + absolute_left: int # in EMUs + absolute_top: int # in EMUs + + +class ParagraphData: + """Data structure for paragraph properties extracted from a PowerPoint paragraph.""" + + def __init__(self, paragraph: Any): + """Initialize from a PowerPoint paragraph object. + + Args: + paragraph: The PowerPoint paragraph object + """ + self.text: str = paragraph.text.strip() + self.bullet: bool = False + self.level: Optional[int] = None + self.alignment: Optional[str] = None + self.space_before: Optional[float] = None + self.space_after: Optional[float] = None + self.font_name: Optional[str] = None + self.font_size: Optional[float] = None + self.bold: Optional[bool] = None + self.italic: Optional[bool] = None + self.underline: Optional[bool] = None + self.color: Optional[str] = None + self.theme_color: Optional[str] = None + self.line_spacing: Optional[float] = None + + # Check for bullet formatting + if ( + hasattr(paragraph, "_p") + and paragraph._p is not None + and paragraph._p.pPr is not None + ): + pPr = paragraph._p.pPr + ns = "{http://schemas.openxmlformats.org/drawingml/2006/main}" + if ( + pPr.find(f"{ns}buChar") is not None + or pPr.find(f"{ns}buAutoNum") is not None + ): + self.bullet = True + if hasattr(paragraph, "level"): + self.level = paragraph.level + + # Add alignment if not LEFT (default) + if hasattr(paragraph, "alignment") and paragraph.alignment is not None: + alignment_map = { + PP_ALIGN.CENTER: "CENTER", + PP_ALIGN.RIGHT: "RIGHT", + PP_ALIGN.JUSTIFY: "JUSTIFY", + } + if paragraph.alignment in alignment_map: + self.alignment = alignment_map[paragraph.alignment] + + # Add spacing properties if set + if hasattr(paragraph, "space_before") and paragraph.space_before: + self.space_before = paragraph.space_before.pt + if hasattr(paragraph, "space_after") and paragraph.space_after: + self.space_after = paragraph.space_after.pt + + # Extract font properties from first run + if paragraph.runs: + first_run = paragraph.runs[0] + if hasattr(first_run, "font"): + font = first_run.font + if font.name: + self.font_name = font.name + if font.size: + self.font_size = font.size.pt + if font.bold is not None: + self.bold = font.bold + if font.italic is not None: + self.italic = font.italic + if font.underline is not None: + self.underline = font.underline + + # Handle color - both RGB and theme colors + try: + # Try RGB color first + if font.color.rgb: + self.color = str(font.color.rgb) + except (AttributeError, TypeError): + # Fall back to theme color + try: + if font.color.theme_color: + self.theme_color = font.color.theme_color.name + except (AttributeError, TypeError): + pass + + # Add line spacing if set + if hasattr(paragraph, "line_spacing") and paragraph.line_spacing is not None: + if hasattr(paragraph.line_spacing, "pt"): + self.line_spacing = round(paragraph.line_spacing.pt, 2) + else: + # Multiplier - convert to points + font_size = self.font_size if self.font_size else 12.0 + self.line_spacing = round(paragraph.line_spacing * font_size, 2) + + def to_dict(self) -> ParagraphDict: + """Convert to dictionary for JSON serialization, excluding None values.""" + result: ParagraphDict = {"text": self.text} + + # Add optional fields only if they have values + if self.bullet: + result["bullet"] = self.bullet + if self.level is not None: + result["level"] = self.level + if self.alignment: + result["alignment"] = self.alignment + if self.space_before is not None: + result["space_before"] = self.space_before + if self.space_after is not None: + result["space_after"] = self.space_after + if self.font_name: + result["font_name"] = self.font_name + if self.font_size is not None: + result["font_size"] = self.font_size + if self.bold is not None: + result["bold"] = self.bold + if self.italic is not None: + result["italic"] = self.italic + if self.underline is not None: + result["underline"] = self.underline + if self.color: + result["color"] = self.color + if self.theme_color: + result["theme_color"] = self.theme_color + if self.line_spacing is not None: + result["line_spacing"] = self.line_spacing + + return result + + +class ShapeData: + """Data structure for shape properties extracted from a PowerPoint shape.""" + + @staticmethod + def emu_to_inches(emu: int) -> float: + """Convert EMUs (English Metric Units) to inches.""" + return emu / 914400.0 + + @staticmethod + def inches_to_pixels(inches: float, dpi: int = 96) -> int: + """Convert inches to pixels at given DPI.""" + return int(inches * dpi) + + @staticmethod + def get_font_path(font_name: str) -> Optional[str]: + """Get the font file path for a given font name. + + Args: + font_name: Name of the font (e.g., 'Arial', 'Calibri') + + Returns: + Path to the font file, or None if not found + """ + system = platform.system() + + # Common font file variations to try + font_variations = [ + font_name, + font_name.lower(), + font_name.replace(" ", ""), + font_name.replace(" ", "-"), + ] + + # Define font directories and extensions by platform + if system == "Darwin": # macOS + font_dirs = [ + "/System/Library/Fonts/", + "/Library/Fonts/", + "~/Library/Fonts/", + ] + extensions = [".ttf", ".otf", ".ttc", ".dfont"] + else: # Linux + font_dirs = [ + "/usr/share/fonts/truetype/", + "/usr/local/share/fonts/", + "~/.fonts/", + ] + extensions = [".ttf", ".otf"] + + # Try to find the font file + from pathlib import Path + + for font_dir in font_dirs: + font_dir_path = Path(font_dir).expanduser() + if not font_dir_path.exists(): + continue + + # First try exact matches + for variant in font_variations: + for ext in extensions: + font_path = font_dir_path / f"{variant}{ext}" + if font_path.exists(): + return str(font_path) + + # Then try fuzzy matching - find files containing the font name + try: + for file_path in font_dir_path.iterdir(): + if file_path.is_file(): + file_name_lower = file_path.name.lower() + font_name_lower = font_name.lower().replace(" ", "") + if font_name_lower in file_name_lower and any( + file_name_lower.endswith(ext) for ext in extensions + ): + return str(file_path) + except (OSError, PermissionError): + continue + + return None + + @staticmethod + def get_slide_dimensions(slide: Any) -> tuple[Optional[int], Optional[int]]: + """Get slide dimensions from slide object. + + Args: + slide: Slide object + + Returns: + Tuple of (width_emu, height_emu) or (None, None) if not found + """ + try: + prs = slide.part.package.presentation_part.presentation + return prs.slide_width, prs.slide_height + except (AttributeError, TypeError): + return None, None + + @staticmethod + def get_default_font_size(shape: BaseShape, slide_layout: Any) -> Optional[float]: + """Extract default font size from slide layout for a placeholder shape. + + Args: + shape: Placeholder shape + slide_layout: Slide layout containing the placeholder definition + + Returns: + Default font size in points, or None if not found + """ + try: + if not hasattr(shape, "placeholder_format"): + return None + + shape_type = shape.placeholder_format.type # type: ignore + for layout_placeholder in slide_layout.placeholders: + if layout_placeholder.placeholder_format.type == shape_type: + # Find first defRPr element with sz (size) attribute + for elem in layout_placeholder.element.iter(): + if "defRPr" in elem.tag and (sz := elem.get("sz")): + return float(sz) / 100.0 # Convert EMUs to points + break + except Exception: + pass + return None + + def __init__( + self, + shape: BaseShape, + absolute_left: Optional[int] = None, + absolute_top: Optional[int] = None, + slide: Optional[Any] = None, + ): + """Initialize from a PowerPoint shape object. + + Args: + shape: The PowerPoint shape object (should be pre-validated) + absolute_left: Absolute left position in EMUs (for shapes in groups) + absolute_top: Absolute top position in EMUs (for shapes in groups) + slide: Optional slide object to get dimensions and layout information + """ + self.shape = shape # Store reference to original shape + self.shape_id: str = "" # Will be set after sorting + + # Get slide dimensions from slide object + self.slide_width_emu, self.slide_height_emu = ( + self.get_slide_dimensions(slide) if slide else (None, None) + ) + + # Get placeholder type if applicable + self.placeholder_type: Optional[str] = None + self.default_font_size: Optional[float] = None + if hasattr(shape, "is_placeholder") and shape.is_placeholder: # type: ignore + if shape.placeholder_format and shape.placeholder_format.type: # type: ignore + self.placeholder_type = ( + str(shape.placeholder_format.type).split(".")[-1].split(" ")[0] # type: ignore + ) + + # Get default font size from layout + if slide and hasattr(slide, "slide_layout"): + self.default_font_size = self.get_default_font_size( + shape, slide.slide_layout + ) + + # Get position information + # Use absolute positions if provided (for shapes in groups), otherwise use shape's position + left_emu = ( + absolute_left + if absolute_left is not None + else (shape.left if hasattr(shape, "left") else 0) + ) + top_emu = ( + absolute_top + if absolute_top is not None + else (shape.top if hasattr(shape, "top") else 0) + ) + + self.left: float = round(self.emu_to_inches(left_emu), 2) # type: ignore + self.top: float = round(self.emu_to_inches(top_emu), 2) # type: ignore + self.width: float = round( + self.emu_to_inches(shape.width if hasattr(shape, "width") else 0), + 2, # type: ignore + ) + self.height: float = round( + self.emu_to_inches(shape.height if hasattr(shape, "height") else 0), + 2, # type: ignore + ) + + # Store EMU positions for overflow calculations + self.left_emu = left_emu + self.top_emu = top_emu + self.width_emu = shape.width if hasattr(shape, "width") else 0 + self.height_emu = shape.height if hasattr(shape, "height") else 0 + + # Calculate overflow status + self.frame_overflow_bottom: Optional[float] = None + self.slide_overflow_right: Optional[float] = None + self.slide_overflow_bottom: Optional[float] = None + self.overlapping_shapes: Dict[ + str, float + ] = {} # Dict of shape_id -> overlap area in sq inches + self.warnings: List[str] = [] + self._estimate_frame_overflow() + self._calculate_slide_overflow() + self._detect_bullet_issues() + + @property + def paragraphs(self) -> List[ParagraphData]: + """Calculate paragraphs from the shape's text frame.""" + if not self.shape or not hasattr(self.shape, "text_frame"): + return [] + + paragraphs = [] + for paragraph in self.shape.text_frame.paragraphs: # type: ignore + if paragraph.text.strip(): + paragraphs.append(ParagraphData(paragraph)) + return paragraphs + + def _get_default_font_size(self) -> int: + """Get default font size from theme text styles or use conservative default.""" + try: + if not ( + hasattr(self.shape, "part") and hasattr(self.shape.part, "slide_layout") + ): + return 14 + + slide_master = self.shape.part.slide_layout.slide_master # type: ignore + if not hasattr(slide_master, "element"): + return 14 + + # Determine theme style based on placeholder type + style_name = "bodyStyle" # Default + if self.placeholder_type and "TITLE" in self.placeholder_type: + style_name = "titleStyle" + + # Find font size in theme styles + for child in slide_master.element.iter(): + tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag + if tag == style_name: + for elem in child.iter(): + if "sz" in elem.attrib: + return int(elem.attrib["sz"]) // 100 + except Exception: + pass + + return 14 # Conservative default for body text + + def _get_usable_dimensions(self, text_frame) -> Tuple[int, int]: + """Get usable width and height in pixels after accounting for margins.""" + # Default PowerPoint margins in inches + margins = {"top": 0.05, "bottom": 0.05, "left": 0.1, "right": 0.1} + + # Override with actual margins if set + if hasattr(text_frame, "margin_top") and text_frame.margin_top: + margins["top"] = self.emu_to_inches(text_frame.margin_top) + if hasattr(text_frame, "margin_bottom") and text_frame.margin_bottom: + margins["bottom"] = self.emu_to_inches(text_frame.margin_bottom) + if hasattr(text_frame, "margin_left") and text_frame.margin_left: + margins["left"] = self.emu_to_inches(text_frame.margin_left) + if hasattr(text_frame, "margin_right") and text_frame.margin_right: + margins["right"] = self.emu_to_inches(text_frame.margin_right) + + # Calculate usable area + usable_width = self.width - margins["left"] - margins["right"] + usable_height = self.height - margins["top"] - margins["bottom"] + + # Convert to pixels + return ( + self.inches_to_pixels(usable_width), + self.inches_to_pixels(usable_height), + ) + + def _wrap_text_line(self, line: str, max_width_px: int, draw, font) -> List[str]: + """Wrap a single line of text to fit within max_width_px.""" + if not line: + return [""] + + # Use textlength for efficient width calculation + if draw.textlength(line, font=font) <= max_width_px: + return [line] + + # Need to wrap - split into words + wrapped = [] + words = line.split(" ") + current_line = "" + + for word in words: + test_line = current_line + (" " if current_line else "") + word + if draw.textlength(test_line, font=font) <= max_width_px: + current_line = test_line + else: + if current_line: + wrapped.append(current_line) + current_line = word + + if current_line: + wrapped.append(current_line) + + return wrapped + + def _estimate_frame_overflow(self) -> None: + """Estimate if text overflows the shape bounds using PIL text measurement.""" + if not self.shape or not hasattr(self.shape, "text_frame"): + return + + text_frame = self.shape.text_frame # type: ignore + if not text_frame or not text_frame.paragraphs: + return + + # Get usable dimensions after accounting for margins + usable_width_px, usable_height_px = self._get_usable_dimensions(text_frame) + if usable_width_px <= 0 or usable_height_px <= 0: + return + + # Set up PIL for text measurement + dummy_img = Image.new("RGB", (1, 1)) + draw = ImageDraw.Draw(dummy_img) + + # Get default font size from placeholder or use conservative estimate + default_font_size = self._get_default_font_size() + + # Calculate total height of all paragraphs + total_height_px = 0 + + for para_idx, paragraph in enumerate(text_frame.paragraphs): + if not paragraph.text.strip(): + continue + + para_data = ParagraphData(paragraph) + + # Load font for this paragraph + font_name = para_data.font_name or "Arial" + font_size = int(para_data.font_size or default_font_size) + + font = None + font_path = self.get_font_path(font_name) + if font_path: + try: + font = ImageFont.truetype(font_path, size=font_size) + except Exception: + font = ImageFont.load_default() + else: + font = ImageFont.load_default() + + # Wrap all lines in this paragraph + all_wrapped_lines = [] + for line in paragraph.text.split("\n"): + wrapped = self._wrap_text_line(line, usable_width_px, draw, font) + all_wrapped_lines.extend(wrapped) + + if all_wrapped_lines: + # Calculate line height + if para_data.line_spacing: + # Custom line spacing explicitly set + line_height_px = para_data.line_spacing * 96 / 72 + else: + # PowerPoint default single spacing (1.0x font size) + line_height_px = font_size * 96 / 72 + + # Add space_before (except first paragraph) + if para_idx > 0 and para_data.space_before: + total_height_px += para_data.space_before * 96 / 72 + + # Add paragraph text height + total_height_px += len(all_wrapped_lines) * line_height_px + + # Add space_after + if para_data.space_after: + total_height_px += para_data.space_after * 96 / 72 + + # Check for overflow (ignore negligible overflows <= 0.05") + if total_height_px > usable_height_px: + overflow_px = total_height_px - usable_height_px + overflow_inches = round(overflow_px / 96.0, 2) + if overflow_inches > 0.05: # Only report significant overflows + self.frame_overflow_bottom = overflow_inches + + def _calculate_slide_overflow(self) -> None: + """Calculate if shape overflows the slide boundaries.""" + if self.slide_width_emu is None or self.slide_height_emu is None: + return + + # Check right overflow (ignore negligible overflows <= 0.01") + right_edge_emu = self.left_emu + self.width_emu + if right_edge_emu > self.slide_width_emu: + overflow_emu = right_edge_emu - self.slide_width_emu + overflow_inches = round(self.emu_to_inches(overflow_emu), 2) + if overflow_inches > 0.01: # Only report significant overflows + self.slide_overflow_right = overflow_inches + + # Check bottom overflow (ignore negligible overflows <= 0.01") + bottom_edge_emu = self.top_emu + self.height_emu + if bottom_edge_emu > self.slide_height_emu: + overflow_emu = bottom_edge_emu - self.slide_height_emu + overflow_inches = round(self.emu_to_inches(overflow_emu), 2) + if overflow_inches > 0.01: # Only report significant overflows + self.slide_overflow_bottom = overflow_inches + + def _detect_bullet_issues(self) -> None: + """Detect bullet point formatting issues in paragraphs.""" + if not self.shape or not hasattr(self.shape, "text_frame"): + return + + text_frame = self.shape.text_frame # type: ignore + if not text_frame or not text_frame.paragraphs: + return + + # Common bullet symbols that indicate manual bullets + bullet_symbols = ["•", "●", "○"] + + for paragraph in text_frame.paragraphs: + text = paragraph.text.strip() + # Check for manual bullet symbols + if text and any(text.startswith(symbol + " ") for symbol in bullet_symbols): + self.warnings.append( + "manual_bullet_symbol: use proper bullet formatting" + ) + break + + @property + def has_any_issues(self) -> bool: + """Check if shape has any issues (overflow, overlap, or warnings).""" + return ( + self.frame_overflow_bottom is not None + or self.slide_overflow_right is not None + or self.slide_overflow_bottom is not None + or len(self.overlapping_shapes) > 0 + or len(self.warnings) > 0 + ) + + def to_dict(self) -> ShapeDict: + """Convert to dictionary for JSON serialization.""" + result: ShapeDict = { + "left": self.left, + "top": self.top, + "width": self.width, + "height": self.height, + } + + # Add optional fields if present + if self.placeholder_type: + result["placeholder_type"] = self.placeholder_type + + if self.default_font_size: + result["default_font_size"] = self.default_font_size + + # Add overflow information only if there is overflow + overflow_data = {} + + # Add frame overflow if present + if self.frame_overflow_bottom is not None: + overflow_data["frame"] = {"overflow_bottom": self.frame_overflow_bottom} + + # Add slide overflow if present + slide_overflow = {} + if self.slide_overflow_right is not None: + slide_overflow["overflow_right"] = self.slide_overflow_right + if self.slide_overflow_bottom is not None: + slide_overflow["overflow_bottom"] = self.slide_overflow_bottom + if slide_overflow: + overflow_data["slide"] = slide_overflow + + # Only add overflow field if there is overflow + if overflow_data: + result["overflow"] = overflow_data + + # Add overlap field if there are overlapping shapes + if self.overlapping_shapes: + result["overlap"] = {"overlapping_shapes": self.overlapping_shapes} + + # Add warnings field if there are warnings + if self.warnings: + result["warnings"] = self.warnings + + # Add paragraphs after placeholder_type + result["paragraphs"] = [para.to_dict() for para in self.paragraphs] + + return result + + +def is_valid_shape(shape: BaseShape) -> bool: + """Check if a shape contains meaningful text content.""" + # Must have a text frame with content + if not hasattr(shape, "text_frame") or not shape.text_frame: # type: ignore + return False + + text = shape.text_frame.text.strip() # type: ignore + if not text: + return False + + # Skip slide numbers and numeric footers + if hasattr(shape, "is_placeholder") and shape.is_placeholder: # type: ignore + if shape.placeholder_format and shape.placeholder_format.type: # type: ignore + placeholder_type = ( + str(shape.placeholder_format.type).split(".")[-1].split(" ")[0] # type: ignore + ) + if placeholder_type == "SLIDE_NUMBER": + return False + if placeholder_type == "FOOTER" and text.isdigit(): + return False + + return True + + +def collect_shapes_with_absolute_positions( + shape: BaseShape, parent_left: int = 0, parent_top: int = 0 +) -> List[ShapeWithPosition]: + """Recursively collect all shapes with valid text, calculating absolute positions. + + For shapes within groups, their positions are relative to the group. + This function calculates the absolute position on the slide by accumulating + parent group offsets. + + Args: + shape: The shape to process + parent_left: Accumulated left offset from parent groups (in EMUs) + parent_top: Accumulated top offset from parent groups (in EMUs) + + Returns: + List of ShapeWithPosition objects with absolute positions + """ + if hasattr(shape, "shapes"): # GroupShape + result = [] + # Get this group's position + group_left = shape.left if hasattr(shape, "left") else 0 + group_top = shape.top if hasattr(shape, "top") else 0 + + # Calculate absolute position for this group + abs_group_left = parent_left + group_left + abs_group_top = parent_top + group_top + + # Process children with accumulated offsets + for child in shape.shapes: # type: ignore + result.extend( + collect_shapes_with_absolute_positions( + child, abs_group_left, abs_group_top + ) + ) + return result + + # Regular shape - check if it has valid text + if is_valid_shape(shape): + # Calculate absolute position + shape_left = shape.left if hasattr(shape, "left") else 0 + shape_top = shape.top if hasattr(shape, "top") else 0 + + return [ + ShapeWithPosition( + shape=shape, + absolute_left=parent_left + shape_left, + absolute_top=parent_top + shape_top, + ) + ] + + return [] + + +def sort_shapes_by_position(shapes: List[ShapeData]) -> List[ShapeData]: + """Sort shapes by visual position (top-to-bottom, left-to-right). + + Shapes within 0.5 inches vertically are considered on the same row. + """ + if not shapes: + return shapes + + # Sort by top position first + shapes = sorted(shapes, key=lambda s: (s.top, s.left)) + + # Group shapes by row (within 0.5 inches vertically) + result = [] + row = [shapes[0]] + row_top = shapes[0].top + + for shape in shapes[1:]: + if abs(shape.top - row_top) <= 0.5: + row.append(shape) + else: + # Sort current row by left position and add to result + result.extend(sorted(row, key=lambda s: s.left)) + row = [shape] + row_top = shape.top + + # Don't forget the last row + result.extend(sorted(row, key=lambda s: s.left)) + return result + + +def calculate_overlap( + rect1: Tuple[float, float, float, float], + rect2: Tuple[float, float, float, float], + tolerance: float = 0.05, +) -> Tuple[bool, float]: + """Calculate if and how much two rectangles overlap. + + Args: + rect1: (left, top, width, height) of first rectangle in inches + rect2: (left, top, width, height) of second rectangle in inches + tolerance: Minimum overlap in inches to consider as overlapping (default: 0.05") + + Returns: + Tuple of (overlaps, overlap_area) where: + - overlaps: True if rectangles overlap by more than tolerance + - overlap_area: Area of overlap in square inches + """ + left1, top1, w1, h1 = rect1 + left2, top2, w2, h2 = rect2 + + # Calculate overlap dimensions + overlap_width = min(left1 + w1, left2 + w2) - max(left1, left2) + overlap_height = min(top1 + h1, top2 + h2) - max(top1, top2) + + # Check if there's meaningful overlap (more than tolerance) + if overlap_width > tolerance and overlap_height > tolerance: + # Calculate overlap area in square inches + overlap_area = overlap_width * overlap_height + return True, round(overlap_area, 2) + + return False, 0 + + +def detect_overlaps(shapes: List[ShapeData]) -> None: + """Detect overlapping shapes and update their overlapping_shapes dictionaries. + + This function requires each ShapeData to have its shape_id already set. + It modifies the shapes in-place, adding shape IDs with overlap areas in square inches. + + Args: + shapes: List of ShapeData objects with shape_id attributes set + """ + n = len(shapes) + + # Compare each pair of shapes + for i in range(n): + for j in range(i + 1, n): + shape1 = shapes[i] + shape2 = shapes[j] + + # Ensure shape IDs are set + assert shape1.shape_id, f"Shape at index {i} has no shape_id" + assert shape2.shape_id, f"Shape at index {j} has no shape_id" + + rect1 = (shape1.left, shape1.top, shape1.width, shape1.height) + rect2 = (shape2.left, shape2.top, shape2.width, shape2.height) + + overlaps, overlap_area = calculate_overlap(rect1, rect2) + + if overlaps: + # Add shape IDs with overlap area in square inches + shape1.overlapping_shapes[shape2.shape_id] = overlap_area + shape2.overlapping_shapes[shape1.shape_id] = overlap_area + + +def extract_text_inventory( + pptx_path: Path, prs: Optional[Any] = None, issues_only: bool = False +) -> InventoryData: + """Extract text content from all slides in a PowerPoint presentation. + + Args: + pptx_path: Path to the PowerPoint file + prs: Optional Presentation object to use. If not provided, will load from pptx_path. + issues_only: If True, only include shapes that have overflow or overlap issues + + Returns a nested dictionary: {slide-N: {shape-N: ShapeData}} + Shapes are sorted by visual position (top-to-bottom, left-to-right). + The ShapeData objects contain the full shape information and can be + converted to dictionaries for JSON serialization using to_dict(). + """ + if prs is None: + prs = Presentation(str(pptx_path)) + inventory: InventoryData = {} + + for slide_idx, slide in enumerate(prs.slides): + # Collect all valid shapes from this slide with absolute positions + shapes_with_positions = [] + for shape in slide.shapes: # type: ignore + shapes_with_positions.extend(collect_shapes_with_absolute_positions(shape)) + + if not shapes_with_positions: + continue + + # Convert to ShapeData with absolute positions and slide reference + shape_data_list = [ + ShapeData( + swp.shape, + swp.absolute_left, + swp.absolute_top, + slide, + ) + for swp in shapes_with_positions + ] + + # Sort by visual position and assign stable IDs in one step + sorted_shapes = sort_shapes_by_position(shape_data_list) + for idx, shape_data in enumerate(sorted_shapes): + shape_data.shape_id = f"shape-{idx}" + + # Detect overlaps using the stable shape IDs + if len(sorted_shapes) > 1: + detect_overlaps(sorted_shapes) + + # Filter for issues only if requested (after overlap detection) + if issues_only: + sorted_shapes = [sd for sd in sorted_shapes if sd.has_any_issues] + + if not sorted_shapes: + continue + + # Create slide inventory using the stable shape IDs + inventory[f"slide-{slide_idx}"] = { + shape_data.shape_id: shape_data for shape_data in sorted_shapes + } + + return inventory + + +def get_inventory_as_dict(pptx_path: Path, issues_only: bool = False) -> InventoryDict: + """Extract text inventory and return as JSON-serializable dictionaries. + + This is a convenience wrapper around extract_text_inventory that returns + dictionaries instead of ShapeData objects, useful for testing and direct + JSON serialization. + + Args: + pptx_path: Path to the PowerPoint file + issues_only: If True, only include shapes that have overflow or overlap issues + + Returns: + Nested dictionary with all data serialized for JSON + """ + inventory = extract_text_inventory(pptx_path, issues_only=issues_only) + + # Convert ShapeData objects to dictionaries + dict_inventory: InventoryDict = {} + for slide_key, shapes in inventory.items(): + dict_inventory[slide_key] = { + shape_key: shape_data.to_dict() for shape_key, shape_data in shapes.items() + } + + return dict_inventory + + +def save_inventory(inventory: InventoryData, output_path: Path) -> None: + """Save inventory to JSON file with proper formatting. + + Converts ShapeData objects to dictionaries for JSON serialization. + """ + # Convert ShapeData objects to dictionaries + json_inventory: InventoryDict = {} + for slide_key, shapes in inventory.items(): + json_inventory[slide_key] = { + shape_key: shape_data.to_dict() for shape_key, shape_data in shapes.items() + } + + with open(output_path, "w", encoding="utf-8") as f: + json.dump(json_inventory, f, indent=2, ensure_ascii=False) + + +if __name__ == "__main__": + main() diff --git a/skills/pptx/scripts/rearrange.py b/skills/pptx/scripts/rearrange.py new file mode 100644 index 0000000..2519911 --- /dev/null +++ b/skills/pptx/scripts/rearrange.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python3 +""" +Rearrange PowerPoint slides based on a sequence of indices. + +Usage: + python rearrange.py template.pptx output.pptx 0,34,34,50,52 + +This will create output.pptx using slides from template.pptx in the specified order. +Slides can be repeated (e.g., 34 appears twice). +""" + +import argparse +import shutil +import sys +from copy import deepcopy +from pathlib import Path + +import six +from pptx import Presentation + + +def main(): + parser = argparse.ArgumentParser( + description="Rearrange PowerPoint slides based on a sequence of indices.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python rearrange.py template.pptx output.pptx 0,34,34,50,52 + Creates output.pptx using slides 0, 34 (twice), 50, and 52 from template.pptx + + python rearrange.py template.pptx output.pptx 5,3,1,2,4 + Creates output.pptx with slides reordered as specified + +Note: Slide indices are 0-based (first slide is 0, second is 1, etc.) + """, + ) + + parser.add_argument("template", help="Path to template PPTX file") + parser.add_argument("output", help="Path for output PPTX file") + parser.add_argument( + "sequence", help="Comma-separated sequence of slide indices (0-based)" + ) + + args = parser.parse_args() + + # Parse the slide sequence + try: + slide_sequence = [int(x.strip()) for x in args.sequence.split(",")] + except ValueError: + print( + "Error: Invalid sequence format. Use comma-separated integers (e.g., 0,34,34,50,52)" + ) + sys.exit(1) + + # Check template exists + template_path = Path(args.template) + if not template_path.exists(): + print(f"Error: Template file not found: {args.template}") + sys.exit(1) + + # Create output directory if needed + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + + try: + rearrange_presentation(template_path, output_path, slide_sequence) + except ValueError as e: + print(f"Error: {e}") + sys.exit(1) + except Exception as e: + print(f"Error processing presentation: {e}") + sys.exit(1) + + +def duplicate_slide(pres, index): + """Duplicate a slide in the presentation.""" + source = pres.slides[index] + + # Use source's layout to preserve formatting + new_slide = pres.slides.add_slide(source.slide_layout) + + # Collect all image and media relationships from the source slide + image_rels = {} + for rel_id, rel in six.iteritems(source.part.rels): + if "image" in rel.reltype or "media" in rel.reltype: + image_rels[rel_id] = rel + + # CRITICAL: Clear placeholder shapes to avoid duplicates + for shape in new_slide.shapes: + sp = shape.element + sp.getparent().remove(sp) + + # Copy all shapes from source + for shape in source.shapes: + el = shape.element + new_el = deepcopy(el) + new_slide.shapes._spTree.insert_element_before(new_el, "p:extLst") + + # Handle picture shapes - need to update the blip reference + # Look for all blip elements (they can be in pic or other contexts) + # Using the element's own xpath method without namespaces argument + blips = new_el.xpath(".//a:blip[@r:embed]") + for blip in blips: + old_rId = blip.get( + "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed" + ) + if old_rId in image_rels: + # Create a new relationship in the destination slide for this image + old_rel = image_rels[old_rId] + # get_or_add returns the rId directly, or adds and returns new rId + new_rId = new_slide.part.rels.get_or_add( + old_rel.reltype, old_rel._target + ) + # Update the blip's embed reference to use the new relationship ID + blip.set( + "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed", + new_rId, + ) + + # Copy any additional image/media relationships that might be referenced elsewhere + for rel_id, rel in image_rels.items(): + try: + new_slide.part.rels.get_or_add(rel.reltype, rel._target) + except Exception: + pass # Relationship might already exist + + return new_slide + + +def delete_slide(pres, index): + """Delete a slide from the presentation.""" + rId = pres.slides._sldIdLst[index].rId + pres.part.drop_rel(rId) + del pres.slides._sldIdLst[index] + + +def reorder_slides(pres, slide_index, target_index): + """Move a slide from one position to another.""" + slides = pres.slides._sldIdLst + + # Remove slide element from current position + slide_element = slides[slide_index] + slides.remove(slide_element) + + # Insert at target position + slides.insert(target_index, slide_element) + + +def rearrange_presentation(template_path, output_path, slide_sequence): + """ + Create a new presentation with slides from template in specified order. + + Args: + template_path: Path to template PPTX file + output_path: Path for output PPTX file + slide_sequence: List of slide indices (0-based) to include + """ + # Copy template to preserve dimensions and theme + if template_path != output_path: + shutil.copy2(template_path, output_path) + prs = Presentation(output_path) + else: + prs = Presentation(template_path) + + total_slides = len(prs.slides) + + # Validate indices + for idx in slide_sequence: + if idx < 0 or idx >= total_slides: + raise ValueError(f"Slide index {idx} out of range (0-{total_slides - 1})") + + # Track original slides and their duplicates + slide_map = [] # List of actual slide indices for final presentation + duplicated = {} # Track duplicates: original_idx -> [duplicate_indices] + + # Step 1: DUPLICATE repeated slides + print(f"Processing {len(slide_sequence)} slides from template...") + for i, template_idx in enumerate(slide_sequence): + if template_idx in duplicated and duplicated[template_idx]: + # Already duplicated this slide, use the duplicate + slide_map.append(duplicated[template_idx].pop(0)) + print(f" [{i}] Using duplicate of slide {template_idx}") + elif slide_sequence.count(template_idx) > 1 and template_idx not in duplicated: + # First occurrence of a repeated slide - create duplicates + slide_map.append(template_idx) + duplicates = [] + count = slide_sequence.count(template_idx) - 1 + print( + f" [{i}] Using original slide {template_idx}, creating {count} duplicate(s)" + ) + for _ in range(count): + duplicate_slide(prs, template_idx) + duplicates.append(len(prs.slides) - 1) + duplicated[template_idx] = duplicates + else: + # Unique slide or first occurrence already handled, use original + slide_map.append(template_idx) + print(f" [{i}] Using original slide {template_idx}") + + # Step 2: DELETE unwanted slides (work backwards) + slides_to_keep = set(slide_map) + print(f"\nDeleting {len(prs.slides) - len(slides_to_keep)} unused slides...") + for i in range(len(prs.slides) - 1, -1, -1): + if i not in slides_to_keep: + delete_slide(prs, i) + # Update slide_map indices after deletion + slide_map = [idx - 1 if idx > i else idx for idx in slide_map] + + # Step 3: REORDER to final sequence + print(f"Reordering {len(slide_map)} slides to final sequence...") + for target_pos in range(len(slide_map)): + # Find which slide should be at target_pos + current_pos = slide_map[target_pos] + if current_pos != target_pos: + reorder_slides(prs, current_pos, target_pos) + # Update slide_map: the move shifts other slides + for i in range(len(slide_map)): + if slide_map[i] > current_pos and slide_map[i] <= target_pos: + slide_map[i] -= 1 + elif slide_map[i] < current_pos and slide_map[i] >= target_pos: + slide_map[i] += 1 + slide_map[target_pos] = target_pos + + # Save the presentation + prs.save(output_path) + print(f"\nSaved rearranged presentation to: {output_path}") + print(f"Final presentation has {len(prs.slides)} slides") + + +if __name__ == "__main__": + main() diff --git a/skills/pptx/scripts/replace.py b/skills/pptx/scripts/replace.py new file mode 100644 index 0000000..8f7a8b1 --- /dev/null +++ b/skills/pptx/scripts/replace.py @@ -0,0 +1,385 @@ +#!/usr/bin/env python3 +"""Apply text replacements to PowerPoint presentation. + +Usage: + python replace.py + +The replacements JSON should have the structure output by inventory.py. +ALL text shapes identified by inventory.py will have their text cleared +unless "paragraphs" is specified in the replacements for that shape. +""" + +import json +import sys +from pathlib import Path +from typing import Any, Dict, List + +from inventory import InventoryData, extract_text_inventory +from pptx import Presentation +from pptx.dml.color import RGBColor +from pptx.enum.dml import MSO_THEME_COLOR +from pptx.enum.text import PP_ALIGN +from pptx.oxml.xmlchemy import OxmlElement +from pptx.util import Pt + + +def clear_paragraph_bullets(paragraph): + """Clear bullet formatting from a paragraph.""" + pPr = paragraph._element.get_or_add_pPr() + + # Remove existing bullet elements + for child in list(pPr): + if ( + child.tag.endswith("buChar") + or child.tag.endswith("buNone") + or child.tag.endswith("buAutoNum") + or child.tag.endswith("buFont") + ): + pPr.remove(child) + + return pPr + + +def apply_paragraph_properties(paragraph, para_data: Dict[str, Any]): + """Apply formatting properties to a paragraph.""" + # Get the text but don't set it on paragraph directly yet + text = para_data.get("text", "") + + # Get or create paragraph properties + pPr = clear_paragraph_bullets(paragraph) + + # Handle bullet formatting + if para_data.get("bullet", False): + level = para_data.get("level", 0) + paragraph.level = level + + # Calculate font-proportional indentation + font_size = para_data.get("font_size", 18.0) + level_indent_emu = int((font_size * (1.6 + level * 1.6)) * 12700) + hanging_indent_emu = int(-font_size * 0.8 * 12700) + + # Set indentation + pPr.attrib["marL"] = str(level_indent_emu) + pPr.attrib["indent"] = str(hanging_indent_emu) + + # Add bullet character + buChar = OxmlElement("a:buChar") + buChar.set("char", "•") + pPr.append(buChar) + + # Default to left alignment for bullets if not specified + if "alignment" not in para_data: + paragraph.alignment = PP_ALIGN.LEFT + else: + # Remove indentation for non-bullet text + pPr.attrib["marL"] = "0" + pPr.attrib["indent"] = "0" + + # Add buNone element + buNone = OxmlElement("a:buNone") + pPr.insert(0, buNone) + + # Apply alignment + if "alignment" in para_data: + alignment_map = { + "LEFT": PP_ALIGN.LEFT, + "CENTER": PP_ALIGN.CENTER, + "RIGHT": PP_ALIGN.RIGHT, + "JUSTIFY": PP_ALIGN.JUSTIFY, + } + if para_data["alignment"] in alignment_map: + paragraph.alignment = alignment_map[para_data["alignment"]] + + # Apply spacing + if "space_before" in para_data: + paragraph.space_before = Pt(para_data["space_before"]) + if "space_after" in para_data: + paragraph.space_after = Pt(para_data["space_after"]) + if "line_spacing" in para_data: + paragraph.line_spacing = Pt(para_data["line_spacing"]) + + # Apply run-level formatting + if not paragraph.runs: + run = paragraph.add_run() + run.text = text + else: + run = paragraph.runs[0] + run.text = text + + # Apply font properties + apply_font_properties(run, para_data) + + +def apply_font_properties(run, para_data: Dict[str, Any]): + """Apply font properties to a text run.""" + if "bold" in para_data: + run.font.bold = para_data["bold"] + if "italic" in para_data: + run.font.italic = para_data["italic"] + if "underline" in para_data: + run.font.underline = para_data["underline"] + if "font_size" in para_data: + run.font.size = Pt(para_data["font_size"]) + if "font_name" in para_data: + run.font.name = para_data["font_name"] + + # Apply color - prefer RGB, fall back to theme_color + if "color" in para_data: + color_hex = para_data["color"].lstrip("#") + if len(color_hex) == 6: + r = int(color_hex[0:2], 16) + g = int(color_hex[2:4], 16) + b = int(color_hex[4:6], 16) + run.font.color.rgb = RGBColor(r, g, b) + elif "theme_color" in para_data: + # Get theme color by name (e.g., "DARK_1", "ACCENT_1") + theme_name = para_data["theme_color"] + try: + run.font.color.theme_color = getattr(MSO_THEME_COLOR, theme_name) + except AttributeError: + print(f" WARNING: Unknown theme color name '{theme_name}'") + + +def detect_frame_overflow(inventory: InventoryData) -> Dict[str, Dict[str, float]]: + """Detect text overflow in shapes (text exceeding shape bounds). + + Returns dict of slide_key -> shape_key -> overflow_inches. + Only includes shapes that have text overflow. + """ + overflow_map = {} + + for slide_key, shapes_dict in inventory.items(): + for shape_key, shape_data in shapes_dict.items(): + # Check for frame overflow (text exceeding shape bounds) + if shape_data.frame_overflow_bottom is not None: + if slide_key not in overflow_map: + overflow_map[slide_key] = {} + overflow_map[slide_key][shape_key] = shape_data.frame_overflow_bottom + + return overflow_map + + +def validate_replacements(inventory: InventoryData, replacements: Dict) -> List[str]: + """Validate that all shapes in replacements exist in inventory. + + Returns list of error messages. + """ + errors = [] + + for slide_key, shapes_data in replacements.items(): + if not slide_key.startswith("slide-"): + continue + + # Check if slide exists + if slide_key not in inventory: + errors.append(f"Slide '{slide_key}' not found in inventory") + continue + + # Check each shape + for shape_key in shapes_data.keys(): + if shape_key not in inventory[slide_key]: + # Find shapes without replacements defined and show their content + unused_with_content = [] + for k in inventory[slide_key].keys(): + if k not in shapes_data: + shape_data = inventory[slide_key][k] + # Get text from paragraphs as preview + paragraphs = shape_data.paragraphs + if paragraphs and paragraphs[0].text: + first_text = paragraphs[0].text[:50] + if len(paragraphs[0].text) > 50: + first_text += "..." + unused_with_content.append(f"{k} ('{first_text}')") + else: + unused_with_content.append(k) + + errors.append( + f"Shape '{shape_key}' not found on '{slide_key}'. " + f"Shapes without replacements: {', '.join(sorted(unused_with_content)) if unused_with_content else 'none'}" + ) + + return errors + + +def check_duplicate_keys(pairs): + """Check for duplicate keys when loading JSON.""" + result = {} + for key, value in pairs: + if key in result: + raise ValueError(f"Duplicate key found in JSON: '{key}'") + result[key] = value + return result + + +def apply_replacements(pptx_file: str, json_file: str, output_file: str): + """Apply text replacements from JSON to PowerPoint presentation.""" + + # Load presentation + prs = Presentation(pptx_file) + + # Get inventory of all text shapes (returns ShapeData objects) + # Pass prs to use same Presentation instance + inventory = extract_text_inventory(Path(pptx_file), prs) + + # Detect text overflow in original presentation + original_overflow = detect_frame_overflow(inventory) + + # Load replacement data with duplicate key detection + with open(json_file, "r") as f: + replacements = json.load(f, object_pairs_hook=check_duplicate_keys) + + # Validate replacements + errors = validate_replacements(inventory, replacements) + if errors: + print("ERROR: Invalid shapes in replacement JSON:") + for error in errors: + print(f" - {error}") + print("\nPlease check the inventory and update your replacement JSON.") + print( + "You can regenerate the inventory with: python inventory.py " + ) + raise ValueError(f"Found {len(errors)} validation error(s)") + + # Track statistics + shapes_processed = 0 + shapes_cleared = 0 + shapes_replaced = 0 + + # Process each slide from inventory + for slide_key, shapes_dict in inventory.items(): + if not slide_key.startswith("slide-"): + continue + + slide_index = int(slide_key.split("-")[1]) + + if slide_index >= len(prs.slides): + print(f"Warning: Slide {slide_index} not found") + continue + + # Process each shape from inventory + for shape_key, shape_data in shapes_dict.items(): + shapes_processed += 1 + + # Get the shape directly from ShapeData + shape = shape_data.shape + if not shape: + print(f"Warning: {shape_key} has no shape reference") + continue + + # ShapeData already validates text_frame in __init__ + text_frame = shape.text_frame # type: ignore + + text_frame.clear() # type: ignore + shapes_cleared += 1 + + # Check for replacement paragraphs + replacement_shape_data = replacements.get(slide_key, {}).get(shape_key, {}) + if "paragraphs" not in replacement_shape_data: + continue + + shapes_replaced += 1 + + # Add replacement paragraphs + for i, para_data in enumerate(replacement_shape_data["paragraphs"]): + if i == 0: + p = text_frame.paragraphs[0] # type: ignore + else: + p = text_frame.add_paragraph() # type: ignore + + apply_paragraph_properties(p, para_data) + + # Check for issues after replacements + # Save to a temporary file and reload to avoid modifying the presentation during inventory + # (extract_text_inventory accesses font.color which adds empty elements) + import tempfile + + with tempfile.NamedTemporaryFile(suffix=".pptx", delete=False) as tmp: + tmp_path = Path(tmp.name) + prs.save(str(tmp_path)) + + try: + updated_inventory = extract_text_inventory(tmp_path) + updated_overflow = detect_frame_overflow(updated_inventory) + finally: + tmp_path.unlink() # Clean up temp file + + # Check if any text overflow got worse + overflow_errors = [] + for slide_key, shape_overflows in updated_overflow.items(): + for shape_key, new_overflow in shape_overflows.items(): + # Get original overflow (0 if there was no overflow before) + original = original_overflow.get(slide_key, {}).get(shape_key, 0.0) + + # Error if overflow increased + if new_overflow > original + 0.01: # Small tolerance for rounding + increase = new_overflow - original + overflow_errors.append( + f'{slide_key}/{shape_key}: overflow worsened by {increase:.2f}" ' + f'(was {original:.2f}", now {new_overflow:.2f}")' + ) + + # Collect warnings from updated shapes + warnings = [] + for slide_key, shapes_dict in updated_inventory.items(): + for shape_key, shape_data in shapes_dict.items(): + if shape_data.warnings: + for warning in shape_data.warnings: + warnings.append(f"{slide_key}/{shape_key}: {warning}") + + # Fail if there are any issues + if overflow_errors or warnings: + print("\nERROR: Issues detected in replacement output:") + if overflow_errors: + print("\nText overflow worsened:") + for error in overflow_errors: + print(f" - {error}") + if warnings: + print("\nFormatting warnings:") + for warning in warnings: + print(f" - {warning}") + print("\nPlease fix these issues before saving.") + raise ValueError( + f"Found {len(overflow_errors)} overflow error(s) and {len(warnings)} warning(s)" + ) + + # Save the presentation + prs.save(output_file) + + # Report results + print(f"Saved updated presentation to: {output_file}") + print(f"Processed {len(prs.slides)} slides") + print(f" - Shapes processed: {shapes_processed}") + print(f" - Shapes cleared: {shapes_cleared}") + print(f" - Shapes replaced: {shapes_replaced}") + + +def main(): + """Main entry point for command-line usage.""" + if len(sys.argv) != 4: + print(__doc__) + sys.exit(1) + + input_pptx = Path(sys.argv[1]) + replacements_json = Path(sys.argv[2]) + output_pptx = Path(sys.argv[3]) + + if not input_pptx.exists(): + print(f"Error: Input file '{input_pptx}' not found") + sys.exit(1) + + if not replacements_json.exists(): + print(f"Error: Replacements JSON file '{replacements_json}' not found") + sys.exit(1) + + try: + apply_replacements(str(input_pptx), str(replacements_json), str(output_pptx)) + except Exception as e: + print(f"Error applying replacements: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/skills/pptx/scripts/thumbnail.py b/skills/pptx/scripts/thumbnail.py new file mode 100644 index 0000000..5c7fdf1 --- /dev/null +++ b/skills/pptx/scripts/thumbnail.py @@ -0,0 +1,450 @@ +#!/usr/bin/env python3 +""" +Create thumbnail grids from PowerPoint presentation slides. + +Creates a grid layout of slide thumbnails with configurable columns (max 6). +Each grid contains up to cols×(cols+1) images. For presentations with more +slides, multiple numbered grid files are created automatically. + +The program outputs the names of all files created. + +Output: +- Single grid: {prefix}.jpg (if slides fit in one grid) +- Multiple grids: {prefix}-1.jpg, {prefix}-2.jpg, etc. + +Grid limits by column count: +- 3 cols: max 12 slides per grid (3×4) +- 4 cols: max 20 slides per grid (4×5) +- 5 cols: max 30 slides per grid (5×6) [default] +- 6 cols: max 42 slides per grid (6×7) + +Usage: + python thumbnail.py input.pptx [output_prefix] [--cols N] [--outline-placeholders] + +Examples: + python thumbnail.py presentation.pptx + # Creates: thumbnails.jpg (using default prefix) + # Outputs: + # Created 1 grid(s): + # - thumbnails.jpg + + python thumbnail.py large-deck.pptx grid --cols 4 + # Creates: grid-1.jpg, grid-2.jpg, grid-3.jpg + # Outputs: + # Created 3 grid(s): + # - grid-1.jpg + # - grid-2.jpg + # - grid-3.jpg + + python thumbnail.py template.pptx analysis --outline-placeholders + # Creates thumbnail grids with red outlines around text placeholders +""" + +import argparse +import subprocess +import sys +import tempfile +from pathlib import Path + +from inventory import extract_text_inventory +from PIL import Image, ImageDraw, ImageFont +from pptx import Presentation + +# Constants +THUMBNAIL_WIDTH = 300 # Fixed thumbnail width in pixels +CONVERSION_DPI = 100 # DPI for PDF to image conversion +MAX_COLS = 6 # Maximum number of columns +DEFAULT_COLS = 5 # Default number of columns +JPEG_QUALITY = 95 # JPEG compression quality + +# Grid layout constants +GRID_PADDING = 20 # Padding between thumbnails +BORDER_WIDTH = 2 # Border width around thumbnails +FONT_SIZE_RATIO = 0.12 # Font size as fraction of thumbnail width +LABEL_PADDING_RATIO = 0.4 # Label padding as fraction of font size + + +def main(): + parser = argparse.ArgumentParser( + description="Create thumbnail grids from PowerPoint slides." + ) + parser.add_argument("input", help="Input PowerPoint file (.pptx)") + parser.add_argument( + "output_prefix", + nargs="?", + default="thumbnails", + help="Output prefix for image files (default: thumbnails, will create prefix.jpg or prefix-N.jpg)", + ) + parser.add_argument( + "--cols", + type=int, + default=DEFAULT_COLS, + help=f"Number of columns (default: {DEFAULT_COLS}, max: {MAX_COLS})", + ) + parser.add_argument( + "--outline-placeholders", + action="store_true", + help="Outline text placeholders with a colored border", + ) + + args = parser.parse_args() + + # Validate columns + cols = min(args.cols, MAX_COLS) + if args.cols > MAX_COLS: + print(f"Warning: Columns limited to {MAX_COLS} (requested {args.cols})") + + # Validate input + input_path = Path(args.input) + if not input_path.exists() or input_path.suffix.lower() != ".pptx": + print(f"Error: Invalid PowerPoint file: {args.input}") + sys.exit(1) + + # Construct output path (always JPG) + output_path = Path(f"{args.output_prefix}.jpg") + + print(f"Processing: {args.input}") + + try: + with tempfile.TemporaryDirectory() as temp_dir: + # Get placeholder regions if outlining is enabled + placeholder_regions = None + slide_dimensions = None + if args.outline_placeholders: + print("Extracting placeholder regions...") + placeholder_regions, slide_dimensions = get_placeholder_regions( + input_path + ) + if placeholder_regions: + print(f"Found placeholders on {len(placeholder_regions)} slides") + + # Convert slides to images + slide_images = convert_to_images(input_path, Path(temp_dir), CONVERSION_DPI) + if not slide_images: + print("Error: No slides found") + sys.exit(1) + + print(f"Found {len(slide_images)} slides") + + # Create grids (max cols×(cols+1) images per grid) + grid_files = create_grids( + slide_images, + cols, + THUMBNAIL_WIDTH, + output_path, + placeholder_regions, + slide_dimensions, + ) + + # Print saved files + print(f"Created {len(grid_files)} grid(s):") + for grid_file in grid_files: + print(f" - {grid_file}") + + except Exception as e: + print(f"Error: {e}") + sys.exit(1) + + +def create_hidden_slide_placeholder(size): + """Create placeholder image for hidden slides.""" + img = Image.new("RGB", size, color="#F0F0F0") + draw = ImageDraw.Draw(img) + line_width = max(5, min(size) // 100) + draw.line([(0, 0), size], fill="#CCCCCC", width=line_width) + draw.line([(size[0], 0), (0, size[1])], fill="#CCCCCC", width=line_width) + return img + + +def get_placeholder_regions(pptx_path): + """Extract ALL text regions from the presentation. + + Returns a tuple of (placeholder_regions, slide_dimensions). + text_regions is a dict mapping slide indices to lists of text regions. + Each region is a dict with 'left', 'top', 'width', 'height' in inches. + slide_dimensions is a tuple of (width_inches, height_inches). + """ + prs = Presentation(str(pptx_path)) + inventory = extract_text_inventory(pptx_path, prs) + placeholder_regions = {} + + # Get actual slide dimensions in inches (EMU to inches conversion) + slide_width_inches = (prs.slide_width or 9144000) / 914400.0 + slide_height_inches = (prs.slide_height or 5143500) / 914400.0 + + for slide_key, shapes in inventory.items(): + # Extract slide index from "slide-N" format + slide_idx = int(slide_key.split("-")[1]) + regions = [] + + for shape_key, shape_data in shapes.items(): + # The inventory only contains shapes with text, so all shapes should be highlighted + regions.append( + { + "left": shape_data.left, + "top": shape_data.top, + "width": shape_data.width, + "height": shape_data.height, + } + ) + + if regions: + placeholder_regions[slide_idx] = regions + + return placeholder_regions, (slide_width_inches, slide_height_inches) + + +def convert_to_images(pptx_path, temp_dir, dpi): + """Convert PowerPoint to images via PDF, handling hidden slides.""" + # Detect hidden slides + print("Analyzing presentation...") + prs = Presentation(str(pptx_path)) + total_slides = len(prs.slides) + + # Find hidden slides (1-based indexing for display) + hidden_slides = { + idx + 1 + for idx, slide in enumerate(prs.slides) + if slide.element.get("show") == "0" + } + + print(f"Total slides: {total_slides}") + if hidden_slides: + print(f"Hidden slides: {sorted(hidden_slides)}") + + pdf_path = temp_dir / f"{pptx_path.stem}.pdf" + + # Convert to PDF + print("Converting to PDF...") + result = subprocess.run( + [ + "soffice", + "--headless", + "--convert-to", + "pdf", + "--outdir", + str(temp_dir), + str(pptx_path), + ], + capture_output=True, + text=True, + ) + if result.returncode != 0 or not pdf_path.exists(): + raise RuntimeError("PDF conversion failed") + + # Convert PDF to images + print(f"Converting to images at {dpi} DPI...") + result = subprocess.run( + ["pdftoppm", "-jpeg", "-r", str(dpi), str(pdf_path), str(temp_dir / "slide")], + capture_output=True, + text=True, + ) + if result.returncode != 0: + raise RuntimeError("Image conversion failed") + + visible_images = sorted(temp_dir.glob("slide-*.jpg")) + + # Create full list with placeholders for hidden slides + all_images = [] + visible_idx = 0 + + # Get placeholder dimensions from first visible slide + if visible_images: + with Image.open(visible_images[0]) as img: + placeholder_size = img.size + else: + placeholder_size = (1920, 1080) + + for slide_num in range(1, total_slides + 1): + if slide_num in hidden_slides: + # Create placeholder image for hidden slide + placeholder_path = temp_dir / f"hidden-{slide_num:03d}.jpg" + placeholder_img = create_hidden_slide_placeholder(placeholder_size) + placeholder_img.save(placeholder_path, "JPEG") + all_images.append(placeholder_path) + else: + # Use the actual visible slide image + if visible_idx < len(visible_images): + all_images.append(visible_images[visible_idx]) + visible_idx += 1 + + return all_images + + +def create_grids( + image_paths, + cols, + width, + output_path, + placeholder_regions=None, + slide_dimensions=None, +): + """Create multiple thumbnail grids from slide images, max cols×(cols+1) images per grid.""" + # Maximum images per grid is cols × (cols + 1) for better proportions + max_images_per_grid = cols * (cols + 1) + grid_files = [] + + print( + f"Creating grids with {cols} columns (max {max_images_per_grid} images per grid)" + ) + + # Split images into chunks + for chunk_idx, start_idx in enumerate( + range(0, len(image_paths), max_images_per_grid) + ): + end_idx = min(start_idx + max_images_per_grid, len(image_paths)) + chunk_images = image_paths[start_idx:end_idx] + + # Create grid for this chunk + grid = create_grid( + chunk_images, cols, width, start_idx, placeholder_regions, slide_dimensions + ) + + # Generate output filename + if len(image_paths) <= max_images_per_grid: + # Single grid - use base filename without suffix + grid_filename = output_path + else: + # Multiple grids - insert index before extension with dash + stem = output_path.stem + suffix = output_path.suffix + grid_filename = output_path.parent / f"{stem}-{chunk_idx + 1}{suffix}" + + # Save grid + grid_filename.parent.mkdir(parents=True, exist_ok=True) + grid.save(str(grid_filename), quality=JPEG_QUALITY) + grid_files.append(str(grid_filename)) + + return grid_files + + +def create_grid( + image_paths, + cols, + width, + start_slide_num=0, + placeholder_regions=None, + slide_dimensions=None, +): + """Create thumbnail grid from slide images with optional placeholder outlining.""" + font_size = int(width * FONT_SIZE_RATIO) + label_padding = int(font_size * LABEL_PADDING_RATIO) + + # Get dimensions + with Image.open(image_paths[0]) as img: + aspect = img.height / img.width + height = int(width * aspect) + + # Calculate grid size + rows = (len(image_paths) + cols - 1) // cols + grid_w = cols * width + (cols + 1) * GRID_PADDING + grid_h = rows * (height + font_size + label_padding * 2) + (rows + 1) * GRID_PADDING + + # Create grid + grid = Image.new("RGB", (grid_w, grid_h), "white") + draw = ImageDraw.Draw(grid) + + # Load font with size based on thumbnail width + try: + # Use Pillow's default font with size + font = ImageFont.load_default(size=font_size) + except Exception: + # Fall back to basic default font if size parameter not supported + font = ImageFont.load_default() + + # Place thumbnails + for i, img_path in enumerate(image_paths): + row, col = i // cols, i % cols + x = col * width + (col + 1) * GRID_PADDING + y_base = ( + row * (height + font_size + label_padding * 2) + (row + 1) * GRID_PADDING + ) + + # Add label with actual slide number + label = f"{start_slide_num + i}" + bbox = draw.textbbox((0, 0), label, font=font) + text_w = bbox[2] - bbox[0] + draw.text( + (x + (width - text_w) // 2, y_base + label_padding), + label, + fill="black", + font=font, + ) + + # Add thumbnail below label with proportional spacing + y_thumbnail = y_base + label_padding + font_size + label_padding + + with Image.open(img_path) as img: + # Get original dimensions before thumbnail + orig_w, orig_h = img.size + + # Apply placeholder outlines if enabled + if placeholder_regions and (start_slide_num + i) in placeholder_regions: + # Convert to RGBA for transparency support + if img.mode != "RGBA": + img = img.convert("RGBA") + + # Get the regions for this slide + regions = placeholder_regions[start_slide_num + i] + + # Calculate scale factors using actual slide dimensions + if slide_dimensions: + slide_width_inches, slide_height_inches = slide_dimensions + else: + # Fallback: estimate from image size at CONVERSION_DPI + slide_width_inches = orig_w / CONVERSION_DPI + slide_height_inches = orig_h / CONVERSION_DPI + + x_scale = orig_w / slide_width_inches + y_scale = orig_h / slide_height_inches + + # Create a highlight overlay + overlay = Image.new("RGBA", img.size, (255, 255, 255, 0)) + overlay_draw = ImageDraw.Draw(overlay) + + # Highlight each placeholder region + for region in regions: + # Convert from inches to pixels in the original image + px_left = int(region["left"] * x_scale) + px_top = int(region["top"] * y_scale) + px_width = int(region["width"] * x_scale) + px_height = int(region["height"] * y_scale) + + # Draw highlight outline with red color and thick stroke + # Using a bright red outline instead of fill + stroke_width = max( + 5, min(orig_w, orig_h) // 150 + ) # Thicker proportional stroke width + overlay_draw.rectangle( + [(px_left, px_top), (px_left + px_width, px_top + px_height)], + outline=(255, 0, 0, 255), # Bright red, fully opaque + width=stroke_width, + ) + + # Composite the overlay onto the image using alpha blending + img = Image.alpha_composite(img, overlay) + # Convert back to RGB for JPEG saving + img = img.convert("RGB") + + img.thumbnail((width, height), Image.Resampling.LANCZOS) + w, h = img.size + tx = x + (width - w) // 2 + ty = y_thumbnail + (height - h) // 2 + grid.paste(img, (tx, ty)) + + # Add border + if BORDER_WIDTH > 0: + draw.rectangle( + [ + (tx - BORDER_WIDTH, ty - BORDER_WIDTH), + (tx + w + BORDER_WIDTH - 1, ty + h + BORDER_WIDTH - 1), + ], + outline="gray", + width=BORDER_WIDTH, + ) + + return grid + + +if __name__ == "__main__": + main() diff --git a/skills/xlsx/LICENSE.txt b/skills/xlsx/LICENSE.txt new file mode 100644 index 0000000..c55ab42 --- /dev/null +++ b/skills/xlsx/LICENSE.txt @@ -0,0 +1,30 @@ +© 2025 Anthropic, PBC. All rights reserved. + +LICENSE: Use of these materials (including all code, prompts, assets, files, +and other components of this Skill) is governed by your agreement with +Anthropic regarding use of Anthropic's services. If no separate agreement +exists, use is governed by Anthropic's Consumer Terms of Service or +Commercial Terms of Service, as applicable: +https://www.anthropic.com/legal/consumer-terms +https://www.anthropic.com/legal/commercial-terms +Your applicable agreement is referred to as the "Agreement." "Services" are +as defined in the Agreement. + +ADDITIONAL RESTRICTIONS: Notwithstanding anything in the Agreement to the +contrary, users may not: + +- Extract these materials from the Services or retain copies of these + materials outside the Services +- Reproduce or copy these materials, except for temporary copies created + automatically during authorized use of the Services +- Create derivative works based on these materials +- Distribute, sublicense, or transfer these materials to any third party +- Make, offer to sell, sell, or import any inventions embodied in these + materials +- Reverse engineer, decompile, or disassemble these materials + +The receipt, viewing, or possession of these materials does not convey or +imply any license or right beyond those expressly granted above. + +Anthropic retains all right, title, and interest in these materials, +including all copyrights, patents, and other intellectual property rights. diff --git a/skills/xlsx/SKILL.md b/skills/xlsx/SKILL.md new file mode 100644 index 0000000..22db189 --- /dev/null +++ b/skills/xlsx/SKILL.md @@ -0,0 +1,289 @@ +--- +name: xlsx +description: "Comprehensive spreadsheet creation, editing, and analysis with support for formulas, formatting, data analysis, and visualization. When Claude needs to work with spreadsheets (.xlsx, .xlsm, .csv, .tsv, etc) for: (1) Creating new spreadsheets with formulas and formatting, (2) Reading or analyzing data, (3) Modify existing spreadsheets while preserving formulas, (4) Data analysis and visualization in spreadsheets, or (5) Recalculating formulas" +license: Proprietary. LICENSE.txt has complete terms +--- + +# Requirements for Outputs + +## All Excel files + +### Zero Formula Errors +- Every Excel model MUST be delivered with ZERO formula errors (#REF!, #DIV/0!, #VALUE!, #N/A, #NAME?) + +### Preserve Existing Templates (when updating templates) +- Study and EXACTLY match existing format, style, and conventions when modifying files +- Never impose standardized formatting on files with established patterns +- Existing template conventions ALWAYS override these guidelines + +## Financial models + +### Color Coding Standards +Unless otherwise stated by the user or existing template + +#### Industry-Standard Color Conventions +- **Blue text (RGB: 0,0,255)**: Hardcoded inputs, and numbers users will change for scenarios +- **Black text (RGB: 0,0,0)**: ALL formulas and calculations +- **Green text (RGB: 0,128,0)**: Links pulling from other worksheets within same workbook +- **Red text (RGB: 255,0,0)**: External links to other files +- **Yellow background (RGB: 255,255,0)**: Key assumptions needing attention or cells that need to be updated + +### Number Formatting Standards + +#### Required Format Rules +- **Years**: Format as text strings (e.g., "2024" not "2,024") +- **Currency**: Use $#,##0 format; ALWAYS specify units in headers ("Revenue ($mm)") +- **Zeros**: Use number formatting to make all zeros "-", including percentages (e.g., "$#,##0;($#,##0);-") +- **Percentages**: Default to 0.0% format (one decimal) +- **Multiples**: Format as 0.0x for valuation multiples (EV/EBITDA, P/E) +- **Negative numbers**: Use parentheses (123) not minus -123 + +### Formula Construction Rules + +#### Assumptions Placement +- Place ALL assumptions (growth rates, margins, multiples, etc.) in separate assumption cells +- Use cell references instead of hardcoded values in formulas +- Example: Use =B5*(1+$B$6) instead of =B5*1.05 + +#### Formula Error Prevention +- Verify all cell references are correct +- Check for off-by-one errors in ranges +- Ensure consistent formulas across all projection periods +- Test with edge cases (zero values, negative numbers) +- Verify no unintended circular references + +#### Documentation Requirements for Hardcodes +- Comment or in cells beside (if end of table). Format: "Source: [System/Document], [Date], [Specific Reference], [URL if applicable]" +- Examples: + - "Source: Company 10-K, FY2024, Page 45, Revenue Note, [SEC EDGAR URL]" + - "Source: Company 10-Q, Q2 2025, Exhibit 99.1, [SEC EDGAR URL]" + - "Source: Bloomberg Terminal, 8/15/2025, AAPL US Equity" + - "Source: FactSet, 8/20/2025, Consensus Estimates Screen" + +# XLSX creation, editing, and analysis + +## Overview + +A user may ask you to create, edit, or analyze the contents of an .xlsx file. You have different tools and workflows available for different tasks. + +## Important Requirements + +**LibreOffice Required for Formula Recalculation**: You can assume LibreOffice is installed for recalculating formula values using the `recalc.py` script. The script automatically configures LibreOffice on first run + +## Reading and analyzing data + +### Data analysis with pandas +For data analysis, visualization, and basic operations, use **pandas** which provides powerful data manipulation capabilities: + +```python +import pandas as pd + +# Read Excel +df = pd.read_excel('file.xlsx') # Default: first sheet +all_sheets = pd.read_excel('file.xlsx', sheet_name=None) # All sheets as dict + +# Analyze +df.head() # Preview data +df.info() # Column info +df.describe() # Statistics + +# Write Excel +df.to_excel('output.xlsx', index=False) +``` + +## Excel File Workflows + +## CRITICAL: Use Formulas, Not Hardcoded Values + +**Always use Excel formulas instead of calculating values in Python and hardcoding them.** This ensures the spreadsheet remains dynamic and updateable. + +### ❌ WRONG - Hardcoding Calculated Values +```python +# Bad: Calculating in Python and hardcoding result +total = df['Sales'].sum() +sheet['B10'] = total # Hardcodes 5000 + +# Bad: Computing growth rate in Python +growth = (df.iloc[-1]['Revenue'] - df.iloc[0]['Revenue']) / df.iloc[0]['Revenue'] +sheet['C5'] = growth # Hardcodes 0.15 + +# Bad: Python calculation for average +avg = sum(values) / len(values) +sheet['D20'] = avg # Hardcodes 42.5 +``` + +### ✅ CORRECT - Using Excel Formulas +```python +# Good: Let Excel calculate the sum +sheet['B10'] = '=SUM(B2:B9)' + +# Good: Growth rate as Excel formula +sheet['C5'] = '=(C4-C2)/C2' + +# Good: Average using Excel function +sheet['D20'] = '=AVERAGE(D2:D19)' +``` + +This applies to ALL calculations - totals, percentages, ratios, differences, etc. The spreadsheet should be able to recalculate when source data changes. + +## Common Workflow +1. **Choose tool**: pandas for data, openpyxl for formulas/formatting +2. **Create/Load**: Create new workbook or load existing file +3. **Modify**: Add/edit data, formulas, and formatting +4. **Save**: Write to file +5. **Recalculate formulas (MANDATORY IF USING FORMULAS)**: Use the recalc.py script + ```bash + python recalc.py output.xlsx + ``` +6. **Verify and fix any errors**: + - The script returns JSON with error details + - If `status` is `errors_found`, check `error_summary` for specific error types and locations + - Fix the identified errors and recalculate again + - Common errors to fix: + - `#REF!`: Invalid cell references + - `#DIV/0!`: Division by zero + - `#VALUE!`: Wrong data type in formula + - `#NAME?`: Unrecognized formula name + +### Creating new Excel files + +```python +# Using openpyxl for formulas and formatting +from openpyxl import Workbook +from openpyxl.styles import Font, PatternFill, Alignment + +wb = Workbook() +sheet = wb.active + +# Add data +sheet['A1'] = 'Hello' +sheet['B1'] = 'World' +sheet.append(['Row', 'of', 'data']) + +# Add formula +sheet['B2'] = '=SUM(A1:A10)' + +# Formatting +sheet['A1'].font = Font(bold=True, color='FF0000') +sheet['A1'].fill = PatternFill('solid', start_color='FFFF00') +sheet['A1'].alignment = Alignment(horizontal='center') + +# Column width +sheet.column_dimensions['A'].width = 20 + +wb.save('output.xlsx') +``` + +### Editing existing Excel files + +```python +# Using openpyxl to preserve formulas and formatting +from openpyxl import load_workbook + +# Load existing file +wb = load_workbook('existing.xlsx') +sheet = wb.active # or wb['SheetName'] for specific sheet + +# Working with multiple sheets +for sheet_name in wb.sheetnames: + sheet = wb[sheet_name] + print(f"Sheet: {sheet_name}") + +# Modify cells +sheet['A1'] = 'New Value' +sheet.insert_rows(2) # Insert row at position 2 +sheet.delete_cols(3) # Delete column 3 + +# Add new sheet +new_sheet = wb.create_sheet('NewSheet') +new_sheet['A1'] = 'Data' + +wb.save('modified.xlsx') +``` + +## Recalculating formulas + +Excel files created or modified by openpyxl contain formulas as strings but not calculated values. Use the provided `recalc.py` script to recalculate formulas: + +```bash +python recalc.py [timeout_seconds] +``` + +Example: +```bash +python recalc.py output.xlsx 30 +``` + +The script: +- Automatically sets up LibreOffice macro on first run +- Recalculates all formulas in all sheets +- Scans ALL cells for Excel errors (#REF!, #DIV/0!, etc.) +- Returns JSON with detailed error locations and counts +- Works on both Linux and macOS + +## Formula Verification Checklist + +Quick checks to ensure formulas work correctly: + +### Essential Verification +- [ ] **Test 2-3 sample references**: Verify they pull correct values before building full model +- [ ] **Column mapping**: Confirm Excel columns match (e.g., column 64 = BL, not BK) +- [ ] **Row offset**: Remember Excel rows are 1-indexed (DataFrame row 5 = Excel row 6) + +### Common Pitfalls +- [ ] **NaN handling**: Check for null values with `pd.notna()` +- [ ] **Far-right columns**: FY data often in columns 50+ +- [ ] **Multiple matches**: Search all occurrences, not just first +- [ ] **Division by zero**: Check denominators before using `/` in formulas (#DIV/0!) +- [ ] **Wrong references**: Verify all cell references point to intended cells (#REF!) +- [ ] **Cross-sheet references**: Use correct format (Sheet1!A1) for linking sheets + +### Formula Testing Strategy +- [ ] **Start small**: Test formulas on 2-3 cells before applying broadly +- [ ] **Verify dependencies**: Check all cells referenced in formulas exist +- [ ] **Test edge cases**: Include zero, negative, and very large values + +### Interpreting recalc.py Output +The script returns JSON with error details: +```json +{ + "status": "success", // or "errors_found" + "total_errors": 0, // Total error count + "total_formulas": 42, // Number of formulas in file + "error_summary": { // Only present if errors found + "#REF!": { + "count": 2, + "locations": ["Sheet1!B5", "Sheet1!C10"] + } + } +} +``` + +## Best Practices + +### Library Selection +- **pandas**: Best for data analysis, bulk operations, and simple data export +- **openpyxl**: Best for complex formatting, formulas, and Excel-specific features + +### Working with openpyxl +- Cell indices are 1-based (row=1, column=1 refers to cell A1) +- Use `data_only=True` to read calculated values: `load_workbook('file.xlsx', data_only=True)` +- **Warning**: If opened with `data_only=True` and saved, formulas are replaced with values and permanently lost +- For large files: Use `read_only=True` for reading or `write_only=True` for writing +- Formulas are preserved but not evaluated - use recalc.py to update values + +### Working with pandas +- Specify data types to avoid inference issues: `pd.read_excel('file.xlsx', dtype={'id': str})` +- For large files, read specific columns: `pd.read_excel('file.xlsx', usecols=['A', 'C', 'E'])` +- Handle dates properly: `pd.read_excel('file.xlsx', parse_dates=['date_column'])` + +## Code Style Guidelines +**IMPORTANT**: When generating Python code for Excel operations: +- Write minimal, concise Python code without unnecessary comments +- Avoid verbose variable names and redundant operations +- Avoid unnecessary print statements + +**For Excel files themselves**: +- Add comments to cells with complex formulas or important assumptions +- Document data sources for hardcoded values +- Include notes for key calculations and model sections \ No newline at end of file diff --git a/skills/xlsx/recalc.py b/skills/xlsx/recalc.py new file mode 100644 index 0000000..102e157 --- /dev/null +++ b/skills/xlsx/recalc.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +""" +Excel Formula Recalculation Script +Recalculates all formulas in an Excel file using LibreOffice +""" + +import json +import sys +import subprocess +import os +import platform +from pathlib import Path +from openpyxl import load_workbook + + +def setup_libreoffice_macro(): + """Setup LibreOffice macro for recalculation if not already configured""" + if platform.system() == 'Darwin': + macro_dir = os.path.expanduser('~/Library/Application Support/LibreOffice/4/user/basic/Standard') + else: + macro_dir = os.path.expanduser('~/.config/libreoffice/4/user/basic/Standard') + + macro_file = os.path.join(macro_dir, 'Module1.xba') + + if os.path.exists(macro_file): + with open(macro_file, 'r') as f: + if 'RecalculateAndSave' in f.read(): + return True + + if not os.path.exists(macro_dir): + subprocess.run(['soffice', '--headless', '--terminate_after_init'], + capture_output=True, timeout=10) + os.makedirs(macro_dir, exist_ok=True) + + macro_content = ''' + + + Sub RecalculateAndSave() + ThisComponent.calculateAll() + ThisComponent.store() + ThisComponent.close(True) + End Sub +''' + + try: + with open(macro_file, 'w') as f: + f.write(macro_content) + return True + except Exception: + return False + + +def recalc(filename, timeout=30): + """ + Recalculate formulas in Excel file and report any errors + + Args: + filename: Path to Excel file + timeout: Maximum time to wait for recalculation (seconds) + + Returns: + dict with error locations and counts + """ + if not Path(filename).exists(): + return {'error': f'File {filename} does not exist'} + + abs_path = str(Path(filename).absolute()) + + if not setup_libreoffice_macro(): + return {'error': 'Failed to setup LibreOffice macro'} + + cmd = [ + 'soffice', '--headless', '--norestore', + 'vnd.sun.star.script:Standard.Module1.RecalculateAndSave?language=Basic&location=application', + abs_path + ] + + # Handle timeout command differences between Linux and macOS + if platform.system() != 'Windows': + timeout_cmd = 'timeout' if platform.system() == 'Linux' else None + if platform.system() == 'Darwin': + # Check if gtimeout is available on macOS + try: + subprocess.run(['gtimeout', '--version'], capture_output=True, timeout=1, check=False) + timeout_cmd = 'gtimeout' + except (FileNotFoundError, subprocess.TimeoutExpired): + pass + + if timeout_cmd: + cmd = [timeout_cmd, str(timeout)] + cmd + + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode != 0 and result.returncode != 124: # 124 is timeout exit code + error_msg = result.stderr or 'Unknown error during recalculation' + if 'Module1' in error_msg or 'RecalculateAndSave' not in error_msg: + return {'error': 'LibreOffice macro not configured properly'} + else: + return {'error': error_msg} + + # Check for Excel errors in the recalculated file - scan ALL cells + try: + wb = load_workbook(filename, data_only=True) + + excel_errors = ['#VALUE!', '#DIV/0!', '#REF!', '#NAME?', '#NULL!', '#NUM!', '#N/A'] + error_details = {err: [] for err in excel_errors} + total_errors = 0 + + for sheet_name in wb.sheetnames: + ws = wb[sheet_name] + # Check ALL rows and columns - no limits + for row in ws.iter_rows(): + for cell in row: + if cell.value is not None and isinstance(cell.value, str): + for err in excel_errors: + if err in cell.value: + location = f"{sheet_name}!{cell.coordinate}" + error_details[err].append(location) + total_errors += 1 + break + + wb.close() + + # Build result summary + result = { + 'status': 'success' if total_errors == 0 else 'errors_found', + 'total_errors': total_errors, + 'error_summary': {} + } + + # Add non-empty error categories + for err_type, locations in error_details.items(): + if locations: + result['error_summary'][err_type] = { + 'count': len(locations), + 'locations': locations[:20] # Show up to 20 locations + } + + # Add formula count for context - also check ALL cells + wb_formulas = load_workbook(filename, data_only=False) + formula_count = 0 + for sheet_name in wb_formulas.sheetnames: + ws = wb_formulas[sheet_name] + for row in ws.iter_rows(): + for cell in row: + if cell.value and isinstance(cell.value, str) and cell.value.startswith('='): + formula_count += 1 + wb_formulas.close() + + result['total_formulas'] = formula_count + + return result + + except Exception as e: + return {'error': str(e)} + + +def main(): + if len(sys.argv) < 2: + print("Usage: python recalc.py [timeout_seconds]") + print("\nRecalculates all formulas in an Excel file using LibreOffice") + print("\nReturns JSON with error details:") + print(" - status: 'success' or 'errors_found'") + print(" - total_errors: Total number of Excel errors found") + print(" - total_formulas: Number of formulas in the file") + print(" - error_summary: Breakdown by error type with locations") + print(" - #VALUE!, #DIV/0!, #REF!, #NAME?, #NULL!, #NUM!, #N/A") + sys.exit(1) + + filename = sys.argv[1] + timeout = int(sys.argv[2]) if len(sys.argv) > 2 else 30 + + result = recalc(filename, timeout) + print(json.dumps(result, indent=2)) + + +if __name__ == '__main__': + main() \ No newline at end of file