Initial commit
This commit is contained in:
1
skills/document-skills/docx/scripts/__init__.py
Executable file
1
skills/document-skills/docx/scripts/__init__.py
Executable file
@@ -0,0 +1 @@
|
||||
# Make scripts directory a package for relative imports in tests
|
||||
1276
skills/document-skills/docx/scripts/document.py
Executable file
1276
skills/document-skills/docx/scripts/document.py
Executable file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,3 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:comments xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14">
|
||||
</w:comments>
|
||||
@@ -0,0 +1,3 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w15:commentsEx xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14">
|
||||
</w15:commentsEx>
|
||||
@@ -0,0 +1,3 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w16cex:commentsExtensible xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" xmlns:cr="http://schemas.microsoft.com/office/comments/2020/reactions" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl cr w16du wp14">
|
||||
</w16cex:commentsExtensible>
|
||||
@@ -0,0 +1,3 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w16cid:commentsIds xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14">
|
||||
</w16cid:commentsIds>
|
||||
3
skills/document-skills/docx/scripts/templates/people.xml
Normal file
3
skills/document-skills/docx/scripts/templates/people.xml
Normal file
@@ -0,0 +1,3 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w15:people xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml">
|
||||
</w15:people>
|
||||
374
skills/document-skills/docx/scripts/utilities.py
Executable file
374
skills/document-skills/docx/scripts/utilities.py
Executable file
@@ -0,0 +1,374 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Utilities for editing OOXML documents.
|
||||
|
||||
This module provides XMLEditor, a tool for manipulating XML files with support for
|
||||
line-number-based node finding and DOM manipulation. Each element is automatically
|
||||
annotated with its original line and column position during parsing.
|
||||
|
||||
Example usage:
|
||||
editor = XMLEditor("document.xml")
|
||||
|
||||
# Find node by line number or range
|
||||
elem = editor.get_node(tag="w:r", line_number=519)
|
||||
elem = editor.get_node(tag="w:p", line_number=range(100, 200))
|
||||
|
||||
# Find node by text content
|
||||
elem = editor.get_node(tag="w:p", contains="specific text")
|
||||
|
||||
# Find node by attributes
|
||||
elem = editor.get_node(tag="w:r", attrs={"w:id": "target"})
|
||||
|
||||
# Combine filters
|
||||
elem = editor.get_node(tag="w:p", line_number=range(1, 50), contains="text")
|
||||
|
||||
# Replace, insert, or manipulate
|
||||
new_nodes = editor.replace_node(elem, "<w:r><w:t>new text</w:t></w:r>")
|
||||
editor.insert_after(new_nodes[-1], "<w:r><w:t>more</w:t></w:r>")
|
||||
|
||||
# Save changes
|
||||
editor.save()
|
||||
"""
|
||||
|
||||
import html
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
|
||||
import defusedxml.minidom
|
||||
import defusedxml.sax
|
||||
|
||||
|
||||
class XMLEditor:
    """
    Editor for manipulating OOXML XML files with line-number-based node finding.

    This class parses XML files and tracks the original line and column position
    of each element. This enables finding nodes by their line number in the original
    file, which is useful when working with Read tool output.

    Attributes:
        xml_path: Path to the XML file being edited
        encoding: Detected encoding of the XML file ('ascii' or 'utf-8')
        dom: Parsed DOM tree with parse_position attributes on elements
    """

    def __init__(self, xml_path):
        """
        Initialize with path to XML file and parse with line number tracking.

        Args:
            xml_path: Path to XML file to edit (str or Path)

        Raises:
            ValueError: If the XML file does not exist
        """
        self.xml_path = Path(xml_path)
        if not self.xml_path.exists():
            raise ValueError(f"XML file not found: {xml_path}")

        # Sniff the first 200 bytes for an explicit encoding="ascii" declaration;
        # anything else is written back as UTF-8 by save().
        with open(self.xml_path, "rb") as f:
            header = f.read(200).decode("utf-8", errors="ignore")
        self.encoding = "ascii" if 'encoding="ascii"' in header else "utf-8"

        parser = _create_line_tracking_parser()
        self.dom = defusedxml.minidom.parse(str(self.xml_path), parser)

    def get_node(
        self,
        tag: str,
        attrs: Optional[dict[str, str]] = None,
        line_number: Optional[Union[int, range]] = None,
        contains: Optional[str] = None,
    ):
        """
        Get a DOM element by tag and optional filters.

        Finds an element with the given tag that satisfies every supplied filter
        (line number in the original file, attribute values, contained text).
        Exactly one match must be found.

        Args:
            tag: The XML tag name (e.g., "w:del", "w:ins", "w:r")
            attrs: Dictionary of attribute name-value pairs to match (e.g., {"w:id": "1"})
            line_number: Line number (int) or line range (range) in original XML file (1-indexed)
            contains: Text string that must appear in the concatenated text of the element.
                Supports both entity notation (&#8220;) and the literal Unicode
                character (U+201C); entities are decoded before comparison.

        Returns:
            defusedxml.minidom.Element: The matching DOM element

        Raises:
            ValueError: If node not found or multiple matches found

        Example:
            elem = editor.get_node(tag="w:r", line_number=519)
            elem = editor.get_node(tag="w:r", line_number=range(100, 200))
            elem = editor.get_node(tag="w:del", attrs={"w:id": "1"})
            elem = editor.get_node(tag="w:p", attrs={"w14:paraId": "12345678"})
            elem = editor.get_node(tag="w:commentRangeStart", attrs={"w:id": "0"})
            elem = editor.get_node(tag="w:p", contains="specific text")
            elem = editor.get_node(tag="w:t", contains="&#8220;Agreement")  # Entity notation
            elem = editor.get_node(tag="w:t", contains="“Agreement")  # Unicode character
        """
        # Normalize the search string once: convert HTML entities to Unicode
        # characters so "&#8220;Rowan" and "“Rowan" find the same text.
        normalized_contains = html.unescape(contains) if contains is not None else None

        matches = []
        for elem in self.dom.getElementsByTagName(tag):
            # Check line_number filter
            if line_number is not None:
                # Elements created after parsing carry no parse_position.
                elem_line = getattr(elem, "parse_position", (None,))[0]
                if isinstance(line_number, range):
                    if elem_line not in line_number:
                        continue
                elif elem_line != line_number:
                    continue

            # Check attrs filter
            if attrs is not None and not all(
                elem.getAttribute(attr_name) == attr_value
                for attr_name, attr_value in attrs.items()
            ):
                continue

            # Check contains filter
            if (
                normalized_contains is not None
                and normalized_contains not in self._get_element_text(elem)
            ):
                continue

            # All applicable filters passed
            matches.append(elem)

        if not matches:
            # Build descriptive error message
            filters = []
            if line_number is not None:
                line_str = (
                    f"lines {line_number.start}-{line_number.stop - 1}"
                    if isinstance(line_number, range)
                    else f"line {line_number}"
                )
                filters.append(f"at {line_str}")
            if attrs is not None:
                filters.append(f"with attributes {attrs}")
            if contains is not None:
                filters.append(f"containing '{contains}'")

            filter_desc = " ".join(filters) if filters else ""
            base_msg = f"Node not found: <{tag}> {filter_desc}".strip()

            # Pick the hint with the same "is not None" tests used above, so the
            # hint always agrees with the filters named in the message (falsy
            # values such as line_number=0 or contains="" are still filters).
            if contains is not None:
                hint = "Text may be split across elements or use different wording."
            elif line_number is not None:
                hint = "Line numbers may have changed if document was modified."
            elif attrs is not None:
                hint = "Verify attribute values are correct."
            else:
                hint = "Try adding filters (attrs, line_number, or contains)."

            raise ValueError(f"{base_msg}. {hint}")

        if len(matches) > 1:
            raise ValueError(
                f"Multiple nodes found: <{tag}>. "
                "Add more filters (attrs, line_number, or contains) to narrow the search."
            )
        return matches[0]

    def _get_element_text(self, elem):
        """
        Recursively extract all text content from an element.

        Skips text nodes that contain only whitespace (spaces, tabs, newlines),
        which typically represent XML formatting rather than document content.

        Args:
            elem: defusedxml.minidom.Element to extract text from

        Returns:
            str: Concatenated text from all non-whitespace text nodes within the element
        """
        text_parts = []
        for node in elem.childNodes:
            if node.nodeType == node.TEXT_NODE:
                # Skip whitespace-only text nodes (XML formatting); kept nodes
                # are appended unstripped so inner spacing is preserved.
                if node.data.strip():
                    text_parts.append(node.data)
            elif node.nodeType == node.ELEMENT_NODE:
                text_parts.append(self._get_element_text(node))
        return "".join(text_parts)

    def replace_node(self, elem, new_content):
        """
        Replace a DOM element with new XML content.

        Args:
            elem: defusedxml.minidom.Element to replace
            new_content: String containing XML to replace the node with

        Returns:
            List[defusedxml.minidom.Node]: All inserted nodes

        Example:
            new_nodes = editor.replace_node(old_elem, "<w:r><w:t>text</w:t></w:r>")
        """
        parent = elem.parentNode
        nodes = self._parse_fragment(new_content)
        # Insert the replacements in order ahead of the old element, then drop it.
        for node in nodes:
            parent.insertBefore(node, elem)
        parent.removeChild(elem)
        return nodes

    def insert_after(self, elem, xml_content):
        """
        Insert XML content after a DOM element.

        Args:
            elem: defusedxml.minidom.Element to insert after
            xml_content: String containing XML to insert

        Returns:
            List[defusedxml.minidom.Node]: All inserted nodes

        Example:
            new_nodes = editor.insert_after(elem, "<w:r><w:t>text</w:t></w:r>")
        """
        parent = elem.parentNode
        # Capture the sibling before inserting so every new node lands between
        # elem and its original successor, in fragment order.
        next_sibling = elem.nextSibling
        nodes = self._parse_fragment(xml_content)
        for node in nodes:
            if next_sibling is not None:
                parent.insertBefore(node, next_sibling)
            else:
                parent.appendChild(node)
        return nodes

    def insert_before(self, elem, xml_content):
        """
        Insert XML content before a DOM element.

        Args:
            elem: defusedxml.minidom.Element to insert before
            xml_content: String containing XML to insert

        Returns:
            List[defusedxml.minidom.Node]: All inserted nodes

        Example:
            new_nodes = editor.insert_before(elem, "<w:r><w:t>text</w:t></w:r>")
        """
        parent = elem.parentNode
        nodes = self._parse_fragment(xml_content)
        for node in nodes:
            parent.insertBefore(node, elem)
        return nodes

    def append_to(self, elem, xml_content):
        """
        Append XML content as a child of a DOM element.

        Args:
            elem: defusedxml.minidom.Element to append to
            xml_content: String containing XML to append

        Returns:
            List[defusedxml.minidom.Node]: All inserted nodes

        Example:
            new_nodes = editor.append_to(elem, "<w:r><w:t>text</w:t></w:r>")
        """
        nodes = self._parse_fragment(xml_content)
        for node in nodes:
            elem.appendChild(node)
        return nodes

    def get_next_rid(self):
        """
        Get the next available rId for relationships files.

        Scans every <Relationship> element's Id attribute, considers those of the
        form "rId<integer>", and returns "rId" followed by the largest integer
        plus one ("rId1" when none are found).

        Returns:
            str: The next unused relationship id
        """
        max_id = 0
        for rel_elem in self.dom.getElementsByTagName("Relationship"):
            rel_id = rel_elem.getAttribute("Id")
            if rel_id.startswith("rId"):
                try:
                    max_id = max(max_id, int(rel_id[3:]))
                except ValueError:
                    # Non-numeric suffix (e.g. "rIdFoo"): cannot collide with
                    # a generated numeric id, so it is deliberately ignored.
                    pass
        return f"rId{max_id + 1}"

    def save(self):
        """
        Save the edited XML back to the file.

        Serializes the DOM tree and writes it back to the original file path,
        preserving the original encoding (ascii or utf-8).
        """
        content = self.dom.toxml(encoding=self.encoding)
        self.xml_path.write_bytes(content)

    def _parse_fragment(self, xml_content):
        """
        Parse XML fragment and return list of imported nodes.

        The fragment is wrapped in a temporary <root> element that repeats the
        namespace declarations of this document's root element, so prefixed
        tags in the fragment (e.g. "w:r") resolve during parsing.

        Args:
            xml_content: String containing XML fragment

        Returns:
            List of defusedxml.minidom.Node objects imported into this document

        Raises:
            AssertionError: If fragment contains no element nodes
        """
        # Extract namespace declarations from the root document element
        root_elem = self.dom.documentElement
        namespaces = []
        if root_elem is not None and root_elem.attributes:
            for name, value in root_elem.attributes.items():
                if name.startswith("xmlns"):
                    # The DOM holds the decoded value; re-escape it so a URI
                    # containing & or quotes cannot break the wrapper markup.
                    namespaces.append(f'{name}="{html.escape(value, quote=True)}"')

        ns_decl = " ".join(namespaces)
        wrapper = f"<root {ns_decl}>{xml_content}</root>"
        fragment_doc = defusedxml.minidom.parseString(wrapper)
        nodes = [
            self.dom.importNode(child, deep=True)
            for child in fragment_doc.documentElement.childNodes  # type: ignore
        ]
        # Raised explicitly (rather than via an assert statement) so the check
        # still runs when Python is started with -O.
        if not any(n.nodeType == n.ELEMENT_NODE for n in nodes):
            raise AssertionError("Fragment must contain at least one element")
        return nodes
|
||||
|
||||
|
||||
def _create_line_tracking_parser():
    """
    Build a SAX parser that records where each element started in the source.

    The parser's setContentHandler is replaced with a wrapper. When a content
    handler is installed, the wrapper first swaps that handler's startElementNS
    for a version that calls the original and then stamps the newest element on
    the handler's elementStack with a parse_position attribute: a
    (line, column) tuple read from the underlying expat parser.

    Returns:
        defusedxml.sax.xmlreader.XMLReader: Configured SAX parser
    """
    sax_parser = defusedxml.sax.make_parser()
    install_handler = sax_parser.setContentHandler

    def tracking_set_content_handler(dom_handler):
        wrapped_start = dom_handler.startElementNS

        def start_element_with_position(name, tagName, attrs):
            # Let the DOM builder create the element first; it is then the
            # last entry on the handler's element stack.
            wrapped_start(name, tagName, attrs)
            expat = sax_parser._parser  # type: ignore
            dom_handler.elementStack[-1].parse_position = (
                expat.CurrentLineNumber,
                expat.CurrentColumnNumber,
            )

        dom_handler.startElementNS = start_element_with_position
        install_handler(dom_handler)

    sax_parser.setContentHandler = tracking_set_content_handler  # type: ignore
    return sax_parser
|
||||
Reference in New Issue
Block a user