Files
2025-11-30 08:30:10 +08:00

1277 lines
49 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Library for working with Word documents: comments, tracked changes, and editing.
Usage:
from skills.docx.scripts.document import Document
# Initialize
doc = Document('workspace/unpacked')
doc = Document('workspace/unpacked', author="John Doe", initials="JD")
# Find nodes
node = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "1"})
node = doc["word/document.xml"].get_node(tag="w:p", line_number=10)
# Add comments
doc.add_comment(start=node, end=node, text="Comment text")
doc.reply_to_comment(parent_comment_id=0, text="Reply text")
# Suggest tracked changes
doc["word/document.xml"].suggest_deletion(node) # Delete content
doc["word/document.xml"].revert_insertion(ins_node) # Reject insertion
doc["word/document.xml"].revert_deletion(del_node) # Reject deletion
# Save
doc.save()
"""
import html
import random
import shutil
import tempfile
from datetime import datetime, timezone
from pathlib import Path
from defusedxml import minidom
from ooxml.scripts.pack import pack_document
from ooxml.scripts.validation.docx import DOCXSchemaValidator
from ooxml.scripts.validation.redlining import RedliningValidator
from .utilities import XMLEditor
# Directory of bundled XML templates, located next to this module.
# Parts a document lacks (e.g. people.xml, comments.xml) are copied from here.
TEMPLATE_DIR = Path(__file__).parent.joinpath("templates")
class DocxXMLEditor(XMLEditor):
"""XMLEditor that automatically applies RSID, author, and date to new elements.
Automatically adds attributes to elements that support them when inserting new content:
- w:rsidR, w:rsidRDefault, w:rsidP (for w:p and w:r elements)
- w:author and w:date (for w:ins, w:del, w:comment elements)
- w:id (for w:ins and w:del elements)
Attributes:
dom (defusedxml.minidom.Document): The DOM document for direct manipulation
"""
def __init__(
    self, xml_path, rsid: str, author: str = "Claude", initials: str = "C"
):
    """Bind the editor to one XML part and remember the revision identity.

    Args:
        xml_path: Path to the XML file to edit; forwarded to XMLEditor.
        rsid: RSID stamped onto elements created through this editor.
        author: Author name written on tracked changes and comments
            (default: "Claude").
        initials: Author initials written on comments (default: "C").
    """
    super().__init__(xml_path)
    self.rsid, self.author, self.initials = rsid, author, initials
def _get_next_change_id(self):
    """Return one more than the largest numeric w:id found on w:ins / w:del.

    The whole DOM is scanned. Elements whose w:id is missing or not an
    integer are ignored; when no numeric ID exists the result is 0.
    """
    highest = -1
    for tag in ("w:ins", "w:del"):
        for change in self.dom.getElementsByTagName(tag):
            raw_id = change.getAttribute("w:id")
            if not raw_id:
                continue
            try:
                value = int(raw_id)
            except ValueError:
                # Non-numeric IDs cannot collide with the integer we hand out.
                continue
            if value > highest:
                highest = value
    return highest + 1
def _ensure_w16du_namespace(self):
    """Declare the w16du prefix on the root element unless it is already declared."""
    namespace_uri = "http://schemas.microsoft.com/office/word/2023/wordml/word16du"
    document_root = self.dom.documentElement
    if document_root.hasAttribute("xmlns:w16du"):  # type: ignore
        return
    document_root.setAttribute("xmlns:w16du", namespace_uri)  # type: ignore
def _ensure_w16cex_namespace(self):
    """Declare the w16cex prefix on the root element unless it is already declared."""
    namespace_uri = "http://schemas.microsoft.com/office/word/2018/wordml/cex"
    document_root = self.dom.documentElement
    if document_root.hasAttribute("xmlns:w16cex"):  # type: ignore
        return
    document_root.setAttribute("xmlns:w16cex", namespace_uri)  # type: ignore
def _ensure_w14_namespace(self):
    """Declare the w14 prefix on the root element unless it is already declared."""
    namespace_uri = "http://schemas.microsoft.com/office/word/2010/wordml"
    document_root = self.dom.documentElement
    if document_root.hasAttribute("xmlns:w14"):  # type: ignore
        return
    document_root.setAttribute("xmlns:w14", namespace_uri)  # type: ignore
def _inject_attributes_to_nodes(self, nodes):
    """Stamp revision metadata onto nodes and all of their descendants.

    Existing attributes are never overwritten; only missing ones are added:
    - w:p: w:rsidR, w:rsidRDefault, w:rsidP, w14:paraId, w14:textId
    - w:r: w:rsidDel when an ancestor is w:del, otherwise w:rsidR
    - w:t: xml:space="preserve" when its first text node starts or ends
      with whitespace
    - w:ins, w:del: w:id, w:author, w:date, w16du:dateUtc
    - w:comment: w:author, w:date, w:initials
    - w16cex:commentExtensible: w16cex:dateUtc

    All dates written by one call share a single UTC timestamp.

    Args:
        nodes: Iterable of DOM nodes; non-element nodes are skipped.
    """
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    def within_deletion(elem):
        """Report whether any ancestor of elem is a w:del element."""
        ancestor = elem.parentNode
        while ancestor is not None:
            if (
                ancestor.nodeType == ancestor.ELEMENT_NODE
                and ancestor.tagName == "w:del"
            ):
                return True
            ancestor = ancestor.parentNode
        return False

    def stamp_paragraph(para):
        for rsid_attr in ("w:rsidR", "w:rsidRDefault", "w:rsidP"):
            if not para.hasAttribute(rsid_attr):
                para.setAttribute(rsid_attr, self.rsid)
        # IDs are generated lazily so no random value is drawn when the
        # paragraph already carries one.
        for id_attr in ("w14:paraId", "w14:textId"):
            if not para.hasAttribute(id_attr):
                self._ensure_w14_namespace()
                para.setAttribute(id_attr, _generate_hex_id())

    def stamp_run(run):
        rsid_attr = "w:rsidDel" if within_deletion(run) else "w:rsidR"
        if not run.hasAttribute(rsid_attr):
            run.setAttribute(rsid_attr, self.rsid)

    def stamp_tracked_change(change):
        if not change.hasAttribute("w:id"):
            change.setAttribute("w:id", str(self._get_next_change_id()))
        if not change.hasAttribute("w:author"):
            change.setAttribute("w:author", self.author)
        if not change.hasAttribute("w:date"):
            change.setAttribute("w:date", stamp)
        # Same value as w:date because the timestamp is already UTC.
        if not change.hasAttribute("w16du:dateUtc"):
            self._ensure_w16du_namespace()
            change.setAttribute("w16du:dateUtc", stamp)

    def stamp_comment(comment):
        defaults = (
            ("w:author", self.author),
            ("w:date", stamp),
            ("w:initials", self.initials),
        )
        for attr_name, attr_value in defaults:
            if not comment.hasAttribute(attr_name):
                comment.setAttribute(attr_name, attr_value)

    def stamp_comment_extensible(extensible):
        if not extensible.hasAttribute("w16cex:dateUtc"):
            self._ensure_w16cex_namespace()
            extensible.setAttribute("w16cex:dateUtc", stamp)

    def preserve_edge_whitespace(text_elem):
        first = text_elem.firstChild
        if first is None or first.nodeType != first.TEXT_NODE:
            return
        content = first.data
        if not content:
            return
        has_edge_space = content[0].isspace() or content[-1].isspace()
        if has_edge_space and not text_elem.hasAttribute("xml:space"):
            text_elem.setAttribute("xml:space", "preserve")

    # Order matters: it fixes the sequence in which IDs are assigned and
    # namespaces are declared.
    handlers = (
        ("w:p", stamp_paragraph),
        ("w:r", stamp_run),
        ("w:t", preserve_edge_whitespace),
        ("w:ins", stamp_tracked_change),
        ("w:del", stamp_tracked_change),
        ("w:comment", stamp_comment),
        ("w16cex:commentExtensible", stamp_comment_extensible),
    )
    handler_for = dict(handlers)

    for node in nodes:
        if node.nodeType != node.ELEMENT_NODE:
            continue
        # getElementsByTagName never yields the node itself, so handle it first.
        own_handler = handler_for.get(node.tagName)
        if own_handler is not None:
            own_handler(node)
        for tag, handler in handlers:
            for descendant in node.getElementsByTagName(tag):
                handler(descendant)
def replace_node(self, elem, new_content):
    """Replace elem via XMLEditor.replace_node, then stamp the resulting nodes.

    Returns:
        The nodes returned by the base implementation, after RSID/author/date
        attributes have been injected into them.
    """
    replacement_nodes = super().replace_node(elem, new_content)
    self._inject_attributes_to_nodes(replacement_nodes)
    return replacement_nodes
def insert_after(self, elem, xml_content):
    """Insert xml_content after elem via XMLEditor.insert_after, then stamp it.

    Returns:
        The nodes returned by the base implementation, after RSID/author/date
        attributes have been injected into them.
    """
    inserted_nodes = super().insert_after(elem, xml_content)
    self._inject_attributes_to_nodes(inserted_nodes)
    return inserted_nodes
def insert_before(self, elem, xml_content):
    """Insert xml_content before elem via XMLEditor.insert_before, then stamp it.

    Returns:
        The nodes returned by the base implementation, after RSID/author/date
        attributes have been injected into them.
    """
    inserted_nodes = super().insert_before(elem, xml_content)
    self._inject_attributes_to_nodes(inserted_nodes)
    return inserted_nodes
def append_to(self, elem, xml_content):
    """Append xml_content to elem via XMLEditor.append_to, then stamp it.

    Returns:
        The nodes returned by the base implementation, after RSID/author/date
        attributes have been injected into them.
    """
    appended_nodes = super().append_to(elem, xml_content)
    self._inject_attributes_to_nodes(appended_nodes)
    return appended_nodes
def revert_insertion(self, elem):
    """Reject insertions by nesting their content in a tracked deletion.

    For every w:ins that contains at least one w:r, each run's w:t elements
    become w:delText, w:rsidR becomes w:rsidDel, and all children of the
    w:ins are moved into a new w:del that is appended back to the w:ins.
    A w:ins without runs is left untouched.

    Args:
        elem: A w:ins element, or any container (w:p, w:body, ...) whose
            descendant w:ins elements should all be rejected.

    Returns:
        list: ``[elem]``.

    Raises:
        ValueError: If elem is not a w:ins and contains none.

    Example:
        # Reject a single insertion
        ins = doc["word/document.xml"].get_node(tag="w:ins", attrs={"w:id": "5"})
        doc["word/document.xml"].revert_insertion(ins)

        # Reject all insertions in a paragraph
        para = doc["word/document.xml"].get_node(tag="w:p", line_number=42)
        doc["word/document.xml"].revert_insertion(para)
    """
    if elem.tagName == "w:ins":
        insertions = [elem]
    else:
        insertions = list(elem.getElementsByTagName("w:ins"))
    if not insertions:
        raise ValueError(
            "revert_insertion requires w:ins elements. "
            f"The provided element <{elem.tagName}> contains no insertions. "
        )

    def to_deleted_text(text_elem):
        """Swap one w:t for a w:delText carrying the same children and attributes."""
        deleted_text = self.dom.createElement("w:delText")
        # Move every child node, not just the first, so split text survives.
        while text_elem.firstChild:
            deleted_text.appendChild(text_elem.firstChild)
        for attr_name, attr_value in text_elem.attributes.items():
            deleted_text.setAttribute(attr_name, attr_value)
        text_elem.parentNode.replaceChild(deleted_text, text_elem)

    for insertion in insertions:
        inserted_runs = list(insertion.getElementsByTagName("w:r"))
        if not inserted_runs:
            continue

        deletion = self.dom.createElement("w:del")
        for run in inserted_runs:
            if run.hasAttribute("w:rsidR"):
                run.setAttribute("w:rsidDel", run.getAttribute("w:rsidR"))
                run.removeAttribute("w:rsidR")
            elif not run.hasAttribute("w:rsidDel"):
                run.setAttribute("w:rsidDel", self.rsid)
            # Snapshot the list: the loop replaces the nodes it iterates.
            for text_elem in list(run.getElementsByTagName("w:t")):
                to_deleted_text(text_elem)

        # Re-parent everything under the new w:del, then hang it on the w:ins.
        while insertion.firstChild:
            deletion.appendChild(insertion.firstChild)
        insertion.appendChild(deletion)
        self._inject_attributes_to_nodes([deletion])

    return [elem]
def revert_deletion(self, elem):
    """Reject deletions by re-inserting a copy of the deleted runs.

    For every w:del that contains at least one w:r, the runs are cloned,
    their w:delText elements become w:t, w:rsidDel becomes w:rsidR, and the
    clones are placed in a new w:ins inserted right after the w:del. The
    original w:del is left in place. A w:del without runs is skipped.

    Args:
        elem: A w:del element, or any container (w:p, w:body, ...) whose
            descendant w:del elements should all be rejected.

    Returns:
        list: ``[elem, new_ins]`` when elem is a w:del and an insertion was
        created; otherwise ``[elem]``.

    Raises:
        ValueError: If elem is not a w:del and contains none.

    Example:
        # Reject a single deletion - returns [w:del, w:ins]
        del_elem = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "3"})
        nodes = doc["word/document.xml"].revert_deletion(del_elem)

        # Reject all deletions in a paragraph - returns [para]
        para = doc["word/document.xml"].get_node(tag="w:p", line_number=42)
        nodes = doc["word/document.xml"].revert_deletion(para)
    """
    single_deletion = elem.tagName == "w:del"
    # Collect targets up front, before any insertion changes the tree.
    if single_deletion:
        deletions = [elem]
    else:
        deletions = list(elem.getElementsByTagName("w:del"))
    if not deletions:
        raise ValueError(
            "revert_deletion requires w:del elements. "
            f"The provided element <{elem.tagName}> contains no deletions. "
        )

    def restored_copy(deleted_run):
        """Return a deep copy of a deleted run, converted back to live text."""
        copy = deleted_run.cloneNode(True)
        for deleted_text in list(copy.getElementsByTagName("w:delText")):
            live_text = self.dom.createElement("w:t")
            # Move every child node, not just the first, so split text survives.
            while deleted_text.firstChild:
                live_text.appendChild(deleted_text.firstChild)
            for attr_name, attr_value in deleted_text.attributes.items():
                live_text.setAttribute(attr_name, attr_value)
            deleted_text.parentNode.replaceChild(live_text, deleted_text)
        if copy.hasAttribute("w:rsidDel"):
            copy.setAttribute("w:rsidR", copy.getAttribute("w:rsidDel"))
            copy.removeAttribute("w:rsidDel")
        elif not copy.hasAttribute("w:rsidR"):
            copy.setAttribute("w:rsidR", self.rsid)
        return copy

    new_insertion = None
    for deletion in deletions:
        deleted_runs = list(deletion.getElementsByTagName("w:r"))
        if not deleted_runs:
            continue

        insertion = self.dom.createElement("w:ins")
        for run in deleted_runs:
            insertion.appendChild(restored_copy(run))

        # Going through insert_after (rather than the DOM) lets the editor
        # stamp id/author/date onto the new w:ins.
        inserted_nodes = self.insert_after(deletion, insertion.toxml())
        if single_deletion and inserted_nodes:
            new_insertion = inserted_nodes[0]

    if single_deletion and new_insertion:
        return [elem, new_insertion]
    return [elem]
@staticmethod
def suggest_paragraph(xml_content: str) -> str:
    """Wrap a paragraph's content so it reads as a tracked insertion.

    The first <w:p> in xml_content gets an empty <w:ins/> marker as the first
    child of w:pPr/w:rPr (creating w:pPr and w:rPr when absent), and all of
    its children other than w:pPr are moved into a single <w:ins> wrapper.
    The markers carry no id/author/date; those are added when the result is
    inserted through a DocxXMLEditor.

    Only the paragraph's own w:pPr / w:rPr are used. Properties of nested
    paragraphs (for example inside a text box) are not touched.

    Args:
        xml_content: XML string containing a <w:p> element. Only the ``w:``
            prefix is declared while parsing.

    Returns:
        str: The transformed <w:p> element serialized as XML.
    """

    def direct_child(parent, tag):
        """Return parent's first direct child named tag, or None."""
        for child in parent.childNodes:
            if child.nodeName == tag:
                return child
        return None

    wrapper = f'<root xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">{xml_content}</root>'
    doc = minidom.parseString(wrapper)
    para = doc.getElementsByTagName("w:p")[0]

    # A descendant search could return the w:pPr of a nested paragraph, so
    # look at direct children only.
    pPr = direct_child(para, "w:pPr")
    if pPr is None:
        pPr = doc.createElement("w:pPr")
        # insertBefore with a None reference node appends.
        para.insertBefore(pPr, para.firstChild)

    rPr = direct_child(pPr, "w:rPr")
    if rPr is None:
        rPr = doc.createElement("w:rPr")
        # Schema order inside w:pPr is ..., rPr, sectPr, pPrChange, so the
        # new w:rPr must precede either of the latter if present.
        successor = next(
            (
                child
                for child in pPr.childNodes
                if child.nodeName in ("w:sectPr", "w:pPrChange")
            ),
            None,
        )
        pPr.insertBefore(rPr, successor)

    # The paragraph-mark insertion marker goes first inside w:rPr.
    rPr.insertBefore(doc.createElement("w:ins"), rPr.firstChild)

    # Move everything except w:pPr into one <w:ins> wrapper.
    ins_wrapper = doc.createElement("w:ins")
    for child in [c for c in para.childNodes if c.nodeName != "w:pPr"]:
        para.removeChild(child)
        ins_wrapper.appendChild(child)
    para.appendChild(ins_wrapper)
    return para.toxml()
def suggest_deletion(self, elem):
    """Mark a w:r or w:p element as deleted with tracked changes (in place).

    For w:r: converts <w:t> to <w:delText>, renames w:rsidR to w:rsidDel and
    wraps the run in a new <w:del>.
    For w:p: converts <w:t> to <w:delText> in every run, renames the runs'
    w:rsidR to w:rsidDel and moves all children except w:pPr into one
    <w:del>. When the paragraph's own w:pPr has a w:numPr (numbered list
    item), an empty <w:del/> marker is also added first in w:pPr/w:rPr.

    Only the paragraph's own w:pPr, w:numPr and w:rPr are considered, so
    numbering recorded inside w:pPrChange or in nested paragraphs does not
    make the paragraph count as numbered.

    Args:
        elem: A w:r or w:p DOM element without existing tracked changes.

    Returns:
        Element: For w:r, the new w:del wrapper that now contains the run.
        For w:p, the paragraph itself.

    Raises:
        ValueError: If a w:r already contains w:delText, if a w:p already
            contains w:ins or w:del, or if elem is neither w:r nor w:p.
    """

    def direct_child(parent, tag):
        """Return parent's first direct child named tag, or None."""
        for child in parent.childNodes:
            if child.nodeName == tag:
                return child
        return None

    def convert_text_to_deleted(container):
        """Replace every w:t under container with an equivalent w:delText."""
        for t_elem in list(container.getElementsByTagName("w:t")):
            del_text = self.dom.createElement("w:delText")
            # Move every child node, not just the first, so split text survives.
            while t_elem.firstChild:
                del_text.appendChild(t_elem.firstChild)
            # Keep attributes such as xml:space.
            for attr_name, attr_value in t_elem.attributes.items():
                del_text.setAttribute(attr_name, attr_value)
            t_elem.parentNode.replaceChild(del_text, t_elem)

    def mark_run_rsid_deleted(run):
        """Rename w:rsidR to w:rsidDel, or add w:rsidDel when neither exists."""
        if run.hasAttribute("w:rsidR"):
            run.setAttribute("w:rsidDel", run.getAttribute("w:rsidR"))
            run.removeAttribute("w:rsidR")
        elif not run.hasAttribute("w:rsidDel"):
            run.setAttribute("w:rsidDel", self.rsid)

    if elem.nodeName == "w:r":
        if elem.getElementsByTagName("w:delText"):
            raise ValueError("w:r element already contains w:delText")

        convert_text_to_deleted(elem)
        mark_run_rsid_deleted(elem)

        # Put the wrapper where the run was, then move the run inside it.
        del_wrapper = self.dom.createElement("w:del")
        parent = elem.parentNode
        parent.insertBefore(del_wrapper, elem)
        parent.removeChild(elem)
        del_wrapper.appendChild(elem)

        self._inject_attributes_to_nodes([del_wrapper])
        return del_wrapper

    if elem.nodeName == "w:p":
        if elem.getElementsByTagName("w:ins") or elem.getElementsByTagName("w:del"):
            raise ValueError("w:p element already contains tracked changes")

        pPr = direct_child(elem, "w:pPr")
        if pPr is not None and direct_child(pPr, "w:numPr") is not None:
            rPr = direct_child(pPr, "w:rPr")
            if rPr is None:
                rPr = self.dom.createElement("w:rPr")
                # Schema order inside w:pPr is ..., rPr, sectPr, pPrChange.
                successor = next(
                    (
                        child
                        for child in pPr.childNodes
                        if child.nodeName in ("w:sectPr", "w:pPrChange")
                    ),
                    None,
                )
                # insertBefore with a None reference node appends.
                pPr.insertBefore(rPr, successor)
            # The paragraph-mark deletion marker goes first inside w:rPr.
            rPr.insertBefore(self.dom.createElement("w:del"), rPr.firstChild)

        convert_text_to_deleted(elem)
        for run in elem.getElementsByTagName("w:r"):
            mark_run_rsid_deleted(run)

        # Move everything except w:pPr into one <w:del> wrapper.
        del_wrapper = self.dom.createElement("w:del")
        for child in [c for c in elem.childNodes if c.nodeName != "w:pPr"]:
            elem.removeChild(child)
            del_wrapper.appendChild(child)
        elem.appendChild(del_wrapper)

        self._inject_attributes_to_nodes([del_wrapper])
        return elem

    raise ValueError(f"Element must be w:r or w:p, got {elem.nodeName}")
def _generate_hex_id() -> str:
    """Return a random ID as 8 uppercase hex digits, for paraId/durableId use.

    Per the OOXML spec, paraId must be < 0x80000000 and durableId must be
    < 0x7FFFFFFF. The stricter durableId bound is applied to both, and zero
    is excluded, so values lie in [0x00000001, 0x7FFFFFFE].
    """
    largest_allowed = 0x7FFFFFFE
    return format(random.randint(1, largest_allowed), "08X")
def _generate_rsid() -> str:
    """Return a random RSID: 8 characters drawn from the uppercase hex digits."""
    hex_digits = "0123456789ABCDEF"
    picked = random.choices(hex_digits, k=8)
    return "".join(picked)
class Document:
"""Manages comments in unpacked Word documents."""
def __init__(
    self,
    unpacked_dir,
    rsid=None,
    track_revisions=False,
    author="Claude",
    initials="C",
):
    """Open an unpacked Word document for editing in a temporary working copy.

    The directory is copied into a fresh temp dir, and the original is also
    packed there as ``original.docx`` to serve as the validation baseline.
    Existing comments are indexed, then people.xml, its content type and
    relationship, and the settings.xml RSID entry are set up.

    Args:
        unpacked_dir: Path to unpacked DOCX directory (must contain word/ subdirectory)
        rsid: Optional RSID to use for all new elements. If not provided, one will be generated.
        track_revisions: If True, enables track revisions in settings.xml (default: False)
        author: Default author name for comments (default: "Claude")
        initials: Default author initials for comments (default: "C")

    Raises:
        ValueError: If unpacked_dir does not exist or is not a directory.
    """
    source_dir = Path(unpacked_dir)
    self.original_path = source_dir
    # is_dir() is already False for a missing path.
    if not source_dir.is_dir():
        raise ValueError(f"Directory not found: {unpacked_dir}")

    # Working area: <temp>/unpacked is edited, <temp>/original.docx is the
    # baseline (kept outside the unpacked tree so it is never copied back).
    self.temp_dir = tempfile.mkdtemp(prefix="docx_")
    workspace = Path(self.temp_dir)
    self.unpacked_path = workspace / "unpacked"
    shutil.copytree(source_dir, self.unpacked_path)
    self.original_docx = workspace / "original.docx"
    pack_document(source_dir, self.original_docx, validate=False)
    self.word_path = self.unpacked_path / "word"

    self.rsid = rsid or _generate_rsid()
    print(f"Using RSID: {self.rsid}")

    self.author = author
    self.initials = initials

    # Editors are created lazily by __getitem__ and cached here.
    self._editors = {}

    word_dir = self.word_path
    self.comments_path = word_dir / "comments.xml"
    self.comments_extended_path = word_dir / "commentsExtended.xml"
    self.comments_ids_path = word_dir / "commentsIds.xml"
    self.comments_extensible_path = word_dir / "commentsExtensible.xml"

    # Index existing comments before the setup steps below modify any file.
    self.existing_comments = self._load_existing_comments()
    self.next_comment_id = self._get_next_comment_id()

    # Shortcut to the document.xml editor (semi-private).
    self._document = self["word/document.xml"]

    self._setup_tracking(track_revisions=track_revisions)
    self._add_author_to_people(author)
def __getitem__(self, xml_path: str) -> DocxXMLEditor:
    """Return the editor for an XML part, creating and caching it on first use.

    Enables bracket notation:
        node = doc["word/document.xml"].get_node(tag="w:p", line_number=42)

    Args:
        xml_path: Path of the XML file relative to the unpacked root
            (e.g., "word/document.xml", "word/comments.xml")

    Returns:
        DocxXMLEditor for the file, configured with this document's RSID,
        author and initials. Repeated calls return the same instance.

    Raises:
        ValueError: If the file does not exist

    Example:
        # Get node from document.xml
        node = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "1"})
        # Get node from comments.xml
        comment = doc["word/comments.xml"].get_node(tag="w:comment", attrs={"w:id": "0"})
    """
    try:
        return self._editors[xml_path]
    except KeyError:
        pass

    file_path = self.unpacked_path / xml_path
    if not file_path.exists():
        raise ValueError(f"XML file not found: {xml_path}")

    editor = DocxXMLEditor(
        file_path, rsid=self.rsid, author=self.author, initials=self.initials
    )
    self._editors[xml_path] = editor
    return editor
def add_comment(self, start, end, text: str) -> int:
    """Add a comment spanning from one element to another.

    The range markers are written to document.xml and the comment is
    registered in comments.xml, commentsExtended.xml, commentsIds.xml and
    commentsExtensible.xml right away (in the in-memory editors).

    Args:
        start: DOM element where the commented range begins
        end: DOM element where the commented range ends
        text: Comment content

    Returns:
        The comment ID that was created

    Example:
        start_node = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "1"})
        end_node = doc["word/document.xml"].get_node(tag="w:ins", attrs={"w:id": "2"})
        doc.add_comment(start=start_node, end=end_node, text="Explanation")
    """
    new_id = self.next_comment_id
    para_id = _generate_hex_id()
    durable_id = _generate_hex_id()
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    document = self._document

    document.insert_before(start, self._comment_range_start_xml(new_id))

    # A paragraph anchor takes the end markup as its last child; any other
    # (run-level) anchor gets it as the following sibling.
    place_range_end = (
        document.append_to if end.tagName == "w:p" else document.insert_after
    )
    place_range_end(end, self._comment_range_end_xml(new_id))

    self._add_to_comments_xml(
        new_id, para_id, text, self.author, self.initials, timestamp
    )
    self._add_to_comments_extended_xml(para_id, parent_para_id=None)
    self._add_to_comments_ids_xml(para_id, durable_id)
    self._add_to_comments_extensible_xml(durable_id)

    # Register the comment so that it can be replied to later.
    self.existing_comments[new_id] = {"para_id": para_id}
    self.next_comment_id += 1
    return new_id
def reply_to_comment(
    self,
    parent_comment_id: int,
    text: str,
) -> int:
    """Add a reply to an existing comment.

    The reply's range markers are anchored to the parent's markers in
    document.xml, and the reply is registered in the comment parts with the
    parent's paragraph ID as its parent.

    Args:
        parent_comment_id: The w:id of the parent comment to reply to
        text: Reply text

    Returns:
        The comment ID that was created for the reply

    Raises:
        ValueError: If parent_comment_id is not a known comment.

    Example:
        doc.reply_to_comment(parent_comment_id=0, text="I agree with this change")
    """
    known_comments = self.existing_comments
    if parent_comment_id not in known_comments:
        raise ValueError(f"Parent comment with id={parent_comment_id} not found")
    parent_info = known_comments[parent_comment_id]

    reply_id = self.next_comment_id
    para_id = _generate_hex_id()
    durable_id = _generate_hex_id()
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    document = self._document

    parent_lookup = {"w:id": str(parent_comment_id)}
    parent_start_elem = document.get_node(
        tag="w:commentRangeStart", attrs=parent_lookup
    )
    parent_ref_elem = document.get_node(
        tag="w:commentReference", attrs=parent_lookup
    )

    document.insert_after(parent_start_elem, self._comment_range_start_xml(reply_id))

    # Both insertions target the parent's reference run, so the second one
    # lands between that run and the first. Final order: parent reference
    # run, reply reference run, reply commentRangeEnd.
    # NOTE(review): this puts the reply's reference run before its range
    # end - confirm that ordering is intended.
    parent_ref_run = parent_ref_elem.parentNode
    document.insert_after(parent_ref_run, f'<w:commentRangeEnd w:id="{reply_id}"/>')
    document.insert_after(parent_ref_run, self._comment_ref_run_xml(reply_id))

    self._add_to_comments_xml(
        reply_id, para_id, text, self.author, self.initials, timestamp
    )
    # Linking to the parent's paragraph ID is what makes this a reply.
    self._add_to_comments_extended_xml(
        para_id, parent_para_id=parent_info["para_id"]
    )
    self._add_to_comments_ids_xml(para_id, durable_id)
    self._add_to_comments_extensible_xml(durable_id)

    # Register the reply so that it can itself be replied to.
    known_comments[reply_id] = {"para_id": para_id}
    self.next_comment_id += 1
    return reply_id
def __del__(self):
    """Remove the temporary working directory when the object is finalized.

    Cleanup is best effort. ``temp_dir`` may be missing when __init__ failed
    before creating it, and the directory may already be gone or partly
    undeletable, so removal errors are ignored rather than raised from a
    finalizer.
    """
    temp_dir = getattr(self, "temp_dir", None)
    if temp_dir:
        shutil.rmtree(temp_dir, ignore_errors=True)
def validate(self) -> None:
    """Check the working copy against the XSD schema and the redlining rules.

    Both validators compare the current unpacked working copy with the
    baseline ``original.docx``. The schema check runs first; the redlining
    check only runs when the schema check passes.

    Raises:
        ValueError: If either validation fails.
    """
    # Build both validators from the current state before running either.
    schema_validator = DOCXSchemaValidator(
        self.unpacked_path, self.original_docx, verbose=False
    )
    redlining_validator = RedliningValidator(
        self.unpacked_path, self.original_docx, verbose=False
    )
    checks = (
        (schema_validator, "Schema validation failed"),
        (redlining_validator, "Redlining validation failed"),
    )
    for validator, failure_message in checks:
        if not validator.validate():
            raise ValueError(failure_message)
def save(self, destination=None, validate=True) -> None:
    """Write all edited XML parts and copy the working copy to its destination.

    Every editor opened so far is saved into the temporary working copy,
    which is then (optionally after validation) copied over the target
    directory. Files already in the target are overwritten, not removed.

    Args:
        destination: Optional path to save to. If None, saves back to original directory.
        validate: If True, validates document before copying (default: True).

    Raises:
        ValueError: If validate is True and validation fails; nothing is
            copied in that case.
    """
    # Relationships and content types for comments are only needed once a
    # comments part exists.
    if self.comments_path.exists():
        self._ensure_comment_relationships()
        self._ensure_comment_content_types()

    for editor in self._editors.values():
        editor.save()

    if validate:
        self.validate()

    target_path = self.original_path if not destination else Path(destination)
    shutil.copytree(self.unpacked_path, target_path, dirs_exist_ok=True)
# ==================== Private: Initialization ====================
def _get_next_comment_id(self):
    """Return the next free comment ID.

    This is 0 when comments.xml does not exist; otherwise one more than the
    largest integer w:id on any w:comment (missing or non-integer IDs are
    ignored).
    """
    if not self.comments_path.exists():
        return 0

    numeric_ids = []
    comments = self["word/comments.xml"].dom.getElementsByTagName("w:comment")
    for comment in comments:
        raw_id = comment.getAttribute("w:id")
        if not raw_id:
            continue
        try:
            numeric_ids.append(int(raw_id))
        except ValueError:
            continue
    return max(numeric_ids, default=-1) + 1
def _load_existing_comments(self):
    """Index the comments already present in comments.xml to enable replies.

    Returns:
        dict: Maps each comment's integer w:id to ``{"para_id": <w14:paraId>}``,
        where the paragraph ID is taken from the first w:p inside the comment
        that has one. Returns ``{}`` when comments.xml does not exist.

        Comments are skipped when their w:id is missing or not an integer
        (matching how _get_next_comment_id treats such IDs) or when none of
        their paragraphs carries a w14:paraId.
    """
    if not self.comments_path.exists():
        return {}

    editor = self["word/comments.xml"]
    existing = {}
    for comment_elem in editor.dom.getElementsByTagName("w:comment"):
        try:
            # int("") also raises, which covers a missing w:id.
            comment_id = int(comment_elem.getAttribute("w:id"))
        except ValueError:
            # A malformed ID cannot be replied to by number; skip the
            # comment instead of aborting the whole load.
            continue

        para_id = None
        for p_elem in comment_elem.getElementsByTagName("w:p"):
            para_id = p_elem.getAttribute("w14:paraId")
            if para_id:
                break
        if not para_id:
            continue

        existing[comment_id] = {"para_id": para_id}
    return existing
# ==================== Private: Setup Methods ====================
def _setup_tracking(self, track_revisions=False):
    """Prepare the supporting parts for comments and tracked changes.

    Runs, in order: people.xml creation, its content-type entry, its
    relationship entry, and the settings.xml update (the RSID is always
    registered; trackRevisions only on request).

    Args:
        track_revisions: If True, enables track revisions in settings.xml
    """
    word_dir = self.word_path
    self._update_people_xml(word_dir / "people.xml")
    self._add_content_type_for_people(self.unpacked_path / "[Content_Types].xml")
    self._add_relationship_for_people(word_dir / "_rels" / "document.xml.rels")
    self._update_settings(
        word_dir / "settings.xml", track_revisions=track_revisions
    )
def _update_people_xml(self, path):
    """Create people.xml from the bundled template when it is missing.

    An existing file is left untouched.

    Args:
        path: Destination path of people.xml.
    """
    if path.exists():
        return
    shutil.copy(TEMPLATE_DIR / "people.xml", path)
def _add_content_type_for_people(self, path):
    """Register people.xml in [Content_Types].xml unless it is already listed.

    Args:
        path: Unused; the part is always opened as "[Content_Types].xml"
            through this document's editor cache.
    """
    editor = self["[Content_Types].xml"]
    if not self._has_override(editor, "/word/people.xml"):
        editor.append_to(
            editor.dom.documentElement,
            '<Override PartName="/word/people.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.people+xml"/>',
        )
def _add_relationship_for_people(self, path):
    """Add the people.xml relationship to document.xml.rels unless present.

    The new element reuses the namespace prefix of the root element (if it
    has one) and takes its Id from the editor's next free relationship ID.

    Args:
        path: Unused; the part is always opened as
            "word/_rels/document.xml.rels" through this document's editor cache.
    """
    editor = self["word/_rels/document.xml.rels"]
    if self._has_relationship(editor, "people.xml"):
        return

    root = editor.dom.documentElement
    # "pfx:Relationships" -> "pfx:", unprefixed root -> "".
    head, colon, _ = root.tagName.partition(":")  # type: ignore
    prefix = head + colon if colon else ""
    next_rid = editor.get_next_rid()

    editor.append_to(
        root,
        f'<{prefix}Relationship Id="{next_rid}" Type="http://schemas.microsoft.com/office/2011/relationships/people" Target="people.xml"/>',
    )
def _update_settings(self, path, track_revisions=False):
    """Register this document's RSID in settings.xml; optionally enable tracking.

    Args:
        path: Unused; the part is always opened as "word/settings.xml"
            through this document's editor cache.
        track_revisions: If True, adds a trackRevisions element when absent.

    Placement follows OOXML schema order as far as the anchors allow:
    - trackRevisions: before documentProtection, else before defaultTabStop,
      else as the first child of settings
    - rsids: after compat, else before clrSchemeMapping, else as the last
      child of settings
    If an rsids element already exists, only a missing rsid entry is appended.
    """
    editor = self["word/settings.xml"]
    root = editor.get_node(tag="w:settings")
    # Reuse whatever prefix the root element uses; fall back to "w".
    prefix = root.tagName.split(":")[0] if ":" in root.tagName else "w"
    find_all = editor.dom.getElementsByTagName

    if track_revisions and not find_all(f"{prefix}:trackRevisions"):
        track_rev_xml = f"<{prefix}:trackRevisions/>"
        for anchor_tag in (f"{prefix}:documentProtection", f"{prefix}:defaultTabStop"):
            anchors = find_all(anchor_tag)
            if anchors:
                editor.insert_before(anchors[0], track_rev_xml)
                break
        else:
            # Neither anchor exists: put it first inside settings.
            if root.firstChild:
                editor.insert_before(root.firstChild, track_rev_xml)
            else:
                editor.append_to(root, track_rev_xml)

    existing_rsids = find_all(f"{prefix}:rsids")
    if existing_rsids:
        rsids_elem = existing_rsids[0]
        registered = [
            entry.getAttribute(f"{prefix}:val")
            for entry in rsids_elem.getElementsByTagName(f"{prefix}:rsid")
        ]
        if self.rsid not in registered:
            editor.append_to(rsids_elem, f'<{prefix}:rsid {prefix}:val="{self.rsid}"/>')
        return

    # No rsids section yet: create one with this RSID as root and sole entry.
    rsids_xml = (
        f"<{prefix}:rsids>\n"
        f'<{prefix}:rsidRoot {prefix}:val="{self.rsid}"/>\n'
        f'<{prefix}:rsid {prefix}:val="{self.rsid}"/>\n'
        f"</{prefix}:rsids>"
    )
    compat_elements = find_all(f"{prefix}:compat")
    if compat_elements:
        editor.insert_after(compat_elements[0], rsids_xml)
        return
    clr_elements = find_all(f"{prefix}:clrSchemeMapping")
    if clr_elements:
        editor.insert_before(clr_elements[0], rsids_xml)
        return
    editor.append_to(root, rsids_xml)
# ==================== Private: XML File Creation ====================
def _add_to_comments_xml(
self, comment_id, para_id, text, author, initials, timestamp
):
"""Add a single comment to comments.xml."""
if not self.comments_path.exists():
shutil.copy(TEMPLATE_DIR / "comments.xml", self.comments_path)
editor = self["word/comments.xml"]
root = editor.get_node(tag="w:comments")
escaped_text = (
text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
)
# Note: w:rsidR, w:rsidRDefault, w:rsidP on w:p, w:rsidR on w:r,
# and w:author, w:date, w:initials on w:comment are automatically added by DocxXMLEditor
comment_xml = f'''<w:comment w:id="{comment_id}">
<w:p w14:paraId="{para_id}" w14:textId="77777777">
<w:r><w:rPr><w:rStyle w:val="CommentReference"/></w:rPr><w:annotationRef/></w:r>
<w:r><w:rPr><w:color w:val="000000"/><w:sz w:val="20"/><w:szCs w:val="20"/></w:rPr><w:t>{escaped_text}</w:t></w:r>
</w:p>
</w:comment>'''
editor.append_to(root, comment_xml)
def _add_to_comments_extended_xml(self, para_id, parent_para_id):
"""Add a single comment to commentsExtended.xml."""
if not self.comments_extended_path.exists():
shutil.copy(
TEMPLATE_DIR / "commentsExtended.xml", self.comments_extended_path
)
editor = self["word/commentsExtended.xml"]
root = editor.get_node(tag="w15:commentsEx")
if parent_para_id:
xml = f'<w15:commentEx w15:paraId="{para_id}" w15:paraIdParent="{parent_para_id}" w15:done="0"/>'
else:
xml = f'<w15:commentEx w15:paraId="{para_id}" w15:done="0"/>'
editor.append_to(root, xml)
def _add_to_comments_ids_xml(self, para_id, durable_id):
"""Add a single comment to commentsIds.xml."""
if not self.comments_ids_path.exists():
shutil.copy(TEMPLATE_DIR / "commentsIds.xml", self.comments_ids_path)
editor = self["word/commentsIds.xml"]
root = editor.get_node(tag="w16cid:commentsIds")
xml = f'<w16cid:commentId w16cid:paraId="{para_id}" w16cid:durableId="{durable_id}"/>'
editor.append_to(root, xml)
def _add_to_comments_extensible_xml(self, durable_id):
"""Add a single comment to commentsExtensible.xml."""
if not self.comments_extensible_path.exists():
shutil.copy(
TEMPLATE_DIR / "commentsExtensible.xml", self.comments_extensible_path
)
editor = self["word/commentsExtensible.xml"]
root = editor.get_node(tag="w16cex:commentsExtensible")
xml = f'<w16cex:commentExtensible w16cex:durableId="{durable_id}"/>'
editor.append_to(root, xml)
# ==================== Private: XML Fragments ====================
def _comment_range_start_xml(self, comment_id):
"""Generate XML for comment range start."""
return f'<w:commentRangeStart w:id="{comment_id}"/>'
def _comment_range_end_xml(self, comment_id):
"""Generate XML for comment range end with reference run.
Note: w:rsidR is automatically added by DocxXMLEditor.
"""
return f'''<w:commentRangeEnd w:id="{comment_id}"/>
<w:r>
<w:rPr><w:rStyle w:val="CommentReference"/></w:rPr>
<w:commentReference w:id="{comment_id}"/>
</w:r>'''
def _comment_ref_run_xml(self, comment_id):
"""Generate XML for comment reference run.
Note: w:rsidR is automatically added by DocxXMLEditor.
"""
return f'''<w:r>
<w:rPr><w:rStyle w:val="CommentReference"/></w:rPr>
<w:commentReference w:id="{comment_id}"/>
</w:r>'''
# ==================== Private: Metadata Updates ====================
def _has_relationship(self, editor, target):
"""Check if a relationship with given target exists."""
for rel_elem in editor.dom.getElementsByTagName("Relationship"):
if rel_elem.getAttribute("Target") == target:
return True
return False
def _has_override(self, editor, part_name):
"""Check if an override with given part name exists."""
for override_elem in editor.dom.getElementsByTagName("Override"):
if override_elem.getAttribute("PartName") == part_name:
return True
return False
def _has_author(self, editor, author):
"""Check if an author already exists in people.xml."""
for person_elem in editor.dom.getElementsByTagName("w15:person"):
if person_elem.getAttribute("w15:author") == author:
return True
return False
def _add_author_to_people(self, author):
"""Add author to people.xml (called during initialization)."""
people_path = self.word_path / "people.xml"
# people.xml should already exist from _setup_tracking
if not people_path.exists():
raise ValueError("people.xml should exist after _setup_tracking")
editor = self["word/people.xml"]
root = editor.get_node(tag="w15:people")
# Check if author already exists
if self._has_author(editor, author):
return
# Add author with proper XML escaping to prevent injection
escaped_author = html.escape(author, quote=True)
person_xml = f'''<w15:person w15:author="{escaped_author}">
<w15:presenceInfo w15:providerId="None" w15:userId="{escaped_author}"/>
</w15:person>'''
editor.append_to(root, person_xml)
def _ensure_comment_relationships(self):
"""Ensure word/_rels/document.xml.rels has comment relationships."""
editor = self["word/_rels/document.xml.rels"]
if self._has_relationship(editor, "comments.xml"):
return
root = editor.dom.documentElement
root_tag = root.tagName # type: ignore
prefix = root_tag.split(":")[0] + ":" if ":" in root_tag else ""
next_rid_num = int(editor.get_next_rid()[3:])
# Add relationship elements
rels = [
(
next_rid_num,
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments",
"comments.xml",
),
(
next_rid_num + 1,
"http://schemas.microsoft.com/office/2011/relationships/commentsExtended",
"commentsExtended.xml",
),
(
next_rid_num + 2,
"http://schemas.microsoft.com/office/2016/09/relationships/commentsIds",
"commentsIds.xml",
),
(
next_rid_num + 3,
"http://schemas.microsoft.com/office/2018/08/relationships/commentsExtensible",
"commentsExtensible.xml",
),
]
for rel_id, rel_type, target in rels:
rel_xml = f'<{prefix}Relationship Id="rId{rel_id}" Type="{rel_type}" Target="{target}"/>'
editor.append_to(root, rel_xml)
def _ensure_comment_content_types(self):
"""Ensure [Content_Types].xml has comment content types."""
editor = self["[Content_Types].xml"]
if self._has_override(editor, "/word/comments.xml"):
return
root = editor.dom.documentElement
# Add Override elements
overrides = [
(
"/word/comments.xml",
"application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml",
),
(
"/word/commentsExtended.xml",
"application/vnd.openxmlformats-officedocument.wordprocessingml.commentsExtended+xml",
),
(
"/word/commentsIds.xml",
"application/vnd.openxmlformats-officedocument.wordprocessingml.commentsIds+xml",
),
(
"/word/commentsExtensible.xml",
"application/vnd.openxmlformats-officedocument.wordprocessingml.commentsExtensible+xml",
),
]
for part_name, content_type in overrides:
override_xml = (
f'<Override PartName="{part_name}" ContentType="{content_type}"/>'
)
editor.append_to(root, override_xml)