"""Validator for tracked changes in Word documents."""

import subprocess
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path


class RedliningValidator:
    """Check that every text edit made by one author is a tracked change.

    The validator strips the author's ``w:ins`` elements and unwraps the
    author's ``w:del`` elements in both the edited and the original
    ``word/document.xml``, then compares the remaining paragraph text. Any
    difference means text was changed outside of that author's tracked
    changes.
    """

    # Class-level default so instances created without __init__ (for example
    # via ``__new__``) still resolve ``self.author``.
    author = "Claude"

    def __init__(self, unpacked_dir, original_docx, verbose=False, author="Claude"):
        """Store the locations to compare.

        Args:
            unpacked_dir: Directory holding the unpacked, edited document; it
                must contain ``word/document.xml``.
            original_docx: Path to the original ``.docx`` (zip) file.
            verbose: When true, also print a message on success.
            author: Value of the ``w:author`` attribute whose tracked changes
                are validated.
        """
        self.unpacked_dir = Path(unpacked_dir)
        self.original_docx = Path(original_docx)
        self.verbose = verbose
        self.author = author
        self.namespaces = {
            "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
        }

    def _w(self, local_name):
        """Return ``local_name`` qualified with the ``w`` namespace (Clark notation)."""
        return f"{{{self.namespaces['w']}}}{local_name}"

    def _has_author_tracked_changes(self, root):
        """Return True if ``root`` holds a ``w:ins``/``w:del`` by ``self.author``."""
        author_attr = self._w("author")
        return any(
            elem.get(author_attr) == self.author
            for xpath in (".//w:del", ".//w:ins")
            for elem in root.findall(xpath, self.namespaces)
        )

    def validate(self):
        """Run the validation.

        Failures are reported with ``print``; success is printed only when
        ``verbose`` is set.

        Returns:
            True if the document is valid (including the case where the
            author made no tracked changes at all), False otherwise.
        """
        # Verify unpacked directory exists and has correct structure
        modified_file = self.unpacked_dir / "word" / "document.xml"
        if not modified_file.exists():
            print(f"FAILED - Modified document.xml not found at {modified_file}")
            return False

        # Redlining validation is only needed if tracked changes by the
        # author have been used, so look for them first.
        try:
            needs_validation = self._has_author_tracked_changes(
                ET.parse(modified_file).getroot()
            )
        except (ET.ParseError, OSError):
            # The pre-check is best effort: fall through to the full pass,
            # which parses the file again and reports the problem.
            needs_validation = True

        if not needs_validation:
            if self.verbose:
                print(f"PASSED - No tracked changes by {self.author} found.")
            return True

        # Create temporary directory for unpacking original docx
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            try:
                with zipfile.ZipFile(self.original_docx, "r") as zip_ref:
                    zip_ref.extractall(temp_path)
            except Exception as e:
                print(f"FAILED - Error unpacking original docx: {e}")
                return False

            original_file = temp_path / "word" / "document.xml"
            if not original_file.exists():
                print(
                    f"FAILED - Original document.xml not found in {self.original_docx}"
                )
                return False

            try:
                modified_root = ET.parse(modified_file).getroot()
                original_root = ET.parse(original_file).getroot()
            except ET.ParseError as e:
                print(f"FAILED - Error parsing XML files: {e}")
                return False

            # Remove the author's tracked changes from both documents
            self._remove_claude_tracked_changes(original_root)
            self._remove_claude_tracked_changes(modified_root)

            # Extract and compare text content
            modified_text = self._extract_text_content(modified_root)
            original_text = self._extract_text_content(original_root)

            if modified_text != original_text:
                print(self._generate_detailed_diff(original_text, modified_text))
                return False

            if self.verbose:
                print(f"PASSED - All changes by {self.author} are properly tracked")
            return True

    def _generate_detailed_diff(self, original_text, modified_text):
        """Build the failure report, including a git word diff when available.

        Args:
            original_text: Text extracted from the original document.
            modified_text: Text extracted from the edited document.

        Returns:
            The multi-line error message as a single string.
        """
        error_parts = [
            "FAILED - Document text doesn't match after removing "
            f"{self.author}'s tracked changes",
            "",
            "Likely causes:",
            "  1. Modified text inside another author's <w:ins> or <w:del> tags",
            "  2. Made edits without proper tracked changes",
            "  3. Didn't nest <w:del> inside <w:ins> when deleting another's insertion",
            "",
            "For pre-redlined documents, use correct patterns:",
            "  - To reject another's INSERTION: Nest <w:del> inside their <w:ins>",
            "  - To restore another's DELETION: Add new <w:ins> AFTER their <w:del>",
            "",
        ]

        git_diff = self._get_git_word_diff(original_text, modified_text)
        if git_diff:
            error_parts.extend(["Differences:", "============", git_diff])
        else:
            error_parts.append("Unable to generate word diff (git not available)")

        return "\n".join(error_parts)

    @staticmethod
    def _strip_diff_header(diff_output):
        """Return the non-blank lines of ``diff_output`` that follow a hunk header.

        Everything before the first ``@@`` line (the ``diff --git``, ``index``,
        ``---`` and ``+++`` lines) is dropped, as are the ``@@`` lines
        themselves and blank lines.
        """
        content_lines = []
        in_content = False
        for line in diff_output.split("\n"):
            if line.startswith("@@"):
                in_content = True
                continue
            if in_content and line.strip():
                content_lines.append(line)
        return "\n".join(content_lines)

    def _get_git_word_diff(self, original_text, modified_text):
        """Diff the two texts with ``git diff --word-diff``.

        A character-level diff is tried first; if it yields no content, a
        regular word-level diff is tried.

        Returns:
            The diff body without git's header lines, or None when no diff
            could be produced (for example because git is not installed).
        """
        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = Path(temp_dir)
                original_file = temp_path / "original.txt"
                modified_file = temp_path / "modified.txt"
                original_file.write_text(original_text, encoding="utf-8")
                modified_file.write_text(modified_text, encoding="utf-8")

                # First pass: character-by-character; second pass: git's
                # default word splitting.
                for granularity_args in (["--word-diff-regex=."], []):
                    result = subprocess.run(
                        [
                            "git",
                            "diff",
                            "--word-diff=plain",
                            *granularity_args,
                            "-U0",  # Zero lines of context - only changed lines
                            "--no-index",
                            str(original_file),
                            str(modified_file),
                        ],
                        capture_output=True,
                        text=True,
                        # The files are written as UTF-8, so decode git's
                        # output as UTF-8 instead of the locale default.
                        encoding="utf-8",
                        errors="replace",
                    )
                    content = self._strip_diff_header(result.stdout)
                    if content:
                        return content
        except Exception:
            # Deliberately broad: the diff only enriches an error message, so
            # a missing git binary or any other failure must not hide the
            # validation result. The caller prints a fallback note instead.
            pass

        return None

    def _remove_claude_tracked_changes(self, root):
        """Undo the tracked changes authored by ``self.author``, in place.

        The author's ``w:ins`` elements are removed together with their
        content. The author's ``w:del`` elements are replaced by their
        children, with every ``w:delText`` inside renamed to ``w:t`` so the
        deleted text counts as regular text again.

        Args:
            root: Root element of a parsed ``document.xml``; it is mutated.
        """
        ins_tag = self._w("ins")
        del_tag = self._w("del")
        author_attr = self._w("author")
        deltext_tag = self._w("delText")
        t_tag = self._w("t")

        # Remove the author's w:ins elements. Matches are collected before
        # removal so the child list is not mutated while it is being scanned.
        for parent in root.iter():
            insertions = [
                child
                for child in parent
                if child.tag == ins_tag and child.get(author_attr) == self.author
            ]
            for elem in insertions:
                parent.remove(elem)

        # Unwrap the content of the author's w:del elements.
        for parent in root.iter():
            deletions = [
                (index, child)
                for index, child in enumerate(parent)
                if child.tag == del_tag and child.get(author_attr) == self.author
            ]

            # Process in reverse order so the recorded indices stay valid
            for del_index, del_elem in reversed(deletions):
                # Convert w:delText to w:t before moving
                for elem in del_elem.iter():
                    if elem.tag == deltext_tag:
                        elem.tag = t_tag

                # Inserting the children back to front at the same index
                # keeps their original order in the parent.
                for child in reversed(list(del_elem)):
                    parent.insert(del_index, child)
                parent.remove(del_elem)

    def _extract_text_content(self, root):
        """Extract text content from Word XML, preserving paragraph structure.

        Empty paragraphs are skipped to avoid false positives when tracked
        insertions add only structural elements without text content.

        Returns:
            The text of all non-empty ``w:p`` elements, one per line.
        """
        p_tag = self._w("p")
        t_tag = self._w("t")

        paragraphs = []
        for p_elem in root.findall(f".//{p_tag}"):
            paragraph_text = "".join(
                t_elem.text for t_elem in p_elem.findall(f".//{t_tag}") if t_elem.text
            )
            # Skip empty paragraphs - they don't affect content validation
            if paragraph_text:
                paragraphs.append(paragraph_text)

        return "\n".join(paragraphs)


if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")