#!/usr/bin/env python3
"""
XML Element Extractor

Extracts the first matching XML element from a source file using Python's
standard library. Optionally formats the output with xmllint if available.

The element is selected by an opening-tag string such as '<item id="1">'.
An element matches when its tag name equals the one in the string and its
attribute set equals the attributes given (order and quoting style are
irrelevant; a tag string without attributes only matches elements that have
no attributes).

Usage:
    python extract_xml_element.py <source_file> <dest_file> <element_tag>

Example:
    python extract_xml_element.py source.xml dest.xml '<item id="1">'
"""

import os
import re
import subprocess
import sys
import xml.etree.ElementTree as ET
from xml.dom import minidom

# Indentation unit used by both formatters (the messages promise 2 spaces).
XML_INDENT = "  "

# Tag name right after '<'; '/' is excluded so '<item/>' yields 'item'.
_TAG_NAME_RE = re.compile(r'<\s*([^>\s/]+)')
# Whole opening tag: name, then everything up to the last '>'.
_TAG_RE = re.compile(r'<\s*([^>\s/]+)(.*)>', re.DOTALL)
# name="value" or name='value'. The name may contain '-', ':' and '.', so
# 'data-id' or 'xml:lang' are captured whole instead of only their tail.
_ATTR_RE = re.compile(r'''([^\s=<>/"']+)\s*=\s*(?:"([^"]*)"|'([^']*)')''')


def print_usage():
    """Print usage information and exit with status 1."""
    print("Usage: python extract_xml_element.py "
          "<source_file> <dest_file> <element_tag>")
    print("Example: python extract_xml_element.py source.xml dest.xml "
          "'<item id=\"1\">'")
    sys.exit(1)


def check_xmllint():
    """Return True if ``xmllint --version`` runs successfully, else False.

    Prints a one-line status message in either case.
    """
    try:
        result = subprocess.run(['xmllint', '--version'],
                                capture_output=True, text=True, timeout=5)
        if result.returncode == 0:
            print("xmllint found, will format output XML")
            return True
    except (subprocess.TimeoutExpired, OSError):
        # OSError covers a missing binary (FileNotFoundError) as well as a
        # non-executable one (PermissionError).
        pass

    print("xmllint not found, output will not be formatted")
    return False


def extract_tag_name(element_tag):
    """Return the tag name of an opening-tag string, without attributes.

    Returns None when the string does not start with '<' followed by a name.
    """
    match = _TAG_NAME_RE.match(element_tag.strip())
    if match:
        return match.group(1)
    return None


def _parse_tag(tag):
    """Split an opening-tag string into ``(name, attributes_dict)``.

    Returns None when *tag* does not look like ``<name ...>``.
    """
    match = _TAG_RE.match(tag.strip())
    if not match:
        return None

    attrs = {}
    for attr_match in _ATTR_RE.finditer(match.group(2)):
        name, double_quoted, single_quoted = attr_match.groups()
        attrs[name] = double_quoted if double_quoted is not None else single_quoted
    return match.group(1), attrs


def find_element_by_tag_string(tree_root, element_tag):
    """Find the first element matching the tag string including attributes.

    Args:
        tree_root: ElementTree element to search (the element itself and all
            of its descendants are considered, in document order).
        element_tag: Opening-tag string, e.g. '<item id="1">'.

    Returns:
        The first element whose tag name and full attribute set equal those
        of *element_tag*, or None if there is none or the string cannot be
        parsed.
    """
    parsed = _parse_tag(element_tag)
    if parsed is None:
        return None
    tag_name, wanted_attrs = parsed

    for element in tree_root.iter(tag_name):
        # Compare attribute mappings directly rather than rebuilding and
        # re-parsing a tag string, so values are compared verbatim and
        # attribute order never matters.
        if element.attrib == wanted_attrs:
            return element

    return None


def normalize_tag(tag):
    """Normalize an opening-tag string for comparison.

    Collapses whitespace, sorts attributes by name and renders every value
    in double quotes. A string that is not an opening tag is returned
    unchanged.
    """
    parsed = _parse_tag(tag)
    if parsed is None:
        return tag

    tag_name, attrs = parsed
    if not attrs:
        return f'<{tag_name}>'

    sorted_attrs = ' '.join(f'{k}="{v}"' for k, v in sorted(attrs.items()))
    return f'<{tag_name} {sorted_attrs}>'


def extract_xml_element(source_file, dest_file, element_tag):
    """Extract the first element matching *element_tag* into *dest_file*.

    Args:
        source_file: Path of the XML document to read.
        dest_file: Path the serialized element is written to (UTF-8).
        element_tag: Opening-tag string identifying the element.

    Returns:
        True on success; False if parsing fails, no element matches, or a
        file cannot be read/written. Errors are reported on stdout and never
        raised.
    """
    try:
        print(f"Extracting element: {element_tag}")
        print(f"From: {source_file}")
        print(f"To: {dest_file}")

        # Parse the XML file
        tree = ET.parse(source_file)
        root = tree.getroot()

        # Find the matching element
        target_element = find_element_by_tag_string(root, element_tag)

        if target_element is None:
            print("Error: No matching element found or extraction failed")
            return False

        # Convert element back to XML string
        rough_string = ET.tostring(target_element, encoding='unicode')

        # Write to destination file
        with open(dest_file, 'w', encoding='utf-8') as f:
            f.write(rough_string)

        print("Extraction completed successfully")
        return True

    except ET.ParseError as e:
        print(f"Error: XML parsing failed - {e}")
        return False
    except OSError as e:
        print(f"Error: File I/O failed - {e}")
        return False
    except Exception as e:
        # Top-level boundary for the CLI: report and signal failure.
        print(f"Error: Unexpected error - {e}")
        return False


def format_xml_with_xmllint(file_path):
    """Format an XML file in place using xmllint with 2-space indentation.

    Returns True if the file was rewritten, False otherwise. Failures are
    reported as warnings and the file is left untouched.
    """
    try:
        print("Formatting XML with 2-space indentation...")

        # xmllint reads its indentation unit from this environment variable.
        env = os.environ.copy()
        env['XMLLINT_INDENT'] = XML_INDENT

        with open(file_path, 'r', encoding='utf-8') as input_file:
            content = input_file.read()

        # Pin the pipe encoding to UTF-8 so it matches how the file is read
        # and written, independent of the locale.
        result = subprocess.run(['xmllint', '--format', '-'],
                                input=content,
                                encoding='utf-8',
                                capture_output=True,
                                env=env,
                                timeout=30)

        if result.returncode != 0:
            print(f"Warning: xmllint formatting failed - {result.stderr}")
            return False

        # Write formatted output back to file
        with open(file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(result.stdout)
        print("XML formatting completed")
        return True

    except subprocess.TimeoutExpired:
        print("Warning: xmllint formatting timed out")
        return False
    except Exception as e:
        # Formatting is best-effort; never let it abort the extraction.
        print(f"Warning: XML formatting failed - {e}")
        return False


def format_xml_with_python(file_path):
    """Format an XML file in place using minidom (fallback formatter).

    Returns True if the file was rewritten, False otherwise.
    """
    try:
        print("Formatting XML with Python (fallback)...")

        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Parse and format with minidom
        dom = minidom.parseString(content)
        pretty_xml = dom.toprettyxml(indent=XML_INDENT)

        # Remove extra blank lines that minidom adds
        lines = [line for line in pretty_xml.split('\n') if line.strip()]
        pretty_xml = '\n'.join(lines)

        # Write formatted output back to file
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(pretty_xml)

        print("XML formatting completed")
        return True

    except Exception as e:
        # Formatting is best-effort; never let it abort the extraction.
        print(f"Warning: Python XML formatting failed - {e}")
        return False


def format_xml(file_path, has_xmllint):
    """Format an XML file using xmllint when available, else minidom.

    Falls back to the Python formatter if xmllint fails.

    Returns:
        True if one of the formatters rewrote the file, False otherwise.
    """
    if has_xmllint and format_xml_with_xmllint(file_path):
        return True
    return format_xml_with_python(file_path)


def main():
    """Command-line entry point; exits with status 1 on any failure."""
    # Check arguments
    if len(sys.argv) != 4:
        print_usage()

    source_file = sys.argv[1]
    dest_file = sys.argv[2]
    element_tag = sys.argv[3]

    # Validate source file exists
    if not os.path.exists(source_file):
        print(f"Error: Source file '{source_file}' does not exist")
        sys.exit(1)

    # Check if source file is readable
    if not os.access(source_file, os.R_OK):
        print(f"Error: Source file '{source_file}' is not readable")
        sys.exit(1)

    # Check for xmllint availability
    has_xmllint = check_xmllint()

    # Extract the XML element
    if extract_xml_element(source_file, dest_file, element_tag):
        # Format the extracted XML (best-effort; failure is only a warning)
        format_xml(dest_file, has_xmllint)
        print(f"Success: XML element extracted to '{dest_file}'")
    else:
        print("Error: XML element extraction failed")
        sys.exit(1)


if __name__ == "__main__":
    main()