#!/usr/bin/env python3
"""
|
|
XML Element Extractor
|
|
|
|
Extracts the first matching XML element from a source file using Python's standard library.
|
|
Optionally formats the output with xmllint if available.
|
|
|
|
Usage: python extract_xml_element.py <source_xml> <dest_xml> <element_tag>
|
|
Example: python extract_xml_element.py source.xml dest.xml '<InstrumentVector Id="0">'
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import xml.etree.ElementTree as ET
|
|
from xml.dom import minidom
|
|
|
|
|
|
def print_usage():
    """Show the command-line synopsis and an example, then exit with status 1."""
    usage_lines = (
        "Usage: python extract_xml_element.py <source_xml> <dest_xml> <element_tag>",
        "Example: python extract_xml_element.py source.xml dest.xml '<InstrumentVector Id=\"0\">'",
    )
    for usage_line in usage_lines:
        print(usage_line)
    # Wrong invocation is an error, hence the non-zero status.
    sys.exit(1)
|
|
|
|
|
|
def check_xmllint():
    """Report whether a working ``xmllint`` executable can be run.

    Runs ``xmllint --version`` with a 5 second timeout and treats exit
    status 0 as "available". A one-line status message is printed either way.

    Returns:
        bool: True if the probe ran and exited with status 0, False if it
        could not be started, timed out, or exited non-zero.
    """
    try:
        # The version banner itself is not needed, so it is discarded rather
        # than captured and decoded.
        result = subprocess.run(
            ['xmllint', '--version'],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=5,
        )
    except (subprocess.TimeoutExpired, OSError):
        # OSError covers FileNotFoundError (not on PATH) as well as
        # PermissionError and similar "present but cannot be executed" cases.
        result = None

    if result is not None and result.returncode == 0:
        print("xmllint found, will format output XML")
        return True

    print("xmllint not found, output will not be formatted")
    return False
|
|
|
|
|
|
def extract_tag_name(element_tag):
    """Return the element name from an opening-tag string, ignoring attributes.

    Whitespace before ``<`` and between ``<`` and the name is tolerated, and
    the trailing slash of an attribute-less self-closing tag (``<Foo/>``) is
    not treated as part of the name.

    Args:
        element_tag: Opening tag text such as ``'<InstrumentVector Id="0">'``.

    Returns:
        The tag name as a string, or None when the text does not begin with
        ``<`` followed by a name.
    """
    match = re.match(r'\s*<\s*([^>\s]+)', element_tag)
    if not match:
        return None
    # "<Foo/>" captures "Foo/"; the slash is markup, not part of the name.
    name = match.group(1).rstrip('/')
    return name or None
|
|
|
|
|
|
def find_element_by_tag_string(tree_root, element_tag):
    """Find the first element whose opening tag matches *element_tag*.

    Candidates are the elements yielded by ``tree_root.iter(<tag name>)``.
    For each one an opening-tag string is rebuilt from its attributes and
    compared with *element_tag* after both have been passed through
    ``normalize_tag``, so the comparison is case-sensitive but does not
    depend on attribute order or spacing.

    Args:
        tree_root: Element to search (it must provide ``iter`` and its
            descendants ``attrib``, as ``xml.etree.ElementTree`` elements do).
        element_tag: Opening tag text, e.g. ``'<InstrumentVector Id="0">'``.

    Returns:
        The first matching element, or None if the tag name cannot be
        extracted or nothing matches.
    """
    tag_name = extract_tag_name(element_tag)
    if not tag_name:
        return None

    # The target never changes, so normalize it once instead of once per
    # candidate element.
    wanted = normalize_tag(element_tag)

    for element in tree_root.iter(tag_name):
        # Rebuild the opening tag from the parsed attributes.
        if element.attrib:
            attrs = ' '.join(f'{k}="{v}"' for k, v in sorted(element.attrib.items()))
            candidate = f'<{tag_name} {attrs}>'
        else:
            candidate = f'<{tag_name}>'

        if normalize_tag(candidate) == wanted:
            return element

    return None
|
|
|
|
|
|
def normalize_tag(tag):
    """Return a canonical form of an opening tag for equality comparison.

    The canonical form is ``<name a="1" b="2">``: single spaces, attributes
    sorted by name, values in double quotes. The following input variations
    are all folded into that form:

    * extra whitespace (including newlines) anywhere in the tag;
    * attribute order;
    * single- or double-quoted attribute values;
    * a self-closing slash (``<Foo/>``, ``<Foo a="1" />``);
    * namespace qualification of attribute names: ``prefix:name`` and
      ElementTree's ``{uri}name`` are both reduced to ``name``, so a tag
      typed with a prefix can match an attribute key reported by the parser.

    Attribute names may contain ``-`` and ``.`` and are kept whole, so
    ``data-id`` and ``x-id`` remain distinct.

    Args:
        tag: Opening tag text.

    Returns:
        The normalized tag, or *tag* unchanged when it does not look like
        ``<name ...>``. Attribute text that cannot be parsed as
        ``name="value"`` pairs is ignored.
    """
    # DOTALL lets the attribute section span several lines.
    match = re.match(r'\s*<\s*([^>\s]+)(.*)>', tag, re.DOTALL)
    if not match:
        return tag

    tag_name = match.group(1)
    attrs_str = match.group(2).strip()

    # Drop the slash of a self-closing tag; it carries no matching information.
    if attrs_str.endswith('/'):
        attrs_str = attrs_str[:-1].rstrip()
    elif not attrs_str:
        # "<Foo/>" puts the slash at the end of the captured name.
        tag_name = tag_name.rstrip('/') or tag_name

    attrs = {}
    attr_pattern = r"""([^\s="'<>]+)\s*=\s*(?:"([^"]*)"|'([^']*)')"""
    for attr_match in re.finditer(attr_pattern, attrs_str):
        qualified, dq_value, sq_value = attr_match.groups()
        # Strip "{uri}" first (the URI itself contains ':'), then "prefix:".
        local = qualified.rpartition('}')[2].rpartition(':')[2] or qualified
        attrs[local] = dq_value if dq_value is not None else sq_value

    if not attrs:
        return f'<{tag_name}>'

    sorted_attrs = ' '.join(f'{k}="{v}"' for k, v in sorted(attrs.items()))
    return f'<{tag_name} {sorted_attrs}>'
|
|
|
|
|
|
def extract_xml_element(source_file, dest_file, element_tag):
    """Copy the first element matching *element_tag* into its own file.

    Parses *source_file*, locates the element with
    ``find_element_by_tag_string``, serializes it with ``ET.tostring`` and
    writes the result to *dest_file* as UTF-8. Progress and error messages
    are printed; no exception is propagated to the caller.

    Args:
        source_file: Path of the XML file to read.
        dest_file: Path of the file to create or overwrite.
        element_tag: Opening tag text identifying the element.

    Returns:
        bool: True when the element was written, False when no element
        matched or when parsing, file I/O, or anything else failed.
    """
    try:
        print(f"Extracting element: {element_tag}")
        print(f"From: {source_file}")
        print(f"To: {dest_file}")

        root = ET.parse(source_file).getroot()
        target_element = find_element_by_tag_string(root, element_tag)

        if target_element is None:
            print("Error: No matching element found or extraction failed")
            return False

        # ET.tostring() also emits the element's tail (the text between its
        # end tag and the next sibling). In the destination file the element
        # is the document root, where trailing text would make the output
        # ill-formed, so drop it. The parsed tree is local to this call.
        target_element.tail = None
        rough_string = ET.tostring(target_element, encoding='unicode')

        with open(dest_file, 'w', encoding='utf-8') as f:
            f.write(rough_string)

        print("Extraction completed successfully")
        return True

    except ET.ParseError as e:
        print(f"Error: XML parsing failed - {e}")
        return False
    except OSError as e:
        # IOError is an alias of OSError in Python 3.
        print(f"Error: File I/O failed - {e}")
        return False
    except Exception as e:
        # Last-resort boundary: report and signal failure to main().
        print(f"Error: Unexpected error - {e}")
        return False
|
|
|
|
|
|
def format_xml_with_xmllint(file_path):
    """Reformat an XML file in place using ``xmllint --format``.

    The file content is piped to ``xmllint --format -`` with
    ``XMLLINT_INDENT`` set to two spaces, and the file is rewritten only if
    xmllint exits with status 0. All failures are reported as printed
    warnings and leave the file untouched.

    Args:
        file_path: Path of the UTF-8 XML file to reformat.

    Returns:
        bool: True if the file was rewritten with formatted output, False if
        xmllint failed, timed out, or could not be run.
    """
    try:
        print("Formatting XML with 2-space indentation...")

        # xmllint reads its indent string from this variable; two spaces to
        # match the message above.
        env = os.environ.copy()
        env['XMLLINT_INDENT'] = '  '

        with open(file_path, 'r', encoding='utf-8') as input_file:
            xml_text = input_file.read()

        # The file is UTF-8, so the pipe must be UTF-8 as well rather than
        # whatever the locale default happens to be.
        result = subprocess.run(
            ['xmllint', '--format', '-'],
            input=xml_text,
            encoding='utf-8',
            capture_output=True,
            env=env,
            timeout=30,
        )

        if result.returncode != 0:
            print(f"Warning: xmllint formatting failed - {result.stderr}")
            return False

        with open(file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(result.stdout)
        print("XML formatting completed")
        return True

    except subprocess.TimeoutExpired:
        print("Warning: xmllint formatting timed out")
        return False
    except Exception as e:
        # Formatting is best-effort; the caller falls back to another formatter.
        print(f"Warning: XML formatting failed - {e}")
        return False
|
|
|
|
|
|
def format_xml_with_python(file_path):
    """Reformat an XML file in place using ``xml.dom.minidom`` (fallback path).

    The file is parsed with minidom, pretty-printed with a two-space indent
    (the same width the xmllint path announces), stripped of the
    whitespace-only lines minidom produces, and written back as UTF-8 with a
    trailing newline. Any failure is reported as a printed warning.

    Args:
        file_path: Path of the UTF-8 XML file to reformat.

    Returns:
        bool: True if the file was rewritten, False if reading, parsing, or
        writing failed.
    """
    try:
        print("Formatting XML with Python (fallback)...")

        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        pretty_xml = minidom.parseString(content).toprettyxml(indent='  ')

        # toprettyxml() turns pre-existing whitespace text nodes into
        # whitespace-only lines; drop them.
        lines = [line for line in pretty_xml.split('\n') if line.strip()]

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines) + '\n')

        print("XML formatting completed")
        return True

    except Exception as e:
        # Formatting is best-effort; the unformatted file is still usable.
        print(f"Warning: Python XML formatting failed - {e}")
        return False
|
|
|
|
|
|
def format_xml(file_path, has_xmllint):
    """Pretty-print *file_path* in place, preferring xmllint over minidom.

    xmllint is tried only when *has_xmllint* is true; the minidom formatter
    runs when xmllint is unavailable or reports failure.
    """
    if has_xmllint and format_xml_with_xmllint(file_path):
        return
    # Either xmllint is not installed or it just failed: use the fallback.
    format_xml_with_python(file_path)
|
|
|
|
|
|
def main():
    """Command-line entry point: validate arguments, extract, then format.

    Exits with status 1 on a wrong argument count, a missing or unreadable
    source file, or a failed extraction.
    """
    if len(sys.argv) != 4:
        print_usage()

    _, source_file, dest_file, element_tag = sys.argv

    # Fail early with a specific message before attempting to parse.
    if not os.path.exists(source_file):
        print(f"Error: Source file '{source_file}' does not exist")
        sys.exit(1)

    if not os.access(source_file, os.R_OK):
        print(f"Error: Source file '{source_file}' is not readable")
        sys.exit(1)

    has_xmllint = check_xmllint()

    if not extract_xml_element(source_file, dest_file, element_tag):
        print("Error: XML element extraction failed")
        sys.exit(1)

    # Formatting is best-effort; success is reported regardless of its outcome.
    format_xml(dest_file, has_xmllint)
    print(f"Success: XML element extracted to '{dest_file}'")
|
|
|
|
|
|
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()