Initial commit
This commit is contained in:
248
skills/xml-element-extractor/scripts/extract_xml_element.py
Executable file
248
skills/xml-element-extractor/scripts/extract_xml_element.py
Executable file
@@ -0,0 +1,248 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
XML Element Extractor
|
||||
|
||||
Extracts the first matching XML element from a source file using Python's standard library.
|
||||
Optionally formats the output with xmllint if available.
|
||||
|
||||
Usage: python extract_xml_element.py <source_xml> <dest_xml> <element_tag>
|
||||
Example: python extract_xml_element.py source.xml dest.xml '<InstrumentVector Id="0">'
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import xml.etree.ElementTree as ET
|
||||
from xml.dom import minidom
|
||||
|
||||
|
||||
def print_usage():
|
||||
"""Print usage information and exit."""
|
||||
print("Usage: python extract_xml_element.py <source_xml> <dest_xml> <element_tag>")
|
||||
print("Example: python extract_xml_element.py source.xml dest.xml '<InstrumentVector Id=\"0\">'")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def check_xmllint():
|
||||
"""Check if xmllint is available."""
|
||||
try:
|
||||
result = subprocess.run(['xmllint', '--version'],
|
||||
capture_output=True, text=True, timeout=5)
|
||||
if result.returncode == 0:
|
||||
print("xmllint found, will format output XML")
|
||||
return True
|
||||
except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError):
|
||||
pass
|
||||
|
||||
print("xmllint not found, output will not be formatted")
|
||||
return False
|
||||
|
||||
|
||||
def extract_tag_name(element_tag):
|
||||
"""Extract tag name from element tag (without attributes)."""
|
||||
match = re.match(r'<\s*([^>\s]+)', element_tag)
|
||||
if match:
|
||||
return match.group(1)
|
||||
return None
|
||||
|
||||
|
||||
def find_element_by_tag_string(tree_root, element_tag):
|
||||
"""Find the first element matching the exact tag string including attributes."""
|
||||
tag_name = extract_tag_name(element_tag)
|
||||
if not tag_name:
|
||||
return None
|
||||
|
||||
# Find all elements with the matching tag name
|
||||
for element in tree_root.iter(tag_name):
|
||||
# Reconstruct the opening tag string with attributes
|
||||
if element.attrib:
|
||||
attrs = ' '.join(f'{k}="{v}"' for k, v in sorted(element.attrib.items()))
|
||||
constructed_tag = f'<{tag_name} {attrs}>'
|
||||
else:
|
||||
constructed_tag = f'<{tag_name}>'
|
||||
|
||||
# Check if this matches our target tag (case-sensitive and attribute order insensitive)
|
||||
if normalize_tag(constructed_tag) == normalize_tag(element_tag):
|
||||
return element
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def normalize_tag(tag):
|
||||
"""Normalize tag for comparison by removing extra whitespace and normalizing attribute order."""
|
||||
# Parse the tag to normalize it
|
||||
match = re.match(r'<\s*([^>\s]+)(.*)>', tag)
|
||||
if not match:
|
||||
return tag
|
||||
|
||||
tag_name = match.group(1)
|
||||
attrs_str = match.group(2).strip()
|
||||
|
||||
if not attrs_str:
|
||||
return f'<{tag_name}>'
|
||||
|
||||
# Parse attributes
|
||||
attrs = {}
|
||||
for attr_match in re.finditer(r'(\w+)\s*=\s*"([^"]*)"', attrs_str):
|
||||
attr_name = attr_match.group(1)
|
||||
attr_value = attr_match.group(2)
|
||||
attrs[attr_name] = attr_value
|
||||
|
||||
# Reconstruct with sorted attributes
|
||||
if attrs:
|
||||
sorted_attrs = ' '.join(f'{k}="{v}"' for k, v in sorted(attrs.items()))
|
||||
return f'<{tag_name} {sorted_attrs}>'
|
||||
else:
|
||||
return f'<{tag_name}>'
|
||||
|
||||
|
||||
def extract_xml_element(source_file, dest_file, element_tag):
|
||||
"""Extract XML element from source file to dest file."""
|
||||
try:
|
||||
print(f"Extracting element: {element_tag}")
|
||||
print(f"From: {source_file}")
|
||||
print(f"To: {dest_file}")
|
||||
|
||||
# Parse the XML file
|
||||
tree = ET.parse(source_file)
|
||||
root = tree.getroot()
|
||||
|
||||
# Find the matching element
|
||||
target_element = find_element_by_tag_string(root, element_tag)
|
||||
|
||||
if target_element is None:
|
||||
print("Error: No matching element found or extraction failed")
|
||||
return False
|
||||
|
||||
# Convert element back to XML string
|
||||
rough_string = ET.tostring(target_element, encoding='unicode')
|
||||
|
||||
# Write to destination file
|
||||
with open(dest_file, 'w', encoding='utf-8') as f:
|
||||
f.write(rough_string)
|
||||
|
||||
print("Extraction completed successfully")
|
||||
return True
|
||||
|
||||
except ET.ParseError as e:
|
||||
print(f"Error: XML parsing failed - {e}")
|
||||
return False
|
||||
except IOError as e:
|
||||
print(f"Error: File I/O failed - {e}")
|
||||
return False
|
||||
except Exception as e:
|
||||
print(f"Error: Unexpected error - {e}")
|
||||
return False
|
||||
|
||||
|
||||
def format_xml_with_xmllint(file_path):
|
||||
"""Format XML file using xmllint with 2-space indentation."""
|
||||
try:
|
||||
print("Formatting XML with 2-space indentation...")
|
||||
|
||||
# Set environment variable for indentation
|
||||
env = os.environ.copy()
|
||||
env['XMLLINT_INDENT'] = ' '
|
||||
|
||||
# Run xmllint
|
||||
with open(file_path, 'r', encoding='utf-8') as input_file:
|
||||
result = subprocess.run(['xmllint', '--format', '-'],
|
||||
input=input_file.read(),
|
||||
text=True,
|
||||
capture_output=True,
|
||||
env=env,
|
||||
timeout=30)
|
||||
|
||||
if result.returncode == 0:
|
||||
# Write formatted output back to file
|
||||
with open(file_path, 'w', encoding='utf-8') as output_file:
|
||||
output_file.write(result.stdout)
|
||||
print("XML formatting completed")
|
||||
return True
|
||||
else:
|
||||
print(f"Warning: xmllint formatting failed - {result.stderr}")
|
||||
return False
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
print("Warning: xmllint formatting timed out")
|
||||
return False
|
||||
except Exception as e:
|
||||
print(f"Warning: XML formatting failed - {e}")
|
||||
return False
|
||||
|
||||
|
||||
def format_xml_with_python(file_path):
|
||||
"""Format XML file using Python's minidom as fallback."""
|
||||
try:
|
||||
print("Formatting XML with Python (fallback)...")
|
||||
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
|
||||
# Parse and format with minidom
|
||||
dom = minidom.parseString(content)
|
||||
pretty_xml = dom.toprettyxml(indent=' ')
|
||||
|
||||
# Remove extra blank lines that minidom adds
|
||||
lines = [line for line in pretty_xml.split('\n') if line.strip()]
|
||||
pretty_xml = '\n'.join(lines)
|
||||
|
||||
# Write formatted output back to file
|
||||
with open(file_path, 'w', encoding='utf-8') as f:
|
||||
f.write(pretty_xml)
|
||||
|
||||
print("XML formatting completed")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
print(f"Warning: Python XML formatting failed - {e}")
|
||||
return False
|
||||
|
||||
|
||||
def format_xml(file_path, has_xmllint):
|
||||
"""Format XML file using available tools."""
|
||||
if has_xmllint:
|
||||
if not format_xml_with_xmllint(file_path):
|
||||
# Fallback to Python formatting if xmllint fails
|
||||
format_xml_with_python(file_path)
|
||||
else:
|
||||
format_xml_with_python(file_path)
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function."""
|
||||
# Check arguments
|
||||
if len(sys.argv) != 4:
|
||||
print_usage()
|
||||
|
||||
source_file = sys.argv[1]
|
||||
dest_file = sys.argv[2]
|
||||
element_tag = sys.argv[3]
|
||||
|
||||
# Validate source file exists
|
||||
if not os.path.exists(source_file):
|
||||
print(f"Error: Source file '{source_file}' does not exist")
|
||||
sys.exit(1)
|
||||
|
||||
# Check if source file is readable
|
||||
if not os.access(source_file, os.R_OK):
|
||||
print(f"Error: Source file '{source_file}' is not readable")
|
||||
sys.exit(1)
|
||||
|
||||
# Check for xmllint availability
|
||||
has_xmllint = check_xmllint()
|
||||
|
||||
# Extract the XML element
|
||||
if extract_xml_element(source_file, dest_file, element_tag):
|
||||
# Format the extracted XML
|
||||
format_xml(dest_file, has_xmllint)
|
||||
print(f"Success: XML element extracted to '{dest_file}'")
|
||||
else:
|
||||
print("Error: XML element extraction failed")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user