Files
gh-krfantasy-alsdiff-plugin…/skills/xml-element-extractor/scripts/extract_xml_element.py
2025-11-30 08:35:51 +08:00

248 lines
7.6 KiB
Python
Executable File

#!/usr/bin/env python3
"""
XML Element Extractor
Extracts the first matching XML element from a source file using Python's standard library.
Optionally formats the output with xmllint if available.
Usage: python extract_xml_element.py <source_xml> <dest_xml> <element_tag>
Example: python extract_xml_element.py source.xml dest.xml '<InstrumentVector Id="0">'
"""
import sys
import os
import re
import subprocess
import xml.etree.ElementTree as ET
from xml.dom import minidom
def print_usage():
    """Show the command-line synopsis with a sample invocation, then exit with status 1."""
    usage_lines = (
        "Usage: python extract_xml_element.py <source_xml> <dest_xml> <element_tag>",
        "Example: python extract_xml_element.py source.xml dest.xml '<InstrumentVector Id=\"0\">'",
    )
    for usage_line in usage_lines:
        print(usage_line)
    sys.exit(1)
def check_xmllint():
    """Report whether the ``xmllint`` command-line tool can be run.

    Probes the tool by running ``xmllint --version`` with a 5 second timeout
    and prints a one-line status message in either case.

    Returns:
        bool: True when the probe exits with status 0. False when the
        executable is missing, cannot be launched, times out, or exits
        with a non-zero status.
    """
    try:
        result = subprocess.run(
            ['xmllint', '--version'],
            # The version banner is never inspected, so discard it instead of
            # capturing and decoding it.
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=5,
        )
    except (subprocess.TimeoutExpired, OSError):
        # OSError covers FileNotFoundError as well as other launch failures
        # such as PermissionError (e.g. a non-executable file named xmllint
        # on PATH), so the probe degrades to "not available" instead of
        # crashing the whole script.
        result = None

    if result is not None and result.returncode == 0:
        print("xmllint found, will format output XML")
        return True

    print("xmllint not found, output will not be formatted")
    return False
def extract_tag_name(element_tag):
    """Return the bare element name from an opening-tag string.

    The name is the run of characters that follows ``<`` (and any whitespace
    after it) up to the first whitespace character or ``>``; attributes are
    ignored. Returns None when the string does not start with such a tag.
    """
    found = re.match(r'<\s*([^>\s]+)', element_tag)
    return found.group(1) if found else None
def find_element_by_tag_string(tree_root, element_tag):
    """Find the first element whose opening tag matches ``element_tag``.

    Args:
        tree_root: ElementTree element to search; the element itself and all
            of its descendants are considered (via ``iter``).
        element_tag: Opening-tag string such as ``<InstrumentVector Id="0">``.

    Returns:
        The first element whose tag name equals the name in ``element_tag``
        and whose rebuilt opening tag compares equal to ``element_tag`` after
        both pass through ``normalize_tag``. None when no tag name can be
        extracted from ``element_tag`` or when nothing matches.
    """
    tag_name = extract_tag_name(element_tag)
    if not tag_name:
        return None

    # The target does not change between candidates, so normalize it once
    # instead of once per visited element.
    wanted = normalize_tag(element_tag)

    for element in tree_root.iter(tag_name):
        # Rebuild the candidate's opening tag so it can be compared as text.
        if element.attrib:
            attrs = ' '.join(f'{k}="{v}"' for k, v in sorted(element.attrib.items()))
            candidate = f'<{tag_name} {attrs}>'
        else:
            candidate = f'<{tag_name}>'

        # Comparison is case-sensitive and insensitive to attribute order.
        if normalize_tag(candidate) == wanted:
            return element

    return None
def normalize_tag(tag):
    """Return a canonical form of an opening-tag string for comparison.

    Whitespace between parts is collapsed, attributes are sorted by name and
    every value is rendered in double quotes, so two tags that differ only in
    spacing, attribute order or quote style normalize to the same string.

    Args:
        tag: Opening-tag string, e.g. ``<Item  b='2' a="1">``.

    Returns:
        The normalized tag, e.g. ``<Item a="1" b="2">``. A string that does
        not look like ``<name ...>`` is returned unchanged.
    """
    match = re.match(r'<\s*([^>\s]+)(.*)>', tag)
    if not match:
        return tag

    tag_name = match.group(1)
    attrs_str = match.group(2).strip()

    # An attribute name is everything up to '=' that is not whitespace or a
    # quote. Using \w+ here would truncate names such as data-id or xml:lang
    # to their last word-only fragment and let unrelated attributes compare
    # equal. Values may be wrapped in either quote style.
    attr_pattern = r'''([^\s="']+)\s*=\s*(?:"([^"]*)"|'([^']*)')'''
    attrs = {}
    for attr_match in re.finditer(attr_pattern, attrs_str):
        attr_name, double_quoted, single_quoted = attr_match.groups()
        attrs[attr_name] = double_quoted if double_quoted is not None else single_quoted

    if not attrs:
        return f'<{tag_name}>'

    sorted_attrs = ' '.join(f'{k}="{v}"' for k, v in sorted(attrs.items()))
    return f'<{tag_name} {sorted_attrs}>'
def extract_xml_element(source_file, dest_file, element_tag):
    """Copy the first element matching ``element_tag`` into its own file.

    Parses ``source_file``, locates the element with
    ``find_element_by_tag_string`` and writes its serialized form to
    ``dest_file`` as UTF-8. Progress and error messages are printed.

    Args:
        source_file: Path of the XML document to read.
        dest_file: Path of the file to (over)write with the element.
        element_tag: Opening-tag string identifying the element.

    Returns:
        bool: True when the element was found and written. False when no
        element matched or when parsing, file I/O or any other error
        occurred; errors are reported on stdout rather than raised.
    """
    try:
        print(f"Extracting element: {element_tag}")
        print(f"From: {source_file}")
        print(f"To: {dest_file}")

        tree = ET.parse(source_file)
        target_element = find_element_by_tag_string(tree.getroot(), element_tag)
        if target_element is None:
            print("Error: No matching element found or extraction failed")
            return False

        # ET.tostring() also serializes the element's tail (the text that
        # follows its closing tag in the source). That text is not part of
        # the element and, if not pure whitespace, would leave content after
        # the root of the extracted document. The tree is private to this
        # call, so clearing the tail in place is safe.
        target_element.tail = None
        rough_string = ET.tostring(target_element, encoding='unicode')

        with open(dest_file, 'w', encoding='utf-8') as f:
            f.write(rough_string)

        print("Extraction completed successfully")
        return True
    except ET.ParseError as e:
        print(f"Error: XML parsing failed - {e}")
        return False
    except OSError as e:
        print(f"Error: File I/O failed - {e}")
        return False
    except Exception as e:
        # Top-level boundary for the CLI: report and signal failure.
        print(f"Error: Unexpected error - {e}")
        return False
def format_xml_with_xmllint(file_path):
    """Re-indent an XML file in place with ``xmllint --format`` (2 spaces).

    The file content is piped to xmllint on stdin; on success the formatted
    output replaces the file content.

    Args:
        file_path: Path of the XML file to reformat.

    Returns:
        bool: True when xmllint exited with status 0 and the file was
        rewritten. False when xmllint reported an error, timed out after
        30 seconds, or any other exception occurred (a warning is printed
        and the file is left as it was).
    """
    try:
        print("Formatting XML with 2-space indentation...")

        # xmllint takes its indent string from this environment variable;
        # two spaces matches the indentation announced above.
        env = os.environ.copy()
        env['XMLLINT_INDENT'] = '  '

        with open(file_path, 'r', encoding='utf-8') as input_file:
            xml_text = input_file.read()

        # The file is read and written as UTF-8, so talk to xmllint in UTF-8
        # too instead of whatever the locale encoding happens to be.
        result = subprocess.run(
            ['xmllint', '--format', '-'],
            input=xml_text,
            encoding='utf-8',
            capture_output=True,
            env=env,
            timeout=30,
        )

        if result.returncode != 0:
            print(f"Warning: xmllint formatting failed - {result.stderr}")
            return False

        with open(file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(result.stdout)
        print("XML formatting completed")
        return True
    except subprocess.TimeoutExpired:
        print("Warning: xmllint formatting timed out")
        return False
    except Exception as e:
        # Best-effort step: any failure (including xmllint not being
        # installed) is reported as a warning, never raised.
        print(f"Warning: XML formatting failed - {e}")
        return False
def format_xml_with_python(file_path):
    """Re-indent an XML file in place using ``xml.dom.minidom`` (fallback).

    Args:
        file_path: Path of the XML file to reformat.

    Returns:
        bool: True when the file was parsed and rewritten with two-space
        indentation. False when any exception occurred (a warning is
        printed and the file is left as it was).
    """
    try:
        print("Formatting XML with Python (fallback)...")

        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        dom = minidom.parseString(content)
        # Two spaces, to match the 2-space format the xmllint formatter
        # in this script announces.
        pretty_xml = dom.toprettyxml(indent='  ')

        # minidom emits whitespace-only lines around existing text nodes;
        # drop them so the result has no blank lines.
        lines = [line for line in pretty_xml.split('\n') if line.strip()]
        pretty_xml = '\n'.join(lines)

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(pretty_xml)

        print("XML formatting completed")
        return True
    except Exception as e:
        # Best-effort step: report the failure as a warning, never raise.
        print(f"Warning: Python XML formatting failed - {e}")
        return False
def format_xml(file_path, has_xmllint):
    """Pretty-print ``file_path`` in place with the best formatter available.

    xmllint is tried first when ``has_xmllint`` is true; the Python formatter
    runs when xmllint is unavailable or when its attempt fails.
    """
    if has_xmllint and format_xml_with_xmllint(file_path):
        return
    # Reached when xmllint is absent or when it reported a failure.
    format_xml_with_python(file_path)
def main():
    """Command-line entry point: validate arguments, extract, then format.

    Expects exactly three arguments (source path, destination path, element
    tag). Exits with status 1 on bad usage, on a missing or unreadable source
    file, or when the extraction fails.
    """
    # Anything other than exactly three arguments is a usage error.
    if len(sys.argv) != 4:
        print_usage()

    source_file, dest_file, element_tag = sys.argv[1:4]

    # The source must exist ...
    if not os.path.exists(source_file):
        print(f"Error: Source file '{source_file}' does not exist")
        sys.exit(1)

    # ... and be readable by the current user.
    if not os.access(source_file, os.R_OK):
        print(f"Error: Source file '{source_file}' is not readable")
        sys.exit(1)

    # Probe for xmllint before extracting, as the formatter choice depends on it.
    has_xmllint = check_xmllint()

    if not extract_xml_element(source_file, dest_file, element_tag):
        print("Error: XML element extraction failed")
        sys.exit(1)

    # Formatting is best-effort; its outcome does not affect the exit status.
    format_xml(dest_file, has_xmllint)
    print(f"Success: XML element extracted to '{dest_file}'")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()