commit d6a802149236c81bf0a89043038d18b191e28382 Author: Zhongwei Li Date: Sun Nov 30 08:35:51 2025 +0800 Initial commit diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..9049370 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,12 @@ +{ + "name": "xml-element-extractor", + "description": "Extract XML elements from source files using Python's standard library with optional xmllint formatting", + "version": "1.0.0", + "author": { + "name": "Jay Xu", + "email": "jay.xu.krfantasy@gmail.com" + }, + "skills": [ + "./skills" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..a25677c --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# xml-element-extractor + +Extract XML elements from source files using Python's standard library with optional xmllint formatting diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..420939e --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,53 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:krfantasy/alsdiff:plugins/xml-element-extractor", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "208b08b000db4d0623c45264674d498a6952c82d", + "treeHash": "36c5a4149f671f2428764c5b35cad0e71d65e2a61a116bed240d32964b97c836", + "generatedAt": "2025-11-28T10:19:57.223439Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "xml-element-extractor", + "description": "Extract XML elements from source files using Python's standard library with optional xmllint formatting", + "version": "1.0.0" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": 
"72b6f91c642d0de5fcaeb6f38f82918947c92ee1652fc832ab746bc322259e80" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "99a99927a566aefda0acda622f37aadc0384ca988bae49b54cbf9fdf0948d9de" + }, + { + "path": "skills/xml-element-extractor/reference.md", + "sha256": "94910d4dbec68c606ec4cd026e8de082cf966cb462a6a92ed6868a2c3351f666" + }, + { + "path": "skills/xml-element-extractor/SKILL.md", + "sha256": "d0c5017a1a708ba77b57ab65fb8b2802ef43ad22654980ff294e61dad38fff86" + }, + { + "path": "skills/xml-element-extractor/scripts/extract_xml_element.py", + "sha256": "858b9fc06a6c0c7a1ffeec6e83315ce243a0ad989d5874b051d57030b05843a6" + } + ], + "dirSha256": "36c5a4149f671f2428764c5b35cad0e71d65e2a61a116bed240d32964b97c836" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/xml-element-extractor/SKILL.md b/skills/xml-element-extractor/SKILL.md new file mode 100644 index 0000000..1789d0b --- /dev/null +++ b/skills/xml-element-extractor/SKILL.md @@ -0,0 +1,164 @@ +--- +name: xml-element-extractor +description: Extract specific XML elements from source files using Python and optionally format with xmllint. This skill should be used when users need to isolate a single XML element (like InstrumentVector with Id="0") from a larger XML document, preserving the complete element structure including opening and closing tags. +--- + +# XML Element Extractor + +## Overview + +This skill enables precise extraction of XML elements from source files using Python's standard library (xml.etree.ElementTree), with optional xmllint formatting for clean output. Use this skill when you need to isolate a specific XML element from a larger document while maintaining its complete structure. 
+ +## ⚠️ Agent Guidelines + +**DO NOT READ XML FILES DIRECTLY** + +- **Never use Read tool on XML files** - Large files can exceed context limits +- **Never display XML content directly** - Always use the extraction script +- **Always use the Python extraction script** - Optimized for efficient XML processing +- **Process XML externally** - Let the script handle parsing, not the agent + +This prevents context overflow and maintains performance. + +**Key advantages of the Python implementation:** +- **Maximum compatibility**: Uses only Python standard library, no third-party dependencies +- **Robust parsing**: Proper XML parsing handles complex structures, special characters, and encoding +- **Attribute order insensitive**: Works regardless of attribute order in XML tags +- **Better error handling**: Provides clear error messages and graceful failure handling +- **Cross-platform**: Works consistently across different operating systems + +## Quick Start + +To extract an XML element: + +1. Identify the source XML file path +2. Choose destination file path for the extracted element +3. Specify the exact opening tag (including attributes) of the element to extract +4. Execute the extraction script +5. Verify the output file contains the desired element + +## Extraction Process + +### Step 1: Prepare Input Parameters + +Gather the following parameters: +- **Source XML file**: Path to the input XML file containing the element to extract +- **Destination XML file**: Path where the extracted element will be saved +- **Element tag**: Exact opening tag including all attributes (e.g., `<InstrumentVector Id="0">`) + +### Step 2: Execute Extraction Script + +Run the extraction script with the three parameters: + +```bash +python3 scripts/extract_xml_element.py <source_xml_file> <dest_xml_file> <element_tag> +``` + +Example: +```bash +python3 scripts/extract_xml_element.py source.xml dest.xml '<InstrumentVector Id="0">' +``` + +**Note:** The script is implemented in Python and keeps the `.py` extension, so existing workflows that call it by that name continue to work. 
The shebang line ensures it executes with Python. + +### Step 3: Verify Output + +The script will: +- Extract the first matching XML element from the source file +- Save it to the destination file +- Optionally format the output with xmllint if available +- Report success or failure with appropriate error messages + +## Error Handling + +The script provides basic error handling for common issues: + +- **Missing source file**: Displays error if source file doesn't exist +- **Unreadable source file**: Displays error if source file cannot be read +- **No matching element**: Displays error if the specified element tag is not found +- **Failed extraction**: Displays error if the extraction process fails + +## XML Formatting + +If xmllint is available on the system, the script will automatically format the extracted XML with: +- 2-space indentation +- Proper XML structure validation +- Clean, readable output format + +If xmllint is not available (or fails), the script falls back to Python's `xml.dom.minidom` to format the extracted element. + +## Usage Examples + +### Basic Extraction +```bash +# Extract InstrumentVector with Id="0" +python3 scripts/extract_xml_element.py live_set.xml instrument_vector.xml '<InstrumentVector Id="0">' + +# Extract MidiTrack element +python3 scripts/extract_xml_element.py tracks.xml midi_track.xml '<MidiTrack Id="1">' + +# Extract complex elements with quotes and special characters +python3 scripts/extract_xml_element.py live_set.xml device.xml '<Device Id="2" Name="Lead Synth (v2)">' +``` + +### Error Cases +```bash +# This will fail - source file doesn't exist +python3 scripts/extract_xml_element.py missing.xml output.xml '<InstrumentVector Id="0">' + +# This will fail - no matching element found +python3 scripts/extract_xml_element.py source.xml output.xml '<NonExistentElement Id="99">' + +# This will fail - malformed XML +python3 scripts/extract_xml_element.py malformed.xml output.xml '<InstrumentVector Id="0">' +``` + +## Resources + +### scripts/ + +**extract_xml_element.py**: Main Python script that performs XML element extraction using Python's standard library. 
The script: +- Takes three parameters: source file, destination file, and element tag +- Validates input parameters and file accessibility +- Uses xml.etree.ElementTree for robust XML parsing and element extraction +- Handles complex XML structures, special characters, and attribute order variations +- Optionally formats output with xmllint if available, with Python minidom fallback +- Provides clear error messages for common failure cases +- Cross-platform compatible (Windows, macOS, Linux) + +### Reference documentation + +**reference.md**: Reference documentation for the XML element extraction process, including explanations of the Python parsing logic and troubleshooting guides for common extraction issues. + +## Technical Details + +### Python XML Processing +The script uses Python's standard library with the following key components: + +**XML Parsing:** +- `xml.etree.ElementTree.parse()`: Parses XML files into structured element trees +- `tree.iter(tag_name)`: Iterates through all elements with matching tag names +- `ET.tostring()`: Converts elements back to XML strings + +**Tag Matching Algorithm:** +- **Attribute order normalization**: Sorts attributes for consistent comparison +- **Regex-based tag parsing**: Extracts tag names and attributes using regular expressions +- **Exact matching**: Ensures precise element identification including all attributes + +**Error Detection:** +- XML parsing validation using `ET.ParseError` +- File I/O error handling with proper exception catching +- Empty output file detection and cleanup +- Timeout handling for external tool calls + +**Formatting Options:** +- **Primary**: xmllint with `XMLLINT_INDENT` environment variable for 2-space indentation +- **Fallback**: Python's `xml.dom.minidom` for cross-platform compatibility + +### Cross-Platform Compatibility +The Python implementation provides consistent behavior across: +- **Windows**: Uses subprocess calls with proper path handling +- **macOS**: Native Unix-style subprocess execution 
+- **Linux**: Standard Python subprocess behavior + +No platform-specific tools are required, making the skill truly portable. diff --git a/skills/xml-element-extractor/reference.md b/skills/xml-element-extractor/reference.md new file mode 100644 index 0000000..3e964b5 --- /dev/null +++ b/skills/xml-element-extractor/reference.md @@ -0,0 +1,260 @@ +# XML Element Extraction - Python Implementation + +This reference document explains the XML element extraction process used in the Python-based extraction script and provides troubleshooting guidance. The Python implementation provides better compatibility and handles special characters more reliably than the previous sed-based approach. + +## Core XML Processing Logic + +### Python ElementTree Parsing + +The Python script uses the standard library `xml.etree.ElementTree` for robust XML parsing: + +```python +import xml.etree.ElementTree as ET + +# Parse the XML file +tree = ET.parse(source_file) +root = tree.getroot() +``` + +**Key advantages:** +- Proper XML parsing that understands structure and encoding +- Handles special characters, quotes, and XML entities correctly +- Provides structured access to XML elements and attributes +- Better error handling and validation +- Cross-platform compatibility using Python standard library + +### Tag Matching Algorithm + +The script implements sophisticated tag matching to find exact element matches: + +```python +def find_element_by_tag_string(tree_root, element_tag): + tag_name = extract_tag_name(element_tag) + + # Find all elements with the matching tag name + for element in tree_root.iter(tag_name): + # Reconstruct the opening tag string with attributes + if element.attrib: + attrs = ' '.join(f'{k}="{v}"' for k, v in sorted(element.attrib.items())) + constructed_tag = f'<{tag_name} {attrs}>' + else: + constructed_tag = f'<{tag_name}>' + + # Compare normalized tags for case-insensitive attribute order matching + if normalize_tag(constructed_tag) == 
normalize_tag(element_tag): + return element +``` + +### Tag Name Extraction + +The script extracts tag names using regular expressions: + +```python +def extract_tag_name(element_tag): + match = re.match(r'<\s*([^>\s]+)', element_tag) + if match: + return match.group(1) + return None +``` + +**Examples:** +- `` → `InstrumentVector` +- `` → `MidiTrack` +- `` → `ComplexTag` + +### Tag Normalization + +To handle different attribute orders and whitespace variations: + +```python +def normalize_tag(tag): + # Parse the tag to normalize it + match = re.match(r'<\s*([^>\s]+)(.*)>', tag) + if not match: + return tag + + tag_name = match.group(1) + attrs_str = match.group(2).strip() + + # Parse attributes and sort them for consistent comparison + attrs = {} + for attr_match in re.finditer(r'(\w+)\s*=\s*"([^"]*)"', attrs_str): + attr_name = attr_match.group(1) + attr_value = attr_match.group(2) + attrs[attr_name] = attr_value + + # Reconstruct with sorted attributes + if attrs: + sorted_attrs = ' '.join(f'{k}="{v}"' for k, v in sorted(attrs.items())) + return f'<{tag_name} {sorted_attrs}>' + else: + return f'<{tag_name}>' +``` + +## Troubleshooting Guide + +### Common Issues and Solutions + +#### 1. No Matching Element Found + +**Symptoms:** +- Empty destination file +- "No matching element found or extraction failed" error + +**Possible Causes:** +- Incorrect tag spelling or case sensitivity +- Missing or extra whitespace in tag +- Attributes don't match exactly +- Tag contains special characters needing escaping + +**Solutions:** +- Verify exact tag spelling and case +- Check for exact attribute matching +- Use quotes around special characters in attributes +- Validate source file contains the expected element + +#### 2. 
Multiple Elements Extracted + +**Symptoms:** +- Output contains more than one element +- Unexpected content in destination file + +**Possible Causes:** +- Source file has nested identical elements +- Closing tag matching is ambiguous + +**Solutions:** +- The script should handle this with the "first element only" pattern +- If issue persists, check source file structure +- Consider using more specific attributes in opening tag + +#### 3. Special Characters in Tags + +**Symptoms:** +- Shell quoting errors when passing the element tag +- Failed pattern matching + +**Common Special Characters:** +- Quotes (single or double) +- Ampersands (&) +- Greater/less than signs within attributes +- Unicode characters + +**Solutions:** +- Properly quote the element tag when calling the script +- Escape special characters if needed +- Use exact character encoding from source file + +#### 4. File Permission Issues + +**Symptoms:** +- "Source file does not exist" error +- "Source file is not readable" error + +**Solutions:** +- Verify file path is correct +- Check file permissions: `ls -la source.xml` +- Ensure read permissions: `chmod +r source.xml` +- Check directory permissions if file is in subdirectory + +### Debugging Tips + +#### Test XML Patterns Manually + +Before using the script, test XML patterns: + +```bash +# Test opening tag detection +grep -n "<InstrumentVector" source.xml + +# Test tag name extraction +echo '<InstrumentVector Id="0">' | python3 -c " +import sys, re +line = sys.stdin.read().strip() +match = re.match(r'<\s*([^>\s]+)', line) +if match: print(match.group(1)) +" +``` + +#### Validate XML Structure + +```bash +# Check if XML is well-formed +xmllint --noout source.xml + +# Pretty-print XML to understand structure +xmllint --format source.xml | head -50 +``` + +#### Check Element Count + +```bash +# Count occurrences of specific element +grep -c "<InstrumentVector Id=\"0\">" source.xml +``` + +## Advanced Usage + +### Customizing for Specific XML Structures + +For complex XML structures, you can extend the Python script or use additional Python logic: + +**Nested Elements:** +```python +# Extract only 
direct children, not nested ones +import xml.etree.ElementTree as ET + +tree = ET.parse('source.xml') +parent = tree.find('.//Parent') +if parent: + for child in parent: + if child.tag == 'Child': + print(ET.tostring(child).decode()) +``` + +**Multiple Attributes:** +```python +# More specific matching with multiple attributes +tree = ET.parse('source.xml') +for elem in tree.iter('InstrumentVector'): + if elem.get('Id') == '0' and elem.get('Type') == 'Audio': + print(ET.tostring(elem).decode()) + break +``` + +**Conditional Extraction:** +```python +# Extract only if element contains specific content +tree = ET.parse('source.xml') +for elem in tree.iter('InstrumentVector'): + if elem.findtext('.//SpecificContent') is not None: + print(ET.tostring(elem).decode()) + break +``` + +### Performance Considerations + +For large XML files: +- Consider using XML-specific tools like `xmlstarlet` +- Process files in chunks if memory is limited +- Use more specific patterns to reduce processing time + +## Alternative Tools + +For more complex XML processing needs: +- `xmlstarlet`: XML-specific command-line tool with XPath support +- `xmllint`: More robust XML processing and validation +- Python with `lxml`: For advanced XML manipulation and XPath +- `xpath`: Command-line XPath-based extraction +- Python `BeautifulSoup`: For HTML/XML parsing with tolerance for malformed documents \ No newline at end of file diff --git a/skills/xml-element-extractor/scripts/extract_xml_element.py b/skills/xml-element-extractor/scripts/extract_xml_element.py new file mode 100755 index 0000000..ed5fec9 --- /dev/null +++ b/skills/xml-element-extractor/scripts/extract_xml_element.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 + +""" +XML Element Extractor + +Extracts the first matching XML element from a source file using Python's standard library. +Optionally formats the output with xmllint if available. 
"""
XML Element Extractor

Extracts the first matching XML element from a source file using Python's
standard library, then pretty-prints the result with xmllint when it is
installed (falling back to xml.dom.minidom otherwise).

Usage: python extract_xml_element.py <source_xml_file> <dest_xml_file> <element_tag>
Example: python extract_xml_element.py source.xml dest.xml '<InstrumentVector Id="0">'
"""

import os
import re
import shutil
import subprocess
import sys
import xml.etree.ElementTree as ET
from xml.dom import minidom
from xml.sax.saxutils import unescape

# Indent used by BOTH formatters. Written as an expression so the width (two
# spaces, as promised by the progress messages) cannot be silently collapsed.
_INDENT = " " * 2

# "<", optional whitespace, then the tag name (stops at whitespace, "/" or ">"
# so that self-closing targets such as "<Tag/>" yield "Tag", not "Tag/").
_TAG_NAME_RE = re.compile(r'\s*<\s*([^\s/>]+)')

# name = "value"  or  name = 'value'
_ATTR_RE = re.compile(
    r"""([^\s=<>/"']+)\s*=\s*(?:"([^"]*)"|'([^']*)')"""
)

# Entities xml.sax.saxutils.unescape() does not decode by default.
_EXTRA_ENTITIES = {"&quot;": '"', "&apos;": "'"}


def print_usage():
    """Print usage information and exit with status 1."""
    print("Usage: python extract_xml_element.py "
          "<source_xml_file> <dest_xml_file> <element_tag>")
    print("Example: python extract_xml_element.py source.xml dest.xml "
          "'<InstrumentVector Id=\"0\">'")
    sys.exit(1)


def check_xmllint():
    """Report whether an ``xmllint`` executable is available on PATH.

    Returns:
        True if xmllint can be located, False otherwise.  A progress message
        is printed in both cases.
    """
    # shutil.which avoids spawning a process and cannot raise for an
    # unreadable/non-executable PATH entry the way subprocess.run can.
    if shutil.which('xmllint'):
        print("xmllint found, will format output XML")
        return True

    # The script still formats via minidom, so say so instead of claiming the
    # output is left unformatted.
    print("xmllint not found, output will be formatted with Python instead")
    return False


def _parse_tag(tag):
    """Split an opening-tag string into ``(tag_name, {attr: raw_value})``.

    Accepts double- or single-quoted attribute values, an optional trailing
    ``/`` (self-closing form) and a missing closing ``>``.  Attribute values
    are returned exactly as written (entities are NOT decoded here).

    Returns:
        A ``(name, attrs)`` tuple, or None when no tag name can be found.
    """
    if not tag:
        return None
    name_match = _TAG_NAME_RE.match(tag)
    if name_match is None:
        return None

    attrs = {}
    for attr in _ATTR_RE.finditer(tag, name_match.end()):
        double_quoted, single_quoted = attr.group(2), attr.group(3)
        attrs[attr.group(1)] = (
            double_quoted if double_quoted is not None else single_quoted
        )
    return name_match.group(1), attrs


def _local_name(attr_name):
    """Drop a ``{uri}`` (ElementTree) or ``prefix:`` (source text) qualifier.

    ElementTree stores ``xml:lang`` as ``{http://...}lang`` while the user
    types ``xml:lang``; comparing local names lets the two meet.
    """
    if attr_name.startswith('{'):
        return attr_name.rpartition('}')[2]
    return attr_name.rpartition(':')[2]


def extract_tag_name(element_tag):
    """Return the tag name of an opening-tag string, or None if there is none.

    Examples:
        ``'<InstrumentVector Id="0">'`` -> ``'InstrumentVector'``
        ``'<MidiTrack/>'``              -> ``'MidiTrack'``
    """
    parsed = _parse_tag(element_tag)
    return parsed[0] if parsed else None


def find_element_by_tag_string(tree_root, element_tag):
    """Find the first element whose name and attributes equal ``element_tag``.

    Matching is case-sensitive, insensitive to attribute order and quoting
    style, and exact: the element must carry precisely the attributes given
    (a bare ``<Tag>`` only matches elements with no attributes).

    Args:
        tree_root: ``xml.etree.ElementTree.Element`` to search (inclusive).
        element_tag: Opening tag as it appears in the file, e.g.
            ``'<InstrumentVector Id="0">'``.  XML entities in attribute
            values (``&amp;``, ``&quot;`` ...) are decoded before comparing.

    Returns:
        The first matching element in document order, or None.

    Note:
        The tag name is handed to ``Element.iter`` unchanged, so elements in
        an XML namespace must be addressed as ``{uri}Name``.
    """
    parsed = _parse_tag(element_tag)
    if parsed is None:
        return None
    tag_name, raw_attrs = parsed

    # ElementTree exposes attribute values already entity-decoded, so decode
    # the user's text the same way; comparing dicts (instead of re-serialised
    # strings) keeps values that contain quotes intact.
    wanted = {
        _local_name(name): unescape(value, _EXTRA_ENTITIES)
        for name, value in raw_attrs.items()
    }

    for element in tree_root.iter(tag_name):
        present = {_local_name(k): v for k, v in element.attrib.items()}
        if present == wanted:
            return element

    return None


def normalize_tag(tag):
    """Return a canonical form of an opening tag for string comparison.

    Whitespace is collapsed, a trailing ``/`` is dropped, values are
    re-quoted with double quotes and attributes are sorted by name.  Input
    that contains no recognisable tag name is returned unchanged.
    """
    parsed = _parse_tag(tag)
    if parsed is None:
        return tag

    tag_name, attrs = parsed
    if not attrs:
        return f'<{tag_name}>'

    sorted_attrs = ' '.join(f'{k}="{v}"' for k, v in sorted(attrs.items()))
    return f'<{tag_name} {sorted_attrs}>'


def extract_xml_element(source_file, dest_file, element_tag):
    """Copy the first element matching ``element_tag`` into ``dest_file``.

    Args:
        source_file: Path of the XML document to read.
        dest_file: Path to write the serialised element to (UTF-8).
        element_tag: Opening tag identifying the element.

    Returns:
        True on success; False (after printing an error) when the document
        cannot be parsed, no element matches, or file I/O fails.  Nothing is
        written to ``dest_file`` unless a match was found.
    """
    try:
        print(f"Extracting element: {element_tag}")
        print(f"From: {source_file}")
        print(f"To: {dest_file}")

        root = ET.parse(source_file).getroot()
        target_element = find_element_by_tag_string(root, element_tag)

        if target_element is None:
            print("Error: No matching element found or extraction failed")
            return False

        # ET.tostring() also serialises the text FOLLOWING the element (its
        # tail).  That text belongs to the parent; leaving it in would put
        # character data after the root and make the output ill-formed.
        # The tree is local to this call, so mutating it is harmless.
        target_element.tail = None
        xml_text = ET.tostring(target_element, encoding='unicode')

        with open(dest_file, 'w', encoding='utf-8') as f:
            f.write(xml_text)

        print("Extraction completed successfully")
        return True

    except ET.ParseError as e:
        print(f"Error: XML parsing failed - {e}")
        return False
    except OSError as e:
        print(f"Error: File I/O failed - {e}")
        return False
    except Exception as e:  # top-level boundary: report, never crash the CLI
        print(f"Error: Unexpected error - {e}")
        return False


def format_xml_with_xmllint(file_path):
    """Re-indent ``file_path`` in place using ``xmllint --format``.

    Returns:
        True if the file was rewritten; False (after printing a warning) if
        xmllint is missing, fails, times out or produces no output.  The
        file is left untouched on failure.
    """
    try:
        print("Formatting XML with 2-space indentation...")

        # xmllint reads its indent string from this environment variable.
        env = os.environ.copy()
        env['XMLLINT_INDENT'] = _INDENT

        with open(file_path, 'r', encoding='utf-8') as input_file:
            content = input_file.read()

        # The file is UTF-8; pin the pipe encoding instead of inheriting the
        # locale's (e.g. cp1252 on Windows), which corrupts non-ASCII text.
        result = subprocess.run(['xmllint', '--format', '-'],
                                input=content,
                                encoding='utf-8',
                                capture_output=True,
                                env=env,
                                timeout=30)

        # Never replace good content with an empty result.
        if result.returncode != 0 or not result.stdout.strip():
            print(f"Warning: xmllint formatting failed - {result.stderr}")
            return False

        with open(file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(result.stdout)
        print("XML formatting completed")
        return True

    except subprocess.TimeoutExpired:
        print("Warning: xmllint formatting timed out")
        return False
    except Exception as e:  # best-effort step: warn and let the caller fall back
        print(f"Warning: XML formatting failed - {e}")
        return False


def format_xml_with_python(file_path):
    """Re-indent ``file_path`` in place using ``xml.dom.minidom``.

    Returns:
        True if the file was rewritten, False (after printing a warning) if
        it could not be read, parsed or written.
    """
    try:
        print("Formatting XML with Python (fallback)...")

        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        pretty_xml = minidom.parseString(content).toprettyxml(indent=_INDENT)

        # minidom emits whitespace-only lines around pre-existing
        # indentation; drop them.
        lines = [line for line in pretty_xml.split('\n') if line.strip()]

        with open(file_path, 'w', encoding='utf-8') as f:
            # End with a newline, like xmllint's output does.
            f.write('\n'.join(lines) + '\n')

        print("XML formatting completed")
        return True

    except Exception as e:  # best-effort step: the raw extraction is kept
        print(f"Warning: Python XML formatting failed - {e}")
        return False


def format_xml(file_path, has_xmllint):
    """Format ``file_path`` with xmllint if available, else with minidom.

    minidom is also used when xmllint was detected but failed.
    """
    if has_xmllint and format_xml_with_xmllint(file_path):
        return
    format_xml_with_python(file_path)


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Full argument vector including the program name; defaults to
            ``sys.argv``.  Exactly three arguments are expected after the
            program name: source file, destination file and element tag.

    Exits with status 1 on bad usage, an unusable source file or a failed
    extraction.
    """
    args = sys.argv if argv is None else list(argv)
    if len(args) != 4:
        print_usage()

    source_file, dest_file, element_tag = args[1:]

    if not os.path.exists(source_file):
        print(f"Error: Source file '{source_file}' does not exist")
        sys.exit(1)

    if not os.access(source_file, os.R_OK):
        print(f"Error: Source file '{source_file}' is not readable")
        sys.exit(1)

    has_xmllint = check_xmllint()

    if not extract_xml_element(source_file, dest_file, element_tag):
        print("Error: XML element extraction failed")
        sys.exit(1)

    format_xml(dest_file, has_xmllint)
    print(f"Success: XML element extracted to '{dest_file}'")


if __name__ == "__main__":
    main()