#!/usr/bin/env python3
"""
LLM-based link correction for project-migrate skill

This script uses an LLM to intelligently identify and correct broken or
outdated links within markdown content during file migration.
"""

import sys
import re
import argparse
import subprocess
from pathlib import Path
from typing import List, Dict, Tuple


def extract_markdown_links(content: str) -> List[Dict]:
    """
    Extract all markdown links from content and return structured information.

    Args:
        content: Markdown content to analyze

    Returns:
        List of dictionaries with link information
    """
    links = []

    # Pattern to match markdown links: [text](path) and ![alt](path)
    pattern = r'!\[([^\]]*)\]\(([^)]+)\)|\[([^\]]*)\]\(([^)]+)\)'

    for match in re.finditer(pattern, content):
        alt_text, img_src, link_text, link_href = match.groups()

        if img_src:
            # Image link
            links.append({
                'type': 'image',
                'alt': alt_text,
                'path': img_src,
                'full_match': match.group(0)
            })
        elif link_href:
            # Regular link
            links.append({
                'type': 'link',
                'text': link_text,
                'path': link_href,
                'full_match': match.group(0)
            })

    return links
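
# Illustrative example of what extract_markdown_links() returns; the sample
# input below is hypothetical, and matches are reported in document order:
#
#   >>> extract_markdown_links("See [docs](docs/README.md) and ![logo](img/logo.png)")
#   [{'type': 'link', 'text': 'docs', 'path': 'docs/README.md',
#     'full_match': '[docs](docs/README.md)'},
#    {'type': 'image', 'alt': 'logo', 'path': 'img/logo.png',
#     'full_match': '![logo](img/logo.png)'}]
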
def should_skip_link(link_path: str) -> bool:
    """
    Determine if a link should be skipped (external URLs, anchors, etc.).

    Args:
        link_path: The path part of the link

    Returns:
        True if the link should be skipped
    """
    # Skip absolute URLs
    if link_path.startswith(('http://', 'https://', 'mailto:', 'ftp://', 'tel:')):
        return True

    # Skip anchor links
    if link_path.startswith('#'):
        return True

    # Skip email links written without a mailto: prefix
    if '@' in link_path and not link_path.startswith(('http://', 'https://')):
        return True

    return False


def get_file_context(file_path: str) -> Dict:
    """
    Get context about the file being processed.

    Args:
        file_path: Path to the file

    Returns:
        Dictionary with file context information
    """
    path = Path(file_path)

    try:
        relative_to_root = str(path.relative_to(Path.cwd()))
    except ValueError:
        # The file is not under the current working directory
        relative_to_root = str(path)

    context = {
        'file_path': str(path.absolute()),
        'filename': path.name,
        'directory': str(path.parent.absolute()),
        'relative_to_root': relative_to_root,
    }

    return context


def call_llm_for_link_correction(content: str, context: Dict) -> str:
    """
    Call LLM to perform intelligent link correction.

    Args:
        content: Original markdown content
        context: File context information

    Returns:
        Corrected markdown content
    """
    try:
        # Prepare the prompt for the LLM
        prompt = f"""You are a markdown link correction assistant. Your task is to identify and correct broken or outdated relative links in the following markdown content.

Context:
- File: {context['relative_to_root']}
- Directory: {context['directory']}

Instructions:
1. Analyze all relative links in the content
2. For each link, determine if it points to an existing file
3. If a link appears broken or outdated, suggest a corrected path
4. Common migrations to consider:
   - Files moved from root to docs/ directory
   - Files moved from docs/ to docs/specs/ or docs/changes/
   - Changes in file extensions or naming conventions
5. Preserve all external URLs, anchors, and email links unchanged
6. Only modify links that clearly need correction

Return ONLY the corrected markdown content without any additional explanation.

Content to analyze:
{content}"""

        # Call the Gemini CLI if available, otherwise fall back to a simple
        # pass-through
        try:
            result = subprocess.run(
                ['gemini', '--model', 'gemini-2.5-flash'],
                input=prompt,
                capture_output=True,
                text=True,
                timeout=30
            )
            if result.returncode == 0 and result.stdout.strip():
                return result.stdout.strip()
        except (subprocess.TimeoutExpired, FileNotFoundError):
            # Gemini not available or timed out - fall back to basic processing
            pass

    except Exception as e:
        print(f"Warning: LLM call failed: {e}", file=sys.stderr)

    # Fallback: return original content unchanged
    return content


def validate_corrected_links(original: str, corrected: str) -> Dict[str, int]:
    """
    Compare original and corrected content to count changes.

    Args:
        original: Original markdown content
        corrected: Corrected markdown content

    Returns:
        Dictionary with change statistics
    """
    original_links = extract_markdown_links(original)
    corrected_links = extract_markdown_links(corrected)

    original_paths = {link['path'] for link in original_links
                      if not should_skip_link(link['path'])}
    corrected_paths = {link['path'] for link in corrected_links
                       if not should_skip_link(link['path'])}

    changes = {
        'total_links': len(original_links),
        'skipped_links': len([link for link in original_links
                              if should_skip_link(link['path'])]),
        'corrected_links': len(original_paths - corrected_paths),
        'new_links': len(corrected_paths - original_paths)
    }

    return changes
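
# Illustrative example of the change-counting heuristic above: set differences
# treat paths that disappeared as "corrected" and newly appearing paths as
# their replacements. The input here is hypothetical (one relative link
# rewritten from SPEC.md to docs/specs/SPEC.md):
#
#   >>> validate_corrected_links("[spec](SPEC.md)", "[spec](docs/specs/SPEC.md)")
#   {'total_links': 1, 'skipped_links': 0, 'corrected_links': 1, 'new_links': 1}
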
def correct_links_in_content(content: str, file_path: str) -> Tuple[str, Dict]:
    """
    Correct links in markdown content using LLM.

    Args:
        content: Markdown content to process
        file_path: Path to the file being processed

    Returns:
        Tuple of (corrected_content, statistics)
    """
    # Extract links for analysis
    links = extract_markdown_links(content)

    # Filter for links that need processing
    processable_links = [link for link in links
                         if not should_skip_link(link['path'])]

    if not processable_links:
        # No links to process
        return content, {
            'total_links': len(links),
            'processable_links': 0,
            'corrected_links': 0,
            'llm_called': False
        }

    # Get file context
    context = get_file_context(file_path)

    # Call LLM for correction
    corrected_content = call_llm_for_link_correction(content, context)

    # Validate changes
    changes = validate_corrected_links(content, corrected_content)
    changes.update({
        'processable_links': len(processable_links),
        'llm_called': True
    })

    return corrected_content, changes


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(
        description='LLM-based markdown link correction',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Correct links in a file
  cat README.md | correct_links_llm.py --file README.md

  # Process multiple files (content must be piped on stdin for each file)
  find docs -name "*.md" -exec sh -c 'correct_links_llm.py --file "$1" < "$1"' _ {} \\;

  # Show statistics only
  cat file.md | correct_links_llm.py --file file.md --stats-only
"""
    )
    parser.add_argument(
        '--file',
        required=True,
        help='Path to the file being processed (required for context)'
    )
    parser.add_argument(
        '--stats-only',
        action='store_true',
        help="Only show statistics, don't output corrected content"
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Analyze without making changes (suppresses corrected output)'
    )

    args = parser.parse_args()

    try:
        # Read content from stdin
        content = sys.stdin.read()

        if not content.strip():
            print("Error: No content provided on stdin", file=sys.stderr)
            sys.exit(1)

        # Correct links
        corrected_content, stats = correct_links_in_content(content, args.file)

        # Output statistics
        if stats['llm_called']:
            print(f"Link correction statistics for {args.file}:", file=sys.stderr)
            print(f"  Total links: {stats['total_links']}", file=sys.stderr)
            print(f"  Processable links: {stats['processable_links']}", file=sys.stderr)
            print(f"  Corrected links: {stats['corrected_links']}", file=sys.stderr)
            print(f"  Skipped links: {stats['skipped_links']}", file=sys.stderr)
        else:
            print(f"No links to process in {args.file}", file=sys.stderr)

        # Output corrected content (unless stats-only or dry-run)
        if not (args.stats_only or args.dry_run):
            print(corrected_content)

    except KeyboardInterrupt:
        print("\nInterrupted by user", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()
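
# Example session (illustrative: the counts are hypothetical, and the `gemini`
# CLI is assumed to be on PATH; without it the content passes through
# unchanged while the statistics still print to stderr):
#
#   $ cat README.md | ./correct_links_llm.py --file README.md --stats-only
#   Link correction statistics for README.md:
#     Total links: 12
#     Processable links: 7
#     Corrected links: 2
#     Skipped links: 5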