Initial commit

Zhongwei Li committed 2025-11-29 18:01:30 +08:00
commit 9c0b92f025
39 changed files with 9512 additions and 0 deletions

correct_links_llm.py

@@ -0,0 +1,309 @@
#!/usr/bin/env python3
"""
LLM-based link correction for project-migrate skill
This script uses an LLM to intelligently identify and correct broken or outdated
links within markdown content during file migration.
"""

import sys
import re
import argparse
import subprocess
from pathlib import Path
from typing import List, Dict, Tuple


def extract_markdown_links(content: str) -> List[Dict]:
"""
Extract all markdown links from content and return structured information.
Args:
content: Markdown content to analyze
Returns:
List of dictionaries with link information
"""
links = []
# Pattern to match markdown links: [text](path) and ![alt](path)
pattern = r'!\[([^\]]*)\]\(([^)]+)\)|\[([^\]]*)\]\(([^)]+)\)'
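    # The image alternative comes first so '![alt](src)' is not parsed as a
    # regular link preceded by '!'. Groups 1-2 capture images, groups 3-4
    # capture regular links; exactly one pair is populated per match.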
for match in re.finditer(pattern, content):
alt_text, img_src, link_text, link_href = match.groups()
if img_src:
# Image link
links.append({
'type': 'image',
'alt': alt_text,
'path': img_src,
'full_match': match.group(0)
})
elif link_href:
# Regular link
links.append({
'type': 'link',
'text': link_text,
'path': link_href,
'full_match': match.group(0)
})
return links
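
# Example: extract_markdown_links('[docs](docs/a.md) ![logo](img/l.png)') yields
# [{'type': 'link', 'text': 'docs', 'path': 'docs/a.md', 'full_match': '[docs](docs/a.md)'},
#  {'type': 'image', 'alt': 'logo', 'path': 'img/l.png', 'full_match': '![logo](img/l.png)'}]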


def should_skip_link(link_path: str) -> bool:
"""
Determine if a link should be skipped (external URLs, anchors, etc.).
Args:
link_path: The path part of the link
Returns:
True if the link should be skipped
"""
# Skip absolute URLs
if link_path.startswith(('http://', 'https://', 'mailto:', 'ftp://', 'tel:')):
return True
# Skip anchor links
if link_path.startswith('#'):
return True
    # Skip bare email addresses (mailto: and URL schemes were already handled
    # above). Note: any path containing '@' is treated as an email and skipped.
    if '@' in link_path:
        return True
return False
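
# Example: should_skip_link('https://example.com') and should_skip_link('#intro')
# are True; should_skip_link('docs/guide.md') is False.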


def get_file_context(file_path: str) -> Dict:
"""
Get context about the file being processed.
Args:
file_path: Path to the file
Returns:
Dictionary with file context information
"""
path = Path(file_path)
    try:
        # Resolve the path first so relative inputs compare correctly
        # against the current working directory
        relative_to_root = str(path.resolve().relative_to(Path.cwd()))
except ValueError:
        # The file is not under the current working directory; fall back to
        # the path as given
        relative_to_root = str(path)
context = {
'file_path': str(path.absolute()),
'filename': path.name,
'directory': str(path.parent.absolute()),
'relative_to_root': relative_to_root,
}
return context
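
# Example: with cwd /repo, get_file_context('docs/guide.md') returns
# {'file_path': '/repo/docs/guide.md', 'filename': 'guide.md',
#  'directory': '/repo/docs', 'relative_to_root': 'docs/guide.md'}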


def call_llm_for_link_correction(content: str, context: Dict) -> str:
"""
Call LLM to perform intelligent link correction.
Args:
content: Original markdown content
context: File context information
Returns:
Corrected markdown content
"""
try:
# Prepare the prompt for the LLM
prompt = f"""You are a markdown link correction assistant. Your task is to identify and correct broken or outdated relative links in the following markdown content.
Context:
- File: {context['relative_to_root']}
- Directory: {context['directory']}
Instructions:
1. Analyze all relative links in the content
2. For each link, determine if it points to an existing file
3. If a link appears broken or outdated, suggest a corrected path
4. Common migrations to consider:
- Files moved from root to docs/ directory
- Files moved from docs/ to docs/specs/ or docs/changes/
- Changes in file extensions or naming conventions
5. Preserve all external URLs, anchors, and email links unchanged
6. Only modify links that clearly need correction
Return ONLY the corrected markdown content without any additional explanation.
Content to analyze:
{content}"""
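        # Note: the entire file is sent in a single prompt, so very large
        # files may exceed the model's context window or the subprocess
        # timeout below.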
        # Call the Gemini CLI if available; otherwise fall back to returning
        # the content unchanged. This assumes a `gemini` binary on PATH that
        # reads its prompt from stdin.
try:
result = subprocess.run(
['gemini', '--model', 'gemini-2.5-flash'],
input=prompt,
capture_output=True,
text=True,
timeout=30
)
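            # A nonzero exit status or an empty response falls through to the
            # unchanged-content fallback at the end of this function.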
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip()
except (subprocess.TimeoutExpired, FileNotFoundError):
            # Gemini CLI not available or timed out; fall back to returning
            # the content unchanged
pass
except Exception as e:
print(f"Warning: LLM call failed: {e}", file=sys.stderr)
# Fallback: return original content unchanged
return content


def validate_corrected_links(original: str, corrected: str) -> Dict[str, int]:
"""
Compare original and corrected content to count changes.
Args:
original: Original markdown content
corrected: Corrected markdown content
Returns:
Dictionary with change statistics
"""
original_links = extract_markdown_links(original)
corrected_links = extract_markdown_links(corrected)
original_paths = {link['path'] for link in original_links if not should_skip_link(link['path'])}
corrected_paths = {link['path'] for link in corrected_links if not should_skip_link(link['path'])}
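    # Set differences are only a heuristic: duplicate paths collapse, and a
    # link whose text changed but whose path did not is not counted.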
changes = {
'total_links': len(original_links),
'skipped_links': len([link for link in original_links if should_skip_link(link['path'])]),
'corrected_links': len(original_paths - corrected_paths),
'new_links': len(corrected_paths - original_paths)
}
return changes


def correct_links_in_content(content: str, file_path: str) -> Tuple[str, Dict]:
"""
Correct links in markdown content using LLM.
Args:
content: Markdown content to process
file_path: Path to the file being processed
Returns:
Tuple of (corrected_content, statistics)
"""
# Extract links for analysis
links = extract_markdown_links(content)
# Filter for links that need processing
processable_links = [link for link in links if not should_skip_link(link['path'])]
if not processable_links:
# No links to process
return content, {
'total_links': len(links),
'processable_links': 0,
'corrected_links': 0,
'llm_called': False
}
# Get file context
context = get_file_context(file_path)
# Call LLM for correction
corrected_content = call_llm_for_link_correction(content, context)
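    # If the LLM call failed or the CLI was unavailable, corrected_content is
    # identical to content, so the change counts below will all be zero.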
# Validate changes
changes = validate_corrected_links(content, corrected_content)
changes.update({
'processable_links': len(processable_links),
'llm_called': True
})
return corrected_content, changes
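
# Note: 'skipped_links' and 'new_links' appear in the statistics only when the
# LLM path ran; check 'llm_called' before reading them, as main() does below.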


def main():
"""Main entry point."""
parser = argparse.ArgumentParser(
description='LLM-based markdown link correction',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Correct links in a file
cat README.md | correct_links_llm.py --file README.md
  # Process multiple files (the script reads each file's content on stdin)
  find docs -name "*.md" -exec sh -c 'correct_links_llm.py --file "$1" < "$1"' _ {} \\;
# Show statistics only
cat file.md | correct_links_llm.py --file file.md --stats-only
"""
)
parser.add_argument(
'--file',
required=True,
help='Path to the file being processed (required for context)'
)
parser.add_argument(
'--stats-only',
action='store_true',
help='Only show statistics, don\'t output corrected content'
)
parser.add_argument(
'--dry-run',
action='store_true',
help='Analyze without making changes'
)
args = parser.parse_args()
try:
# Read content from stdin
content = sys.stdin.read()
if not content.strip():
print("Error: No content provided on stdin", file=sys.stderr)
sys.exit(1)
        if args.dry_run:
            # Analyze only: report link counts without calling the LLM or
            # emitting corrected content
            links = extract_markdown_links(content)
            processable = [l for l in links if not should_skip_link(l['path'])]
            print(f"{args.file}: {len(links)} links found, "
                  f"{len(processable)} candidates for correction", file=sys.stderr)
            sys.exit(0)
        # Correct links with the LLM
        corrected_content, stats = correct_links_in_content(content, args.file)
# Output statistics
if stats['llm_called']:
print(f"Link correction statistics for {args.file}:", file=sys.stderr)
print(f" Total links: {stats['total_links']}", file=sys.stderr)
print(f" Processable links: {stats['processable_links']}", file=sys.stderr)
print(f" Corrected links: {stats['corrected_links']}", file=sys.stderr)
print(f" Skipped links: {stats['skipped_links']}", file=sys.stderr)
else:
print(f"No links to process in {args.file}", file=sys.stderr)
# Output corrected content (unless stats-only)
if not args.stats_only:
print(corrected_content)
except KeyboardInterrupt:
print("\nInterrupted by user", file=sys.stderr)
sys.exit(1)
except Exception as e:
print(f"Error: {e}", file=sys.stderr)
sys.exit(1)


if __name__ == '__main__':
main()