Initial commit

Zhongwei Li committed 2025-11-29 18:01:30 +08:00
commit 9c0b92f025
39 changed files with 9512 additions and 0 deletions

correct_links_llm.py

@@ -0,0 +1,309 @@
#!/usr/bin/env python3
"""
LLM-based link correction for project-migrate skill
This script uses an LLM to intelligently identify and correct broken or outdated
links within markdown content during file migration.
"""

import sys
import re
import argparse
import subprocess
from pathlib import Path
from typing import List, Dict, Tuple


def extract_markdown_links(content: str) -> List[Dict]:
"""
Extract all markdown links from content and return structured information.
Args:
content: Markdown content to analyze
Returns:
List of dictionaries with link information
"""
links = []
# Pattern to match markdown links: [text](path) and ![alt](path)
pattern = r'!\[([^\]]*)\]\(([^)]+)\)|\[([^\]]*)\]\(([^)]+)\)'
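    # The image alternative comes first so '![alt](src)' is not parsed as a
    # regular link preceded by '!'. Groups 1-2 capture images, groups 3-4
    # capture regular links; exactly one pair is populated per match.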
for match in re.finditer(pattern, content):
alt_text, img_src, link_text, link_href = match.groups()
if img_src:
# Image link
links.append({
'type': 'image',
'alt': alt_text,
'path': img_src,
'full_match': match.group(0)
})
elif link_href:
# Regular link
links.append({
'type': 'link',
'text': link_text,
'path': link_href,
'full_match': match.group(0)
})
return links
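
# Example: extract_markdown_links('[docs](docs/a.md) ![logo](img/l.png)') yields
# [{'type': 'link', 'text': 'docs', 'path': 'docs/a.md', 'full_match': '[docs](docs/a.md)'},
#  {'type': 'image', 'alt': 'logo', 'path': 'img/l.png', 'full_match': '![logo](img/l.png)'}]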


def should_skip_link(link_path: str) -> bool:
"""
Determine if a link should be skipped (external URLs, anchors, etc.).
Args:
link_path: The path part of the link
Returns:
True if the link should be skipped
"""
# Skip absolute URLs
if link_path.startswith(('http://', 'https://', 'mailto:', 'ftp://', 'tel:')):
return True
# Skip anchor links
if link_path.startswith('#'):
return True
    # Skip bare email addresses (mailto: and URL schemes were already handled
    # above). Note: any path containing '@' is treated as an email and skipped.
    if '@' in link_path:
        return True
return False
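
# Example: should_skip_link('https://example.com') and should_skip_link('#intro')
# are True; should_skip_link('docs/guide.md') is False.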


def get_file_context(file_path: str) -> Dict:
"""
Get context about the file being processed.
Args:
file_path: Path to the file
Returns:
Dictionary with file context information
"""
path = Path(file_path)
    try:
        # Resolve the path first so relative inputs compare correctly
        # against the current working directory
        relative_to_root = str(path.resolve().relative_to(Path.cwd()))
except ValueError:
        # The file is not under the current working directory; fall back to
        # the path as given
        relative_to_root = str(path)
context = {
'file_path': str(path.absolute()),
'filename': path.name,
'directory': str(path.parent.absolute()),
'relative_to_root': relative_to_root,
}
return context
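
# Example: with cwd /repo, get_file_context('docs/guide.md') returns
# {'file_path': '/repo/docs/guide.md', 'filename': 'guide.md',
#  'directory': '/repo/docs', 'relative_to_root': 'docs/guide.md'}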


def call_llm_for_link_correction(content: str, context: Dict) -> str:
"""
Call LLM to perform intelligent link correction.
Args:
content: Original markdown content
context: File context information
Returns:
Corrected markdown content
"""
try:
# Prepare the prompt for the LLM
prompt = f"""You are a markdown link correction assistant. Your task is to identify and correct broken or outdated relative links in the following markdown content.
Context:
- File: {context['relative_to_root']}
- Directory: {context['directory']}
Instructions:
1. Analyze all relative links in the content
2. For each link, determine if it points to an existing file
3. If a link appears broken or outdated, suggest a corrected path
4. Common migrations to consider:
- Files moved from root to docs/ directory
- Files moved from docs/ to docs/specs/ or docs/changes/
- Changes in file extensions or naming conventions
5. Preserve all external URLs, anchors, and email links unchanged
6. Only modify links that clearly need correction
Return ONLY the corrected markdown content without any additional explanation.
Content to analyze:
{content}"""
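        # Note: the entire file is sent in a single prompt, so very large
        # files may exceed the model's context window or the subprocess
        # timeout below.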
        # Call the Gemini CLI if available; otherwise fall back to returning
        # the content unchanged. This assumes a `gemini` binary on PATH that
        # reads its prompt from stdin.
try:
result = subprocess.run(
['gemini', '--model', 'gemini-2.5-flash'],
input=prompt,
capture_output=True,
text=True,
timeout=30
)
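            # A nonzero exit status or an empty response falls through to the
            # unchanged-content fallback at the end of this function.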
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip()
except (subprocess.TimeoutExpired, FileNotFoundError):
            # Gemini CLI not available or timed out; fall back to returning
            # the content unchanged
pass
except Exception as e:
print(f"Warning: LLM call failed: {e}", file=sys.stderr)
# Fallback: return original content unchanged
return content


def validate_corrected_links(original: str, corrected: str) -> Dict[str, int]:
"""
Compare original and corrected content to count changes.
Args:
original: Original markdown content
corrected: Corrected markdown content
Returns:
Dictionary with change statistics
"""
original_links = extract_markdown_links(original)
corrected_links = extract_markdown_links(corrected)
original_paths = {link['path'] for link in original_links if not should_skip_link(link['path'])}
corrected_paths = {link['path'] for link in corrected_links if not should_skip_link(link['path'])}
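    # Set differences are only a heuristic: duplicate paths collapse, and a
    # link whose text changed but whose path did not is not counted.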
changes = {
'total_links': len(original_links),
'skipped_links': len([link for link in original_links if should_skip_link(link['path'])]),
'corrected_links': len(original_paths - corrected_paths),
'new_links': len(corrected_paths - original_paths)
}
return changes


def correct_links_in_content(content: str, file_path: str) -> Tuple[str, Dict]:
"""
Correct links in markdown content using LLM.
Args:
content: Markdown content to process
file_path: Path to the file being processed
Returns:
Tuple of (corrected_content, statistics)
"""
# Extract links for analysis
links = extract_markdown_links(content)
# Filter for links that need processing
processable_links = [link for link in links if not should_skip_link(link['path'])]
if not processable_links:
# No links to process
return content, {
'total_links': len(links),
'processable_links': 0,
'corrected_links': 0,
'llm_called': False
}
# Get file context
context = get_file_context(file_path)
# Call LLM for correction
corrected_content = call_llm_for_link_correction(content, context)
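    # If the LLM call failed or the CLI was unavailable, corrected_content is
    # identical to content, so the change counts below will all be zero.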
# Validate changes
changes = validate_corrected_links(content, corrected_content)
changes.update({
'processable_links': len(processable_links),
'llm_called': True
})
return corrected_content, changes
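
# Note: 'skipped_links' and 'new_links' appear in the statistics only when the
# LLM path ran; check 'llm_called' before reading them, as main() does below.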


def main():
"""Main entry point."""
parser = argparse.ArgumentParser(
description='LLM-based markdown link correction',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Correct links in a file
cat README.md | correct_links_llm.py --file README.md
  # Process multiple files (the script reads each file's content on stdin)
  find docs -name "*.md" -exec sh -c 'correct_links_llm.py --file "$1" < "$1"' _ {} \\;
# Show statistics only
cat file.md | correct_links_llm.py --file file.md --stats-only
"""
)
parser.add_argument(
'--file',
required=True,
help='Path to the file being processed (required for context)'
)
parser.add_argument(
'--stats-only',
action='store_true',
help='Only show statistics, don\'t output corrected content'
)
parser.add_argument(
'--dry-run',
action='store_true',
help='Analyze without making changes'
)
args = parser.parse_args()
try:
# Read content from stdin
content = sys.stdin.read()
if not content.strip():
print("Error: No content provided on stdin", file=sys.stderr)
sys.exit(1)
        if args.dry_run:
            # Analyze only: report link counts without calling the LLM or
            # emitting corrected content
            links = extract_markdown_links(content)
            processable = [l for l in links if not should_skip_link(l['path'])]
            print(f"{args.file}: {len(links)} links found, "
                  f"{len(processable)} candidates for correction", file=sys.stderr)
            sys.exit(0)
        # Correct links with the LLM
        corrected_content, stats = correct_links_in_content(content, args.file)
# Output statistics
if stats['llm_called']:
print(f"Link correction statistics for {args.file}:", file=sys.stderr)
print(f" Total links: {stats['total_links']}", file=sys.stderr)
print(f" Processable links: {stats['processable_links']}", file=sys.stderr)
print(f" Corrected links: {stats['corrected_links']}", file=sys.stderr)
print(f" Skipped links: {stats['skipped_links']}", file=sys.stderr)
else:
print(f"No links to process in {args.file}", file=sys.stderr)
# Output corrected content (unless stats-only)
if not args.stats_only:
print(corrected_content)
except KeyboardInterrupt:
print("\nInterrupted by user", file=sys.stderr)
sys.exit(1)
except Exception as e:
print(f"Error: {e}", file=sys.stderr)
sys.exit(1)


if __name__ == '__main__':
main()