#!/usr/bin/env python3
"""
docs_link_lint.py - Implementation of the docs.lint.links Skill.
Validates Markdown links to detect broken internal or external links.
"""
import json
import os
import re
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError
# Ensure project root on path for betty imports when executed directly
# (assumes this file lives at <root>/skills/docs.lint.links/)
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from betty.errors import BettyError  # noqa: E402
from betty.logging_utils import setup_logger  # noqa: E402
logger = setup_logger(__name__)
# Regex patterns for finding links in markdown
# Matches [text](url) format
MARKDOWN_LINK_PATTERN = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
# Matches <url> format
ANGLE_LINK_PATTERN = re.compile(r'<(https?://[^>]+)>')
# Matches reference-style links [text][ref]
REFERENCE_LINK_PATTERN = re.compile(r'\[([^\]]+)\]\[([^\]]*)\]')
# Matches reference definitions [ref]: url
REFERENCE_DEF_PATTERN = re.compile(r'^\[([^\]]+)\]:\s+(.+)$', re.MULTILINE)
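# Illustrative matches (these regexes are deliberately simple heuristics,
# not a full CommonMark parser):
#   [guide](docs/guide.md)  -> MARKDOWN_LINK_PATTERN
#   <https://example.com>   -> ANGLE_LINK_PATTERN
#   [guide][ref]            -> REFERENCE_LINK_PATTERN
#   [ref]: docs/guide.md    -> REFERENCE_DEF_PATTERN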
class LinkIssue:
"""Represents a broken or problematic link."""
def __init__(
self,
file: str,
line: int,
link: str,
issue_type: str,
message: str,
suggested_fix: Optional[str] = None
):
self.file = file
self.line = line
self.link = link
self.issue_type = issue_type
self.message = message
self.suggested_fix = suggested_fix
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for JSON output."""
result = {
"file": self.file,
"line": self.line,
"link": self.link,
"issue_type": self.issue_type,
"message": self.message
}
if self.suggested_fix:
result["suggested_fix"] = self.suggested_fix
return result
def find_markdown_files(root_dir: str, exclude_patterns: Optional[List[str]] = None) -> List[Path]:
"""
Find all .md files in the directory tree.
Args:
root_dir: Root directory to search
exclude_patterns: List of path patterns to exclude (e.g., 'node_modules', '.git')
Returns:
List of Path objects for markdown files
"""
exclude_patterns = exclude_patterns or ['.git', 'node_modules', '.venv', 'venv', '__pycache__']
md_files = []
root_path = Path(root_dir).resolve()
for path in root_path.rglob('*.md'):
# Skip excluded directories
if any(excluded in path.parts for excluded in exclude_patterns):
continue
md_files.append(path)
logger.info(f"Found {len(md_files)} markdown files")
return md_files
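# A minimal sketch (directory names illustrative). Note that passing
# exclude_patterns replaces the default list, so re-list anything you
# still want skipped:
#   find_markdown_files('docs/', exclude_patterns=['.git', 'drafts'])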
def is_in_code_block(line: str) -> bool:
    """
    Check if a line contains inline code that might yield false-positive links.

    Note: extract_links_from_markdown() strips balanced inline code spans
    with a regex before matching, so this helper is currently a no-op kept
    for reference. A stricter heuristic could treat an odd backtick count
    as an unterminated inline code span.

    Args:
        line: Line to check
    Returns:
        True if the line should be skipped for link extraction
    """
    return False
def extract_links_from_markdown(content: str) -> List[Tuple[int, str, str]]:
"""
Extract all links from markdown content.
Args:
content: Markdown file content
Returns:
List of tuples: (line_number, link_text, link_url)
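    Example (illustrative doctest):
        >>> extract_links_from_markdown("See [guide](docs/guide.md).")
        [(1, 'guide', 'docs/guide.md')]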
"""
lines = content.split('\n')
links = []
# First, extract reference definitions
references = {}
for match in REFERENCE_DEF_PATTERN.finditer(content):
ref_name = match.group(1).lower()
ref_url = match.group(2).strip()
references[ref_name] = ref_url
# Track if we're in a code block
in_code_block = False
# Process each line
for line_num, line in enumerate(lines, start=1):
# Check for code block delimiters
if line.strip().startswith('```'):
in_code_block = not in_code_block
continue
# Skip lines inside code blocks
if in_code_block:
continue
        # Strip inline code spans before matching so code examples such as
        # `[x](y)` do not produce false-positive links
        processed_line = re.sub(r'`[^`]+`', '', line)
        # Find standard markdown links [text](url); finditer matches come
        # from the stripped line, so no extra position check is needed
        for match in MARKDOWN_LINK_PATTERN.finditer(processed_line):
            text = match.group(1)
            url = match.group(2)
            links.append((line_num, text, url))
# Find angle bracket links <url>
for match in ANGLE_LINK_PATTERN.finditer(processed_line):
url = match.group(1)
links.append((line_num, url, url))
# Find reference-style links [text][ref] or [text][]
for match in REFERENCE_LINK_PATTERN.finditer(processed_line):
text = match.group(1)
ref = match.group(2) if match.group(2) else text
ref_lower = ref.lower()
if ref_lower in references:
url = references[ref_lower]
links.append((line_num, text, url))
return links
def is_external_link(url: str) -> bool:
"""Check if a URL is external (http/https)."""
return url.startswith('http://') or url.startswith('https://')
def check_external_link(url: str, timeout: int = 10) -> Optional[str]:
"""
Check if an external URL is accessible.
Args:
url: URL to check
timeout: Timeout in seconds
Returns:
Error message if link is broken, None if OK
"""
try:
# Create request with a user agent to avoid 403s from some sites
req = Request(
url,
headers={
'User-Agent': 'Betty/1.0 (Link Checker)',
'Accept': '*/*'
}
)
with urlopen(req, timeout=timeout) as response:
if response.status >= 400:
return f"HTTP {response.status}"
return None
except HTTPError as e:
return f"HTTP {e.code}"
except URLError as e:
return f"URL Error: {e.reason}"
except Exception as e:
return f"Error: {str(e)}"
def resolve_relative_path(md_file_path: Path, relative_url: str) -> Path:
"""
Resolve a relative URL from a markdown file.
Args:
md_file_path: Path to the markdown file containing the link
relative_url: Relative URL/path from the link
Returns:
Resolved absolute path
"""
# Remove anchor/hash fragment
url_without_anchor = relative_url.split('#')[0]
if not url_without_anchor:
# Just an anchor to current file
return md_file_path
# Resolve relative to the markdown file's directory
base_dir = md_file_path.parent
resolved = (base_dir / url_without_anchor).resolve()
return resolved
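# For example, the link "../images/logo.png" inside "docs/guide/intro.md"
# resolves to "<root>/docs/images/logo.png" (illustrative paths), and a bare
# anchor like "#usage" resolves back to the containing file itself.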
def check_internal_link(
md_file_path: Path,
relative_url: str,
root_dir: Path
) -> Tuple[Optional[str], Optional[str]]:
"""
Check if an internal link is valid.
Args:
md_file_path: Path to the markdown file containing the link
relative_url: Relative URL from the link
        root_dir: Repository root directory (currently unused; links are
            resolved relative to the containing file)
Returns:
Tuple of (error_message, suggested_fix)
"""
# Remove query string and anchor
clean_url = relative_url.split('?')[0].split('#')[0]
if not clean_url:
# Just an anchor or query, assume valid
return None, None
resolved = resolve_relative_path(md_file_path, clean_url)
# Check if file exists
if resolved.exists():
return None, None
# File doesn't exist - try to suggest fixes
error_msg = f"File not found: {relative_url}"
suggested_fix = None
# Try case-insensitive match
if resolved.parent.exists():
for file in resolved.parent.iterdir():
if file.name.lower() == resolved.name.lower():
relative_to_md = os.path.relpath(file, md_file_path.parent)
suggested_fix = relative_to_md
error_msg += f" (found case mismatch: {file.name})"
break
# Try without .md extension if it has one
if not suggested_fix and clean_url.endswith('.md'):
url_without_ext = clean_url[:-3]
resolved_without_ext = resolve_relative_path(md_file_path, url_without_ext)
if resolved_without_ext.exists():
relative_to_md = os.path.relpath(resolved_without_ext, md_file_path.parent)
suggested_fix = relative_to_md
error_msg += f" (file exists without .md extension)"
# Try adding .md extension if it doesn't have one
if not suggested_fix and not clean_url.endswith('.md'):
url_with_ext = clean_url + '.md'
resolved_with_ext = resolve_relative_path(md_file_path, url_with_ext)
if resolved_with_ext.exists():
relative_to_md = os.path.relpath(resolved_with_ext, md_file_path.parent)
suggested_fix = relative_to_md
error_msg += f" (file exists with .md extension)"
return error_msg, suggested_fix
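# check_internal_link() returns (None, None) for a valid target; for a broken
# one it returns an error plus an optional corrected path, e.g. (illustrative):
#   ("File not found: Guide.md (found case mismatch: guide.md)", "guide.md")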
def lint_markdown_file(
md_file: Path,
root_dir: Path,
check_external: bool = True,
external_timeout: int = 10
) -> List[LinkIssue]:
"""
Lint a single markdown file for broken links.
Args:
md_file: Path to markdown file
root_dir: Repository root directory
check_external: Whether to check external links
external_timeout: Timeout for external link checks
Returns:
List of LinkIssue objects
"""
issues = []
try:
content = md_file.read_text(encoding='utf-8')
except Exception as e:
logger.warning(f"Could not read {md_file}: {e}")
return issues
links = extract_links_from_markdown(content)
for line_num, link_text, url in links:
# Skip empty URLs
if not url or url.strip() == '':
continue
# Skip mailto and other special schemes
if url.startswith('mailto:') or url.startswith('tel:'):
continue
relative_path = os.path.relpath(md_file, root_dir)
if is_external_link(url):
if check_external:
logger.debug(f"Checking external link: {url}")
error = check_external_link(url, timeout=external_timeout)
if error:
issues.append(LinkIssue(
file=relative_path,
line=line_num,
link=url,
issue_type="external_broken",
message=f"External link is broken: {error}"
))
else:
# Internal link
logger.debug(f"Checking internal link: {url}")
error, suggested_fix = check_internal_link(md_file, url, root_dir)
if error:
issues.append(LinkIssue(
file=relative_path,
line=line_num,
link=url,
issue_type="internal_broken",
message=error,
suggested_fix=suggested_fix
))
return issues
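# Per-file use (a minimal sketch; paths illustrative), skipping the network:
#   issues = lint_markdown_file(Path('docs/guide.md'), Path('.').resolve(),
#                               check_external=False)
#   report = [i.to_dict() for i in issues]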
def autofix_markdown_file(
md_file: Path,
root_dir: Path
) -> Tuple[int, List[str]]:
"""
Automatically fix common link issues in a markdown file.
Args:
md_file: Path to markdown file
root_dir: Repository root directory
Returns:
Tuple of (number_of_fixes, list_of_fix_descriptions)
"""
try:
content = md_file.read_text(encoding='utf-8')
except Exception as e:
logger.warning(f"Could not read {md_file}: {e}")
return 0, []
original_content = content
links = extract_links_from_markdown(content)
fixes = []
fix_count = 0
for line_num, link_text, url in links:
if is_external_link(url):
continue
# Check if internal link is broken
error, suggested_fix = check_internal_link(md_file, url, root_dir)
if error and suggested_fix:
# Apply the fix
# Preserve any anchor/hash
anchor = ''
if '#' in url:
anchor = '#' + url.split('#', 1)[1]
new_url = suggested_fix + anchor
            # Replace every inline occurrence of this link target
            # (reference-style definitions are not rewritten)
            content = content.replace(f']({url})', f']({new_url})')
fix_count += 1
fixes.append(f"Line {line_num}: {url} -> {new_url}")
    # Write back only if the content actually changed
    if fix_count > 0 and content != original_content:
try:
md_file.write_text(content, encoding='utf-8')
logger.info(f"Applied {fix_count} fixes to {md_file}")
except Exception as e:
logger.error(f"Could not write fixes to {md_file}: {e}")
return 0, []
return fix_count, fixes
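# Note: autofix only rewrites inline links for which check_internal_link()
# suggested a target (case mismatches, missing or extra .md extensions);
# external links and unresolvable paths are left untouched.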
def lint_all_markdown(
root_dir: str,
check_external: bool = True,
autofix: bool = False,
external_timeout: int = 10,
exclude_patterns: Optional[List[str]] = None
) -> Dict[str, Any]:
"""
Lint all markdown files in a directory.
Args:
root_dir: Root directory to search
check_external: Whether to check external links (can be slow)
autofix: Whether to automatically fix common issues
external_timeout: Timeout for external link checks
exclude_patterns: Patterns to exclude from search
Returns:
Result dictionary with issues and statistics
"""
root_path = Path(root_dir).resolve()
md_files = find_markdown_files(root_dir, exclude_patterns)
all_issues = []
all_fixes = []
files_checked = 0
files_with_issues = 0
total_fixes = 0
for md_file in md_files:
files_checked += 1
if autofix:
fix_count, fixes = autofix_markdown_file(md_file, root_path)
total_fixes += fix_count
if fixes:
relative_path = os.path.relpath(md_file, root_path)
all_fixes.append({
"file": relative_path,
"fixes": fixes
})
# Check for issues (after autofix if enabled)
issues = lint_markdown_file(
md_file,
root_path,
check_external=check_external,
external_timeout=external_timeout
)
if issues:
files_with_issues += 1
all_issues.extend(issues)
result = {
"status": "success",
"summary": {
"files_checked": files_checked,
"files_with_issues": files_with_issues,
"total_issues": len(all_issues),
"autofix_enabled": autofix,
"total_fixes_applied": total_fixes
},
"issues": [issue.to_dict() for issue in all_issues]
}
if autofix and all_fixes:
result["fixes"] = all_fixes
return result
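# Programmatic use (a minimal sketch; the docs/ directory is illustrative):
#   result = lint_all_markdown('docs/', check_external=False)
#   for issue in result['issues']:
#       print(f"{issue['file']}:{issue['line']} {issue['link']}")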
def main(argv: Optional[List[str]] = None) -> int:
"""Entry point for CLI execution."""
import argparse
parser = argparse.ArgumentParser(
description="Lint Markdown files to detect broken internal or external links"
)
parser.add_argument(
"root_dir",
nargs='?',
default='.',
help="Root directory to search for Markdown files (default: current directory)"
)
parser.add_argument(
"--no-external",
action="store_true",
help="Skip checking external links (faster)"
)
parser.add_argument(
"--autofix",
action="store_true",
help="Automatically fix common issues (case, .md extension)"
)
parser.add_argument(
"--timeout",
type=int,
default=10,
help="Timeout for external link checks in seconds (default: 10)"
)
parser.add_argument(
"--exclude",
type=str,
help="Comma-separated list of patterns to exclude (e.g., 'node_modules,.git')"
)
parser.add_argument(
"--output",
type=str,
choices=['json', 'text'],
default='json',
help="Output format (default: json)"
)
args = parser.parse_args(argv)
exclude_patterns = None
if args.exclude:
exclude_patterns = [p.strip() for p in args.exclude.split(',')]
try:
result = lint_all_markdown(
root_dir=args.root_dir,
check_external=not args.no_external,
autofix=args.autofix,
external_timeout=args.timeout,
exclude_patterns=exclude_patterns
)
if args.output == 'json':
print(json.dumps(result, indent=2))
else:
# Text output
summary = result['summary']
print(f"Markdown Link Lint Results")
print(f"=" * 50)
print(f"Files checked: {summary['files_checked']}")
print(f"Files with issues: {summary['files_with_issues']}")
print(f"Total issues: {summary['total_issues']}")
if summary['autofix_enabled']:
print(f"Fixes applied: {summary['total_fixes_applied']}")
if result['issues']:
print(f"\nIssues found:")
print(f"-" * 50)
for issue in result['issues']:
print(f"\n{issue['file']}:{issue['line']}")
print(f" Link: {issue['link']}")
print(f" Issue: {issue['message']}")
if issue.get('suggested_fix'):
print(f" Suggested fix: {issue['suggested_fix']}")
else:
print("\n✓ No issues found!")
# Return non-zero if issues found
return 1 if result['issues'] else 0
except BettyError as e:
logger.error(f"Linting failed: {e}")
result = {
"status": "error",
"error": str(e)
}
print(json.dumps(result, indent=2))
return 1
except Exception as e:
logger.exception("Unexpected error during linting")
result = {
"status": "error",
"error": str(e)
}
print(json.dumps(result, indent=2))
return 1
if __name__ == "__main__":
sys.exit(main())