#!/usr/bin/env python3
"""
docs_link_lint.py - Implementation of the docs.lint.links Skill.

Validates Markdown links to detect broken internal or external links.
"""

import json
import os
import re
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError

# Ensure project root on path for betty imports when executed directly.
# Assumes this file lives one directory below the project root; adjust if moved.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from betty.errors import BettyError  # noqa: E402
from betty.logging_utils import setup_logger  # noqa: E402

logger = setup_logger(__name__)

# Regex patterns for finding links in markdown
# Matches [text](url) format
MARKDOWN_LINK_PATTERN = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
# Matches autolinks in <https://...> format
ANGLE_LINK_PATTERN = re.compile(r'<(https?://[^>]+)>')
# Matches reference-style links [text][ref]
REFERENCE_LINK_PATTERN = re.compile(r'\[([^\]]+)\]\[([^\]]*)\]')
# Matches reference definitions [ref]: url
REFERENCE_DEF_PATTERN = re.compile(r'^\[([^\]]+)\]:\s+(.+)$', re.MULTILINE)


class LinkIssue:
    """Represents a broken or problematic link."""

    def __init__(
        self,
        file: str,
        line: int,
        link: str,
        issue_type: str,
        message: str,
        suggested_fix: Optional[str] = None
    ):
        self.file = file
        self.line = line
        self.link = link
        self.issue_type = issue_type
        self.message = message
        self.suggested_fix = suggested_fix

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON output."""
        result = {
            "file": self.file,
            "line": self.line,
            "link": self.link,
            "issue_type": self.issue_type,
            "message": self.message
        }
        if self.suggested_fix:
            result["suggested_fix"] = self.suggested_fix
        return result


def find_markdown_files(root_dir: str, exclude_patterns: Optional[List[str]] = None) -> List[Path]:
    """
    Find all .md files in the directory tree.

    Args:
        root_dir: Root directory to search
        exclude_patterns: List of path patterns to exclude (e.g., 'node_modules', '.git')

    Returns:
        List of Path objects for markdown files
    """
    exclude_patterns = exclude_patterns or ['.git', 'node_modules', '.venv', 'venv', '__pycache__']
    md_files = []
    root_path = Path(root_dir).resolve()

    for path in root_path.rglob('*.md'):
        # Skip excluded directories
        if any(excluded in path.parts for excluded in exclude_patterns):
            continue
        md_files.append(path)

    logger.info(f"Found {len(md_files)} markdown files")
    return md_files


def is_in_code_block(line: str) -> bool:
    """
    Check if a line contains inline code that might yield false-positive links.

    Note: this helper is currently unused. extract_links_from_markdown strips
    inline code spans with a regex instead, so this always returns False.

    Args:
        line: Line to check

    Returns:
        True if the line should be skipped for link extraction
    """
    return False
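

# Illustrative sketch of what the module-level patterns above match. The
# sample string and this helper are hypothetical additions for clarity; the
# linter itself never calls this function.
def _demo_link_patterns() -> None:
    sample = "See [intro](docs/intro.md), <https://example.com>, and [guide][g]."
    # [text](url) links -> [('intro', 'docs/intro.md')]
    print(MARKDOWN_LINK_PATTERN.findall(sample))
    # <url> autolinks -> ['https://example.com']
    print(ANGLE_LINK_PATTERN.findall(sample))
    # [text][ref] reference links -> [('guide', 'g')]
    print(REFERENCE_LINK_PATTERN.findall(sample))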


def extract_links_from_markdown(content: str) -> List[Tuple[int, str, str]]:
    """
    Extract all links from markdown content.

    Args:
        content: Markdown file content

    Returns:
        List of tuples: (line_number, link_text, link_url)
    """
    lines = content.split('\n')
    links = []

    # First, extract reference definitions
    references = {}
    for match in REFERENCE_DEF_PATTERN.finditer(content):
        ref_name = match.group(1).lower()
        ref_url = match.group(2).strip()
        references[ref_name] = ref_url

    # Track if we're in a code block
    in_code_block = False

    # Process each line
    for line_num, line in enumerate(lines, start=1):
        # Check for code block delimiters
        if line.strip().startswith('```'):
            in_code_block = not in_code_block
            continue

        # Skip lines inside code blocks
        if in_code_block:
            continue

        # Remove inline code spans from the line before processing.
        # This prevents false positives from code examples.
        processed_line = re.sub(r'`[^`]+`', '', line)

        # Find standard markdown links [text](url)
        for match in MARKDOWN_LINK_PATTERN.finditer(processed_line):
            text = match.group(1)
            url = match.group(2)
            links.append((line_num, text, url))

        # Find angle bracket links
        for match in ANGLE_LINK_PATTERN.finditer(processed_line):
            url = match.group(1)
            links.append((line_num, url, url))

        # Find reference-style links [text][ref] or [text][]
        for match in REFERENCE_LINK_PATTERN.finditer(processed_line):
            text = match.group(1)
            ref = match.group(2) if match.group(2) else text
            ref_lower = ref.lower()
            if ref_lower in references:
                url = references[ref_lower]
                links.append((line_num, text, url))

    return links


def is_external_link(url: str) -> bool:
    """Check if a URL is external (http/https)."""
    return url.startswith('http://') or url.startswith('https://')


def check_external_link(url: str, timeout: int = 10) -> Optional[str]:
    """
    Check if an external URL is accessible.

    Args:
        url: URL to check
        timeout: Timeout in seconds

    Returns:
        Error message if link is broken, None if OK
    """
    try:
        # Create request with a user agent to avoid 403s from some sites
        req = Request(
            url,
            headers={
                'User-Agent': 'Betty/1.0 (Link Checker)',
                'Accept': '*/*'
            }
        )
        # urlopen raises HTTPError for most 4xx/5xx responses; the explicit
        # status check below is a fallback.
        with urlopen(req, timeout=timeout) as response:
            if response.status >= 400:
                return f"HTTP {response.status}"
        return None
    except HTTPError as e:
        return f"HTTP {e.code}"
    except URLError as e:
        return f"URL Error: {e.reason}"
    except Exception as e:
        return f"Error: {str(e)}"


def resolve_relative_path(md_file_path: Path, relative_url: str) -> Path:
    """
    Resolve a relative URL from a markdown file.

    Args:
        md_file_path: Path to the markdown file containing the link
        relative_url: Relative URL/path from the link

    Returns:
        Resolved absolute path
    """
    # Remove anchor/hash fragment
    url_without_anchor = relative_url.split('#')[0]

    if not url_without_anchor:
        # Just an anchor to the current file
        return md_file_path

    # Resolve relative to the markdown file's directory
    base_dir = md_file_path.parent
    resolved = (base_dir / url_without_anchor).resolve()
    return resolved
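

# Behavior sketch for resolve_relative_path (the paths here are hypothetical):
#
#   resolve_relative_path(Path("/repo/docs/guide.md"), "../README.md")
#       -> Path("/repo/README.md")
#   resolve_relative_path(Path("/repo/docs/guide.md"), "#section")
#       -> Path("/repo/docs/guide.md")   # a bare anchor points at the file itself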


def check_internal_link(
    md_file_path: Path,
    relative_url: str,
    root_dir: Path
) -> Tuple[Optional[str], Optional[str]]:
    """
    Check if an internal link is valid.

    Args:
        md_file_path: Path to the markdown file containing the link
        relative_url: Relative URL from the link
        root_dir: Repository root directory

    Returns:
        Tuple of (error_message, suggested_fix)
    """
    # Remove query string and anchor
    clean_url = relative_url.split('?')[0].split('#')[0]

    if not clean_url:
        # Just an anchor or query, assume valid
        return None, None

    resolved = resolve_relative_path(md_file_path, clean_url)

    # Check if file exists
    if resolved.exists():
        return None, None

    # File doesn't exist - try to suggest fixes
    error_msg = f"File not found: {relative_url}"
    suggested_fix = None

    # Try case-insensitive match
    if resolved.parent.exists():
        for file in resolved.parent.iterdir():
            if file.name.lower() == resolved.name.lower():
                relative_to_md = os.path.relpath(file, md_file_path.parent)
                suggested_fix = relative_to_md
                error_msg += f" (found case mismatch: {file.name})"
                break

    # Try without .md extension if it has one
    if not suggested_fix and clean_url.endswith('.md'):
        url_without_ext = clean_url[:-3]
        resolved_without_ext = resolve_relative_path(md_file_path, url_without_ext)
        if resolved_without_ext.exists():
            relative_to_md = os.path.relpath(resolved_without_ext, md_file_path.parent)
            suggested_fix = relative_to_md
            error_msg += " (file exists without .md extension)"

    # Try adding .md extension if it doesn't have one
    if not suggested_fix and not clean_url.endswith('.md'):
        url_with_ext = clean_url + '.md'
        resolved_with_ext = resolve_relative_path(md_file_path, url_with_ext)
        if resolved_with_ext.exists():
            relative_to_md = os.path.relpath(resolved_with_ext, md_file_path.parent)
            suggested_fix = relative_to_md
            error_msg += " (file exists with .md extension)"

    return error_msg, suggested_fix


def lint_markdown_file(
    md_file: Path,
    root_dir: Path,
    check_external: bool = True,
    external_timeout: int = 10
) -> List[LinkIssue]:
    """
    Lint a single markdown file for broken links.

    Args:
        md_file: Path to markdown file
        root_dir: Repository root directory
        check_external: Whether to check external links
        external_timeout: Timeout for external link checks

    Returns:
        List of LinkIssue objects
    """
    issues = []

    try:
        content = md_file.read_text(encoding='utf-8')
    except Exception as e:
        logger.warning(f"Could not read {md_file}: {e}")
        return issues

    links = extract_links_from_markdown(content)

    for line_num, link_text, url in links:
        # Skip empty URLs
        if not url or url.strip() == '':
            continue

        # Skip mailto and other special schemes
        if url.startswith('mailto:') or url.startswith('tel:'):
            continue

        relative_path = os.path.relpath(md_file, root_dir)

        if is_external_link(url):
            if check_external:
                logger.debug(f"Checking external link: {url}")
                error = check_external_link(url, timeout=external_timeout)
                if error:
                    issues.append(LinkIssue(
                        file=relative_path,
                        line=line_num,
                        link=url,
                        issue_type="external_broken",
                        message=f"External link is broken: {error}"
                    ))
        else:
            # Internal link
            logger.debug(f"Checking internal link: {url}")
            error, suggested_fix = check_internal_link(md_file, url, root_dir)
            if error:
                issues.append(LinkIssue(
                    file=relative_path,
                    line=line_num,
                    link=url,
                    issue_type="internal_broken",
                    message=error,
                    suggested_fix=suggested_fix
                ))

    return issues
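

# Minimal programmatic usage sketch for a single file (the path is
# hypothetical; external checks are disabled so the example stays offline):
#
#   issues = lint_markdown_file(Path("docs/guide.md"), Path(".").resolve(),
#                               check_external=False)
#   for issue in issues:
#       print(issue.to_dict())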


def autofix_markdown_file(
    md_file: Path,
    root_dir: Path
) -> Tuple[int, List[str]]:
    """
    Automatically fix common link issues in a markdown file.

    Args:
        md_file: Path to markdown file
        root_dir: Repository root directory

    Returns:
        Tuple of (number_of_fixes, list_of_fix_descriptions)
    """
    try:
        content = md_file.read_text(encoding='utf-8')
    except Exception as e:
        logger.warning(f"Could not read {md_file}: {e}")
        return 0, []

    links = extract_links_from_markdown(content)
    fixes = []
    fix_count = 0

    for line_num, link_text, url in links:
        if is_external_link(url):
            continue

        # Check if internal link is broken
        error, suggested_fix = check_internal_link(md_file, url, root_dir)

        if error and suggested_fix:
            # Apply the fix, preserving any anchor/hash fragment
            anchor = ''
            if '#' in url:
                anchor = '#' + url.split('#', 1)[1]

            new_url = suggested_fix + anchor

            # Replace in content. This rewrites every occurrence of the old
            # target, which also repairs the same broken link appearing on
            # other lines.
            content = content.replace(f']({url})', f']({new_url})')
            fix_count += 1
            fixes.append(f"Line {line_num}: {url} -> {new_url}")

    # Write back if changes were made
    if fix_count > 0:
        try:
            md_file.write_text(content, encoding='utf-8')
            logger.info(f"Applied {fix_count} fixes to {md_file}")
        except Exception as e:
            logger.error(f"Could not write fixes to {md_file}: {e}")
            return 0, []

    return fix_count, fixes


def lint_all_markdown(
    root_dir: str,
    check_external: bool = True,
    autofix: bool = False,
    external_timeout: int = 10,
    exclude_patterns: Optional[List[str]] = None
) -> Dict[str, Any]:
    """
    Lint all markdown files in a directory.

    Args:
        root_dir: Root directory to search
        check_external: Whether to check external links (can be slow)
        autofix: Whether to automatically fix common issues
        external_timeout: Timeout for external link checks
        exclude_patterns: Patterns to exclude from search

    Returns:
        Result dictionary with issues and statistics
    """
    root_path = Path(root_dir).resolve()
    md_files = find_markdown_files(root_dir, exclude_patterns)

    all_issues = []
    all_fixes = []
    files_checked = 0
    files_with_issues = 0
    total_fixes = 0

    for md_file in md_files:
        files_checked += 1

        if autofix:
            fix_count, fixes = autofix_markdown_file(md_file, root_path)
            total_fixes += fix_count
            if fixes:
                relative_path = os.path.relpath(md_file, root_path)
                all_fixes.append({
                    "file": relative_path,
                    "fixes": fixes
                })

        # Check for issues (after autofix if enabled)
        issues = lint_markdown_file(
            md_file,
            root_path,
            check_external=check_external,
            external_timeout=external_timeout
        )

        if issues:
            files_with_issues += 1
            all_issues.extend(issues)

    result = {
        "status": "success",
        "summary": {
            "files_checked": files_checked,
            "files_with_issues": files_with_issues,
            "total_issues": len(all_issues),
            "autofix_enabled": autofix,
            "total_fixes_applied": total_fixes
        },
        "issues": [issue.to_dict() for issue in all_issues]
    }

    if autofix and all_fixes:
        result["fixes"] = all_fixes

    return result
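

# Library-style usage sketch (the directory name is hypothetical):
#
#   result = lint_all_markdown("docs", check_external=False)
#   print(result["summary"]["total_issues"])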


def main(argv: Optional[List[str]] = None) -> int:
    """Entry point for CLI execution."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Lint Markdown files to detect broken internal or external links"
    )
    parser.add_argument(
        "root_dir",
        nargs='?',
        default='.',
        help="Root directory to search for Markdown files (default: current directory)"
    )
    parser.add_argument(
        "--no-external",
        action="store_true",
        help="Skip checking external links (faster)"
    )
    parser.add_argument(
        "--autofix",
        action="store_true",
        help="Automatically fix common issues (case, .md extension)"
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=10,
        help="Timeout for external link checks in seconds (default: 10)"
    )
    parser.add_argument(
        "--exclude",
        type=str,
        help="Comma-separated list of patterns to exclude (e.g., 'node_modules,.git')"
    )
    parser.add_argument(
        "--output",
        type=str,
        choices=['json', 'text'],
        default='json',
        help="Output format (default: json)"
    )

    args = parser.parse_args(argv)

    exclude_patterns = None
    if args.exclude:
        exclude_patterns = [p.strip() for p in args.exclude.split(',')]

    try:
        result = lint_all_markdown(
            root_dir=args.root_dir,
            check_external=not args.no_external,
            autofix=args.autofix,
            external_timeout=args.timeout,
            exclude_patterns=exclude_patterns
        )

        if args.output == 'json':
            print(json.dumps(result, indent=2))
        else:
            # Text output
            summary = result['summary']
            print("Markdown Link Lint Results")
            print("=" * 50)
            print(f"Files checked: {summary['files_checked']}")
            print(f"Files with issues: {summary['files_with_issues']}")
            print(f"Total issues: {summary['total_issues']}")
            if summary['autofix_enabled']:
                print(f"Fixes applied: {summary['total_fixes_applied']}")

            if result['issues']:
                print("\nIssues found:")
                print("-" * 50)
                for issue in result['issues']:
                    print(f"\n{issue['file']}:{issue['line']}")
                    print(f"  Link: {issue['link']}")
                    print(f"  Issue: {issue['message']}")
                    if issue.get('suggested_fix'):
                        print(f"  Suggested fix: {issue['suggested_fix']}")
            else:
                print("\n✓ No issues found!")

        # Return non-zero if issues were found
        return 1 if result['issues'] else 0

    except BettyError as e:
        logger.error(f"Linting failed: {e}")
        result = {
            "status": "error",
            "error": str(e)
        }
        print(json.dumps(result, indent=2))
        return 1
    except Exception as e:
        logger.exception("Unexpected error during linting")
        result = {
            "status": "error",
            "error": str(e)
        }
        print(json.dumps(result, indent=2))
        return 1


if __name__ == "__main__":
    sys.exit(main())
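
# Example invocations (flag names match the argparse definitions above):
#
#   python docs_link_lint.py docs --no-external --output text
#   python docs_link_lint.py . --autofix --exclude "node_modules,.git"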