#!/usr/bin/env python3
"""
Automatically archive documents based on status, age, and category-specific rules.
Documents are moved to archive/ and their metadata is updated.
"""

import re
import sys
from datetime import date, datetime
from pathlib import Path

import yaml

# Archiving rules by category (days since last_updated)
ARCHIVING_RULES = {
    'specs': {
        'complete_after_days': 90,
        'auto_archive': True,
        'require_complete_status': True
    },
    'analysis': {
        'complete_after_days': 60,
        'auto_archive': True,
        'require_complete_status': True
    },
    'plans': {
        'complete_after_days': 30,
        'auto_archive': True,
        'require_complete_status': True
    },
    'ai_docs': {
        'auto_archive': False,  # Manual archiving only for reference docs
    },
    'templates': {
        'auto_archive': False,  # Never auto-archive templates
    }
}


def extract_frontmatter(file_path: Path) -> tuple[dict, str]:
    """Extract YAML frontmatter and remaining content from a markdown file."""
    try:
        content = file_path.read_text()
        # Match YAML frontmatter between --- delimiters
        match = re.match(r'^---\s*\n(.*?)\n---\s*\n(.*)', content, re.DOTALL)
        if not match:
            return {}, content
        frontmatter_text = match.group(1)
        body = match.group(2)
        metadata = yaml.safe_load(frontmatter_text)
        return (metadata if isinstance(metadata, dict) else {}), body
    except Exception as e:
        print(f"⚠️ Warning: Could not parse {file_path}: {e}")
        return {}, ""


def update_frontmatter(file_path: Path, metadata: dict) -> None:
    """Update the YAML frontmatter in a markdown file."""
    _, body = extract_frontmatter(file_path)
    frontmatter = yaml.dump(metadata, default_flow_style=False, sort_keys=False)
    new_content = f"---\n{frontmatter}---\n{body}"
    file_path.write_text(new_content)


def should_archive(metadata: dict, category: str) -> tuple[bool, str]:
    """
    Determine if a document should be archived based on rules.

    Returns (should_archive, reason).
    """
    # Skip if already archived
    if metadata.get('status') == 'archived':
        return False, "already archived"

    # Get category rules
    rules = ARCHIVING_RULES.get(category, {})

    # Skip if auto-archiving is disabled for this category
    if not rules.get('auto_archive', False):
        return False, f"{category} does not auto-archive"

    # Check if status is 'complete' (required for most categories)
    if rules.get('require_complete_status', False):
        if metadata.get('status') != 'complete':
            return False, "status is not 'complete'"

    # Check age-based archiving
    complete_after_days = rules.get('complete_after_days')
    if complete_after_days:
        last_updated = metadata.get('last_updated')
        if not last_updated:
            return False, "no last_updated date in metadata"
        try:
            if isinstance(last_updated, str):
                updated_date = datetime.strptime(last_updated, '%Y-%m-%d').date()
            elif isinstance(last_updated, datetime):
                # Full timestamps parse as datetime; normalize to date.
                # (Check datetime before date: datetime is a date subclass.)
                updated_date = last_updated.date()
            elif isinstance(last_updated, date):
                # yaml.safe_load returns date objects for bare YYYY-MM-DD values
                updated_date = last_updated
            else:
                updated_date = datetime.strptime(str(last_updated), '%Y-%m-%d').date()

            days_old = (datetime.now().date() - updated_date).days
            if days_old >= complete_after_days:
                return True, f"{days_old} days old (threshold: {complete_after_days})"
        except ValueError:
            return False, "invalid last_updated date format"

    return False, "no archiving criteria met"


def archive_document(file_path: Path, docs_path: Path, reason: str, dry_run: bool = False) -> bool:
    """
    Archive a document by moving it to archive/ and updating its metadata.

    Returns True if successful.
    """
    try:
        # Read metadata
        metadata, body = extract_frontmatter(file_path)

        # Determine archive path (preserve subdirectory structure)
        relative_path = file_path.relative_to(docs_path)
        category = relative_path.parts[0]

        # Create archive subdirectory for the category
        archive_path = docs_path / 'archive' / category
        archive_path.mkdir(parents=True, exist_ok=True)

        # Build destination path
        archive_file = archive_path / file_path.name

        # Handle name conflicts by appending a numeric suffix
        if archive_file.exists():
            base = archive_file.stem
            suffix = archive_file.suffix
            counter = 1
            while archive_file.exists():
                archive_file = archive_path / f"{base}_{counter}{suffix}"
                counter += 1

        if dry_run:
            print(f"  [DRY RUN] Would archive: {relative_path} → archive/{category}/{archive_file.name}")
            print(f"            Reason: {reason}")
            return True

        # Update metadata
        metadata['status'] = 'archived'
        metadata['archived_date'] = datetime.now().strftime('%Y-%m-%d')
        metadata['archive_reason'] = reason

        # Write updated file to archive
        frontmatter = yaml.dump(metadata, default_flow_style=False, sort_keys=False)
        new_content = f"---\n{frontmatter}---\n{body}"
        archive_file.write_text(new_content)

        # Remove original
        file_path.unlink()

        print(f"  ✅ Archived: {relative_path} → archive/{category}/{archive_file.name}")
        print(f"     Reason: {reason}")
        return True

    except Exception as e:
        print(f"  ❌ Error archiving {file_path}: {e}")
        return False


def scan_and_archive(docs_path: Path, dry_run: bool = False) -> dict:
    """
    Scan all documents and archive those that meet criteria.

    Returns statistics about the archiving operation.
    """
    stats = {
        'scanned': 0,
        'archived': 0,
        'skipped': 0,
        'errors': 0
    }

    skip_files = {'README.md', 'INDEX.md', '.gitkeep'}
    skip_dirs = {'archive'}

    for category_dir in docs_path.iterdir():
        if not category_dir.is_dir() or category_dir.name in skip_dirs or category_dir.name.startswith('.'):
            continue

        category_name = category_dir.name

        # Find all markdown files in this category (recursively)
        for md_file in category_dir.rglob('*.md'):
            if md_file.name in skip_files:
                continue

            stats['scanned'] += 1

            # Extract metadata and check archiving criteria
            metadata, _ = extract_frontmatter(md_file)
            should_arch, reason = should_archive(metadata, category_name)

            if should_arch:
                success = archive_document(md_file, docs_path, reason, dry_run)
                if success:
                    stats['archived'] += 1
                else:
                    stats['errors'] += 1
            else:
                stats['skipped'] += 1

    return stats


def main():
    """Main entry point."""
    dry_run = '--dry-run' in sys.argv

    # Get base path from the first non-flag argument, defaulting to the CWD
    args = [arg for arg in sys.argv[1:] if not arg.startswith('--')]
    if args:
        base_path = Path(args[0]).resolve()
    else:
        base_path = Path.cwd()

    docs_path = base_path / 'docs'

    if not docs_path.exists():
        print(f"❌ Error: docs/ directory not found at {docs_path}")
        sys.exit(1)

    print(f"Scanning documents in: {docs_path}")
    if dry_run:
        print("🔍 DRY RUN MODE - No files will be modified")
    print()

    # Scan and archive
    stats = scan_and_archive(docs_path, dry_run)

    print()
    print("=" * 60)
    print("Archive Summary:")
    print(f"  Documents scanned:  {stats['scanned']}")
    print(f"  Documents archived: {stats['archived']}")
    print(f"  Documents skipped:  {stats['skipped']}")
    print(f"  Errors:             {stats['errors']}")
    print()

    if not dry_run and stats['archived'] > 0:
        print("💡 Tip: Run 'python scripts/index_docs.py' to update the documentation index")


if __name__ == '__main__':
    main()