Initial commit

Zhongwei Li
2025-11-30 08:58:05 +08:00
commit 36a6fff8d8
20 changed files with 4237 additions and 0 deletions

@@ -0,0 +1,262 @@
#!/usr/bin/env python3
"""
Automatically archive documents based on status, age, and category-specific rules.
Documents are moved to archive/ and their metadata is updated.
"""
import re
import sys
from datetime import datetime
from pathlib import Path

import yaml

# Archiving rules by category (days since last_updated)
ARCHIVING_RULES = {
    'specs': {
        'complete_after_days': 90,
        'auto_archive': True,
        'require_complete_status': True
    },
    'analysis': {
        'complete_after_days': 60,
        'auto_archive': True,
        'require_complete_status': True
    },
    'plans': {
        'complete_after_days': 30,
        'auto_archive': True,
        'require_complete_status': True
    },
    'ai_docs': {
        'auto_archive': False,  # Manual archiving only for reference docs
    },
    'templates': {
        'auto_archive': False,  # Never auto-archive templates
    }
}
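
# Illustration (not executed): how these rules read for two categories.
#   ARCHIVING_RULES['specs']['complete_after_days']    # 90 — eligible after ~3 months
#   ARCHIVING_RULES['templates'].get('auto_archive')   # False — never auto-archived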


def extract_frontmatter(file_path: Path) -> tuple[dict, str]:
    """Extract YAML frontmatter and remaining content from a markdown file."""
    try:
        content = file_path.read_text()
        # Match YAML frontmatter between --- delimiters
        match = re.match(r'^---\s*\n(.*?)\n---\s*\n(.*)', content, re.DOTALL)
        if not match:
            return {}, content
        frontmatter_text = match.group(1)
        body = match.group(2)
        metadata = yaml.safe_load(frontmatter_text)
        return (metadata if isinstance(metadata, dict) else {}), body
    except Exception as e:
        print(f"⚠️  Warning: Could not parse {file_path}: {e}")
        return {}, ""


def update_frontmatter(file_path: Path, metadata: dict) -> None:
    """Update the YAML frontmatter in a markdown file."""
    _, body = extract_frontmatter(file_path)
    frontmatter = yaml.dump(metadata, default_flow_style=False, sort_keys=False)
    new_content = f"---\n{frontmatter}---\n{body}"
    file_path.write_text(new_content)
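
# Illustration (not executed; the path is hypothetical): update_frontmatter()
# rewrites only the metadata block and keeps the body, e.g. to bump a date.
#   meta, _ = extract_frontmatter(Path('docs/specs/auth-migration.md'))
#   meta['last_updated'] = datetime.now().strftime('%Y-%m-%d')
#   update_frontmatter(Path('docs/specs/auth-migration.md'), meta)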


def should_archive(metadata: dict, category: str, file_modified: datetime) -> tuple[bool, str]:
    """
    Determine if a document should be archived based on rules.
    Returns (should_archive, reason).
    """
    # Skip if already archived
    if metadata.get('status') == 'archived':
        return False, "already archived"
    # Get category rules
    rules = ARCHIVING_RULES.get(category, {})
    # Skip if auto-archiving is disabled for this category
    if not rules.get('auto_archive', False):
        return False, f"{category} does not auto-archive"
    # Check if status is 'complete' (required for most categories)
    if rules.get('require_complete_status', False):
        if metadata.get('status') != 'complete':
            return False, "status is not 'complete'"
    # Check age-based archiving
    complete_after_days = rules.get('complete_after_days')
    if complete_after_days:
        last_updated = metadata.get('last_updated')
        if not last_updated:
            return False, "no last_updated date in metadata"
        try:
            if isinstance(last_updated, str):
                updated_date = datetime.strptime(last_updated, '%Y-%m-%d').date()
            elif isinstance(last_updated, datetime):
                updated_date = last_updated.date()
            elif hasattr(last_updated, 'year'):
                # yaml.safe_load returns datetime.date objects for bare dates
                updated_date = last_updated
            else:
                updated_date = datetime.strptime(str(last_updated), '%Y-%m-%d').date()
            days_old = (datetime.now().date() - updated_date).days
            if days_old >= complete_after_days:
                return True, f"{days_old} days old (threshold: {complete_after_days})"
        except ValueError:
            return False, "invalid last_updated date format"
    return False, "no archiving criteria met"


def archive_document(file_path: Path, docs_path: Path, reason: str, dry_run: bool = False) -> bool:
    """
    Archive a document by moving it to archive/ and updating its metadata.
    Returns True if successful.
    """
    try:
        # Read metadata
        metadata, body = extract_frontmatter(file_path)
        # Determine archive path (preserve subdirectory structure)
        relative_path = file_path.relative_to(docs_path)
        category = relative_path.parts[0]
        # Create archive subdirectory for the category
        archive_path = docs_path / 'archive' / category
        archive_path.mkdir(parents=True, exist_ok=True)
        # Build destination path
        archive_file = archive_path / file_path.name
        # Handle name conflicts
        if archive_file.exists():
            base = archive_file.stem
            suffix = archive_file.suffix
            counter = 1
            while archive_file.exists():
                archive_file = archive_path / f"{base}_{counter}{suffix}"
                counter += 1
        if dry_run:
            print(f"  [DRY RUN] Would archive: {relative_path} → archive/{category}/{archive_file.name}")
            print(f"            Reason: {reason}")
            return True
        # Update metadata
        metadata['status'] = 'archived'
        metadata['archived_date'] = datetime.now().strftime('%Y-%m-%d')
        metadata['archive_reason'] = reason
        # Write updated file to archive
        frontmatter = yaml.dump(metadata, default_flow_style=False, sort_keys=False)
        new_content = f"---\n{frontmatter}---\n{body}"
        archive_file.write_text(new_content)
        # Remove original
        file_path.unlink()
        print(f"  ✅ Archived: {relative_path} → archive/{category}/{archive_file.name}")
        print(f"     Reason: {reason}")
        return True
    except Exception as e:
        print(f"  ❌ Error archiving {file_path}: {e}")
        return False
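
# Illustration (not executed): name conflicts in archive/ are resolved by
# suffixing a counter, so a second 'notes.md' lands as 'notes_1.md', a third
# as 'notes_2.md', and so on.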


def scan_and_archive(docs_path: Path, dry_run: bool = False) -> dict:
    """
    Scan all documents and archive those that meet criteria.
    Returns statistics about the archiving operation.
    """
    stats = {
        'scanned': 0,
        'archived': 0,
        'skipped': 0,
        'errors': 0
    }
    skip_files = {'README.md', 'INDEX.md', '.gitkeep'}
    skip_dirs = {'archive'}
    for category_dir in docs_path.iterdir():
        if not category_dir.is_dir() or category_dir.name in skip_dirs or category_dir.name.startswith('.'):
            continue
        category_name = category_dir.name
        # Find all markdown files
        for md_file in category_dir.rglob('*.md'):
            if md_file.name in skip_files:
                continue
            stats['scanned'] += 1
            # Extract metadata
            metadata, _ = extract_frontmatter(md_file)
            file_stats = md_file.stat()
            file_modified = datetime.fromtimestamp(file_stats.st_mtime)
            # Check if should archive
            should_arch, reason = should_archive(metadata, category_name, file_modified)
            if should_arch:
                success = archive_document(md_file, docs_path, reason, dry_run)
                if success:
                    stats['archived'] += 1
                else:
                    stats['errors'] += 1
            else:
                stats['skipped'] += 1
    return stats
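
# Illustration (not executed; counts are hypothetical): shape of the
# returned stats.
#   {'scanned': 12, 'archived': 3, 'skipped': 9, 'errors': 0}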


def main():
    """Main entry point."""
    dry_run = '--dry-run' in sys.argv
    # Get base path
    args = [arg for arg in sys.argv[1:] if not arg.startswith('--')]
    if args:
        base_path = Path(args[0]).resolve()
    else:
        base_path = Path.cwd()
    docs_path = base_path / 'docs'
    if not docs_path.exists():
        print(f"❌ Error: docs/ directory not found at {docs_path}")
        sys.exit(1)
    print(f"Scanning documents in: {docs_path}")
    if dry_run:
        print("🔍 DRY RUN MODE - No files will be modified")
    print()
    # Scan and archive
    stats = scan_and_archive(docs_path, dry_run)
    print()
    print("=" * 60)
    print("Archive Summary:")
    print(f"  Documents scanned: {stats['scanned']}")
    print(f"  Documents archived: {stats['archived']}")
    print(f"  Documents skipped: {stats['skipped']}")
    print(f"  Errors: {stats['errors']}")
    print()
    if not dry_run and stats['archived'] > 0:
        print("💡 Tip: Run 'python scripts/index_docs.py' to update the documentation index")


if __name__ == '__main__':
    main()

@@ -0,0 +1,177 @@
#!/usr/bin/env python3
"""
Generate and update the INDEX.md file by scanning all documents in docs/.
Reads YAML frontmatter to extract metadata and organize the index.
"""
import re
import sys
from collections import defaultdict
from datetime import datetime
from pathlib import Path

import yaml


def extract_frontmatter(file_path: Path) -> dict:
    """Extract YAML frontmatter from a markdown file."""
    try:
        content = file_path.read_text()
        # Match YAML frontmatter between --- delimiters
        match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
        if not match:
            return {}
        frontmatter_text = match.group(1)
        metadata = yaml.safe_load(frontmatter_text)
        return metadata if isinstance(metadata, dict) else {}
    except Exception as e:
        print(f"⚠️  Warning: Could not parse frontmatter in {file_path}: {e}")
        return {}


def get_file_stats(file_path: Path) -> dict:
    """Get file statistics."""
    stats = file_path.stat()
    return {
        'size': stats.st_size,
        'modified': datetime.fromtimestamp(stats.st_mtime)
    }


def scan_documents(docs_path: Path) -> dict:
    """Scan all markdown documents in docs/ and extract metadata."""
    categories = defaultdict(list)
    # Skip these files/directories
    skip_files = {'README.md', 'INDEX.md', '.gitkeep'}
    skip_dirs = {'archive'}  # We'll handle archive separately
    for category_dir in docs_path.iterdir():
        if not category_dir.is_dir() or category_dir.name in skip_dirs or category_dir.name.startswith('.'):
            continue
        category_name = category_dir.name
        # Find all markdown files
        for md_file in category_dir.rglob('*.md'):
            if md_file.name in skip_files:
                continue
            # Extract metadata
            metadata = extract_frontmatter(md_file)
            stats = get_file_stats(md_file)
            # Build document entry
            relative_path = md_file.relative_to(docs_path)
            doc_entry = {
                'path': str(relative_path),
                'title': metadata.get('title', md_file.stem),
                'status': metadata.get('status', 'unknown'),
                'created': metadata.get('created', 'unknown'),
                'last_updated': metadata.get('last_updated', stats['modified'].strftime('%Y-%m-%d')),
                'tags': metadata.get('tags', []),
                'category': category_name,
                'file_modified': stats['modified']
            }
            categories[category_name].append(doc_entry)
    return categories
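
# Illustration (not executed; values are hypothetical): a typical entry
# collected for docs/specs/auth-migration.md.
#   {'path': 'specs/auth-migration.md', 'title': 'Auth Migration',
#    'status': 'complete', 'created': '2025-06-01', 'last_updated': '2025-10-01',
#    'tags': ['auth', 'db'], 'category': 'specs', 'file_modified': datetime(...)}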


def generate_index(categories: dict) -> str:
    """Generate the INDEX.md content."""
    total_docs = sum(len(docs) for docs in categories.values())
    index_lines = [
        "# Documentation Index",
        "",
        f"Auto-generated index of all documents. Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        "Run `python scripts/index_docs.py` to regenerate this index.",
        "",
        "---",
        "",
        "## Summary",
        "",
        f"Total documents: {total_docs}",
        ""
    ]
    # Add category breakdown
    if categories:
        index_lines.append("By category:")
        for category in sorted(categories.keys()):
            count = len(categories[category])
            index_lines.append(f"- **{category}**: {count} document{'s' if count != 1 else ''}")
        index_lines.append("")
    index_lines.append("---")
    index_lines.append("")
    # Add documents by category
    if not categories:
        index_lines.append("_No documents found. Add documents to the category directories and regenerate the index._")
    else:
        for category in sorted(categories.keys()):
            docs = categories[category]
            # Cast to str: last_updated may be a YAML date object or a string
            docs.sort(key=lambda d: str(d['last_updated']), reverse=True)
            index_lines.append(f"## {category.replace('_', ' ').title()}")
            index_lines.append("")
            for doc in docs:
                # Format: [Title](path) - status | updated: date | tags
                title_link = f"[{doc['title']}]({doc['path']})"
                status_badge = f"**{doc['status']}**"
                updated = f"updated: {doc['last_updated']}"
                tags = f"tags: [{', '.join(doc['tags'])}]" if doc['tags'] else ""
                parts = [title_link, status_badge, updated]
                if tags:
                    parts.append(tags)
                index_lines.append(f"- {' | '.join(parts)}")
            index_lines.append("")
    return '\n'.join(index_lines)
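
# Illustration (not executed; values are hypothetical): an index entry as
# rendered by the loop above.
#   - [Auth Migration](specs/auth-migration.md) | **complete** | updated: 2025-10-01 | tags: [auth, db]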


def main():
    """Main entry point."""
    if len(sys.argv) > 1:
        base_path = Path(sys.argv[1]).resolve()
    else:
        base_path = Path.cwd()
    docs_path = base_path / 'docs'
    if not docs_path.exists():
        print(f"❌ Error: docs/ directory not found at {docs_path}")
        print("Run 'python scripts/init_docs_structure.py' first to initialize the structure.")
        sys.exit(1)
    print(f"Scanning documents in: {docs_path}")
    # Scan all documents
    categories = scan_documents(docs_path)
    # Generate index content
    index_content = generate_index(categories)
    # Write INDEX.md
    index_path = docs_path / 'INDEX.md'
    index_path.write_text(index_content)
    total_docs = sum(len(docs) for docs in categories.values())
    print(f"✅ Generated index with {total_docs} documents")
    print(f"✅ Updated: {index_path}")


if __name__ == '__main__':
    main()

@@ -0,0 +1,156 @@
#!/usr/bin/env python3
"""
Initialize the docs/ directory structure for document lifecycle management.
Creates all required directories and initial README.md.
"""
import sys
from datetime import datetime
from pathlib import Path

DIRECTORY_STRUCTURE = {
    'ai_docs': 'Reference materials for Claude Code: SDKs, API docs, repo context',
    'specs': 'Feature and migration specifications',
    'analysis': 'Investigation outputs: bug hunting, optimization, cleanup',
    'plans': 'Implementation plans from specs, analysis, or ad-hoc tasks',
    'templates': 'Reusable document templates',
    'archive': 'Historical and completed documents'
}
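
# Resulting layout (sketch; README.md and INDEX.md are created by the
# functions below):
#   docs/
#   ├── ai_docs/
#   ├── specs/
#   ├── analysis/
#   ├── plans/
#   ├── templates/
#   ├── archive/
#   ├── README.md
#   └── INDEX.md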

README_TEMPLATE = """# Documentation Structure

This directory contains project documentation organized by purpose and lifecycle stage.

## Directory Structure

{directory_descriptions}

## Document Lifecycle

Documents follow a lifecycle managed through YAML frontmatter:

1. **Draft** → Document is being created
2. **Active** → Document is current and relevant
3. **Complete** → Work is done, kept for reference
4. **Archived** → Moved to archive/ when no longer relevant

## Metadata Requirements

All documents should include YAML frontmatter:

```yaml
---
title: Document Title
category: specs|analysis|plans|ai_docs|templates
status: draft|active|complete|archived
created: YYYY-MM-DD
last_updated: YYYY-MM-DD
tags: [tag1, tag2]
---
```

See INDEX.md for a complete list of all documents.

## Temporary Documents

Ephemeral/scratch documents should be created in `/tmp` or system temp directories,
NOT in this docs/ directory. The docs/ directory is for persistent documentation only.

---

Last updated: {timestamp}
"""


def create_directory_structure(base_path: Path) -> None:
    """Create the docs directory structure."""
    docs_path = base_path / 'docs'
    # Create main docs directory
    docs_path.mkdir(exist_ok=True)
    print(f"✅ Created: {docs_path}")
    # Create category directories
    for directory, description in DIRECTORY_STRUCTURE.items():
        dir_path = docs_path / directory
        dir_path.mkdir(exist_ok=True)
        print(f"✅ Created: {dir_path}")
        # Create .gitkeep for empty directories
        gitkeep = dir_path / '.gitkeep'
        if not any(dir_path.iterdir()):
            gitkeep.touch()


def create_readme(base_path: Path) -> None:
    """Create the README.md file."""
    docs_path = base_path / 'docs'
    readme_path = docs_path / 'README.md'
    # Format directory descriptions
    descriptions = []
    for directory, description in DIRECTORY_STRUCTURE.items():
        descriptions.append(f"- **{directory}/** - {description}")
    readme_content = README_TEMPLATE.format(
        directory_descriptions='\n'.join(descriptions),
        timestamp=datetime.now().strftime('%Y-%m-%d')
    )
    readme_path.write_text(readme_content)
    print(f"✅ Created: {readme_path}")


def create_index(base_path: Path) -> None:
    """Create initial INDEX.md file."""
    docs_path = base_path / 'docs'
    index_path = docs_path / 'INDEX.md'
    index_content = f"""# Documentation Index

Auto-generated index of all documents. Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

Run `python scripts/index_docs.py` to regenerate this index.

---

## Summary

Total documents: 0

---

_No documents found. Add documents to the category directories and regenerate the index._
"""
    index_path.write_text(index_content)
    print(f"✅ Created: {index_path}")


def main():
    """Main entry point."""
    if len(sys.argv) > 1:
        base_path = Path(sys.argv[1]).resolve()
    else:
        base_path = Path.cwd()
    print(f"Initializing docs structure at: {base_path}")
    print()
    create_directory_structure(base_path)
    create_readme(base_path)
    create_index(base_path)
    print()
    print("🎉 Documentation structure initialized successfully!")
    print()
    print("Next steps:")
    print("1. Add documents to the category directories")
    print("2. Run 'python scripts/index_docs.py' to update the index")
    print("3. Run 'python scripts/archive_docs.py' periodically to maintain the archive")


if __name__ == '__main__':
    main()

@@ -0,0 +1,178 @@
#!/usr/bin/env python3
"""
Validate that all documents have proper YAML frontmatter metadata.
Reports documents with missing or invalid metadata.
"""
import re
import sys
from datetime import datetime
from pathlib import Path

import yaml

REQUIRED_FIELDS = ['title', 'category', 'status', 'created', 'last_updated']
VALID_STATUSES = ['draft', 'active', 'complete', 'archived']
VALID_CATEGORIES = ['ai_docs', 'specs', 'analysis', 'plans', 'templates', 'archive']


def extract_frontmatter(file_path: Path) -> dict | None:
    """Extract YAML frontmatter from a markdown file. Returns None if absent."""
    try:
        content = file_path.read_text()
        # Match YAML frontmatter between --- delimiters
        match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
        if not match:
            return None  # No frontmatter found
        frontmatter_text = match.group(1)
        metadata = yaml.safe_load(frontmatter_text)
        return metadata if isinstance(metadata, dict) else None
    except Exception as e:
        return {'_error': str(e)}


def validate_date(date_str: str) -> bool:
    """Validate date format (YYYY-MM-DD)."""
    try:
        datetime.strptime(str(date_str), '%Y-%m-%d')
        return True
    except (ValueError, TypeError):
        return False
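
# Illustration (not executed):
#   validate_date('2025-01-15')   # True
#   validate_date('01/15/2025')   # False — not YYYY-MM-DD
#   validate_date(None)           # False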


def validate_metadata(metadata: dict, category_from_path: str) -> list[str]:
    """
    Validate metadata against requirements.
    Returns list of validation errors (empty if valid).
    """
    errors = []
    if metadata is None:
        return ["No YAML frontmatter found"]
    if '_error' in metadata:
        return [f"Failed to parse frontmatter: {metadata['_error']}"]
    # Check required fields
    for field in REQUIRED_FIELDS:
        if field not in metadata:
            errors.append(f"Missing required field: {field}")
    # Validate status
    if 'status' in metadata:
        if metadata['status'] not in VALID_STATUSES:
            errors.append(f"Invalid status '{metadata['status']}'. Must be one of: {', '.join(VALID_STATUSES)}")
    # Validate category
    if 'category' in metadata:
        if metadata['category'] not in VALID_CATEGORIES:
            errors.append(f"Invalid category '{metadata['category']}'. Must be one of: {', '.join(VALID_CATEGORIES)}")
        elif metadata['category'] != category_from_path:
            errors.append(f"Category mismatch: metadata says '{metadata['category']}' but file is in '{category_from_path}/'")
    # Validate dates
    for date_field in ['created', 'last_updated']:
        if date_field in metadata:
            if not validate_date(metadata[date_field]):
                errors.append(f"Invalid {date_field} date format. Must be YYYY-MM-DD")
    # Validate tags (optional, but must be a list if present)
    if 'tags' in metadata:
        if not isinstance(metadata['tags'], list):
            errors.append("Tags must be a list")
    return errors
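
# Illustration (not executed; metadata is hypothetical): a document saved
# under docs/plans/ but labelled as a spec, with a malformed date.
#   validate_metadata({'title': 'X', 'category': 'specs', 'status': 'done',
#                      'created': '2025-06-01', 'last_updated': 'June'}, 'plans')
#   # -> ["Invalid status 'done'. Must be one of: ...",
#   #     "Category mismatch: metadata says 'specs' but file is in 'plans/'",
#   #     "Invalid last_updated date format. Must be YYYY-MM-DD"]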


def scan_and_validate(docs_path: Path) -> dict:
    """
    Scan all documents and validate their metadata.
    Returns validation results.
    """
    results = {
        'valid': [],
        'invalid': [],
        'no_frontmatter': [],
        'total': 0
    }
    skip_files = {'README.md', 'INDEX.md', '.gitkeep'}
    for category_dir in docs_path.iterdir():
        if not category_dir.is_dir() or category_dir.name.startswith('.'):
            continue
        category_name = category_dir.name
        # Find all markdown files
        for md_file in category_dir.rglob('*.md'):
            if md_file.name in skip_files:
                continue
            results['total'] += 1
            relative_path = md_file.relative_to(docs_path)
            # Extract and validate metadata
            metadata = extract_frontmatter(md_file)
            if metadata is None:
                results['no_frontmatter'].append(str(relative_path))
            errors = validate_metadata(metadata, category_name)
            if not errors:
                results['valid'].append(str(relative_path))
            else:
                results['invalid'].append({
                    'path': str(relative_path),
                    'errors': errors
                })
    return results
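
# Illustration (not executed; paths are hypothetical): shape of the returned
# results dict.
#   {'valid': ['specs/auth-migration.md'],
#    'invalid': [{'path': 'plans/rollout.md', 'errors': ['No YAML frontmatter found']}],
#    'no_frontmatter': ['plans/rollout.md'],
#    'total': 2}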


def main():
    """Main entry point."""
    if len(sys.argv) > 1:
        base_path = Path(sys.argv[1]).resolve()
    else:
        base_path = Path.cwd()
    docs_path = base_path / 'docs'
    if not docs_path.exists():
        print(f"❌ Error: docs/ directory not found at {docs_path}")
        sys.exit(1)
    print(f"Validating documents in: {docs_path}")
    print()
    # Scan and validate
    results = scan_and_validate(docs_path)
    # Display results
    print("=" * 60)
    print("Validation Results:")
    print(f"  Total documents: {results['total']}")
    print(f"  ✅ Valid: {len(results['valid'])}")
    print(f"  ❌ Invalid: {len(results['invalid'])}")
    print()
    if results['invalid']:
        print("Invalid Documents:")
        print()
        for item in results['invalid']:
            print(f"  📄 {item['path']}")
            for error in item['errors']:
                print(f"     {error}")
            print()
    if results['valid'] and not results['invalid']:
        print("🎉 All documents have valid metadata!")
    # Exit with error code if any invalid documents
    sys.exit(1 if results['invalid'] else 0)


if __name__ == '__main__':
    main()