Initial commit
262
skills/cyberarian/scripts/archive_docs.py
Executable file
@@ -0,0 +1,262 @@
#!/usr/bin/env python3
"""
Automatically archive documents based on status, age, and category-specific rules.
Documents are moved to archive/ and their metadata is updated.
"""

import os
import sys
import re
import shutil
from pathlib import Path
from datetime import datetime, timedelta
import yaml


# Archiving rules by category (days since last_updated)
ARCHIVING_RULES = {
    'specs': {
        'complete_after_days': 90,
        'auto_archive': True,
        'require_complete_status': True
    },
    'analysis': {
        'complete_after_days': 60,
        'auto_archive': True,
        'require_complete_status': True
    },
    'plans': {
        'complete_after_days': 30,
        'auto_archive': True,
        'require_complete_status': True
    },
    'ai_docs': {
        'auto_archive': False,  # Manual archiving only for reference docs
    },
    'templates': {
        'auto_archive': False,  # Never auto-archive templates
    }
}
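
# Illustrative example (not an additional rule): under the 'specs' entry above,
# a document whose frontmatter reads
#   status: complete
#   last_updated: 2024-01-01
# becomes eligible for auto-archiving once 90 or more days have passed since
# that last_updated date.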


def extract_frontmatter(file_path: Path) -> tuple[dict, str]:
    """Extract YAML frontmatter and remaining content from a markdown file."""
    try:
        content = file_path.read_text()

        # Match YAML frontmatter between --- delimiters
        match = re.match(r'^---\s*\n(.*?)\n---\s*\n(.*)', content, re.DOTALL)
        if not match:
            return {}, content

        frontmatter_text = match.group(1)
        body = match.group(2)
        metadata = yaml.safe_load(frontmatter_text)

        return (metadata if isinstance(metadata, dict) else {}), body

    except Exception as e:
        print(f"⚠️ Warning: Could not parse {file_path}: {e}")
        return {}, ""
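
# Illustrative call (the path is hypothetical):
#   metadata, body = extract_frontmatter(Path('docs/specs/example-spec.md'))
#   metadata.get('status')  # e.g. 'complete'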


def update_frontmatter(file_path: Path, metadata: dict) -> None:
    """Update the YAML frontmatter in a markdown file."""
    _, body = extract_frontmatter(file_path)

    frontmatter = yaml.dump(metadata, default_flow_style=False, sort_keys=False)
    new_content = f"---\n{frontmatter}---\n{body}"

    file_path.write_text(new_content)


def should_archive(metadata: dict, category: str, file_modified: datetime) -> tuple[bool, str]:
    """
    Determine if a document should be archived based on rules.
    Returns (should_archive, reason).
    """
    # Skip if already archived
    if metadata.get('status') == 'archived':
        return False, "already archived"

    # Get category rules
    rules = ARCHIVING_RULES.get(category, {})

    # Skip if auto-archiving is disabled for this category
    if not rules.get('auto_archive', False):
        return False, f"{category} does not auto-archive"

    # Check if status is 'complete' (required for most categories)
    if rules.get('require_complete_status', False):
        if metadata.get('status') != 'complete':
            return False, "status is not 'complete'"

    # Check age-based archiving
    complete_after_days = rules.get('complete_after_days')
    if complete_after_days:
        last_updated = metadata.get('last_updated')
        if not last_updated:
            return False, "no last_updated date in metadata"

        try:
            if isinstance(last_updated, str):
                updated_date = datetime.strptime(last_updated, '%Y-%m-%d').date()
            else:
                # YAML parser returns date objects, convert to date for comparison
                updated_date = last_updated if hasattr(last_updated, 'year') else datetime.strptime(str(last_updated), '%Y-%m-%d').date()

            days_old = (datetime.now().date() - updated_date).days

            if days_old >= complete_after_days:
                return True, f"{days_old} days old (threshold: {complete_after_days})"
        except ValueError:
            return False, "invalid last_updated date format"

    return False, "no archiving criteria met"


def archive_document(file_path: Path, docs_path: Path, reason: str, dry_run: bool = False) -> bool:
    """
    Archive a document by moving it to archive/ and updating its metadata.
    Returns True if successful.
    """
    try:
        # Read metadata
        metadata, body = extract_frontmatter(file_path)

        # Determine archive path (preserve subdirectory structure)
        relative_path = file_path.relative_to(docs_path)
        category = relative_path.parts[0]

        # Create archive subdirectory for the category
        archive_path = docs_path / 'archive' / category
        archive_path.mkdir(parents=True, exist_ok=True)

        # Build destination path
        archive_file = archive_path / file_path.name

        # Handle name conflicts
        if archive_file.exists():
            base = archive_file.stem
            suffix = archive_file.suffix
            counter = 1
            while archive_file.exists():
                archive_file = archive_path / f"{base}_{counter}{suffix}"
                counter += 1

        if dry_run:
            print(f"  [DRY RUN] Would archive: {relative_path} → archive/{category}/{archive_file.name}")
            print(f"    Reason: {reason}")
            return True

        # Update metadata
        metadata['status'] = 'archived'
        metadata['archived_date'] = datetime.now().strftime('%Y-%m-%d')
        metadata['archive_reason'] = reason

        # Write updated file to archive
        frontmatter = yaml.dump(metadata, default_flow_style=False, sort_keys=False)
        new_content = f"---\n{frontmatter}---\n{body}"
        archive_file.write_text(new_content)

        # Remove original
        file_path.unlink()

        print(f"  ✅ Archived: {relative_path} → archive/{category}/{archive_file.name}")
        print(f"     Reason: {reason}")

        return True

    except Exception as e:
        print(f"  ❌ Error archiving {file_path}: {e}")
        return False
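
# Naming note: if archive/<category>/ already holds a file with the same name,
# the conflict loop above appends a counter to the stem, e.g. plan.md -> plan_1.md.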


def scan_and_archive(docs_path: Path, dry_run: bool = False) -> dict:
    """
    Scan all documents and archive those that meet criteria.
    Returns statistics about the archiving operation.
    """
    stats = {
        'scanned': 0,
        'archived': 0,
        'skipped': 0,
        'errors': 0
    }

    skip_files = {'README.md', 'INDEX.md', '.gitkeep'}
    skip_dirs = {'archive'}

    for category_dir in docs_path.iterdir():
        if not category_dir.is_dir() or category_dir.name in skip_dirs or category_dir.name.startswith('.'):
            continue

        category_name = category_dir.name

        # Find all markdown files
        for md_file in category_dir.rglob('*.md'):
            if md_file.name in skip_files:
                continue

            stats['scanned'] += 1

            # Extract metadata
            metadata, _ = extract_frontmatter(md_file)
            file_stats = md_file.stat()
            file_modified = datetime.fromtimestamp(file_stats.st_mtime)

            # Check if should archive
            should_arch, reason = should_archive(metadata, category_name, file_modified)

            if should_arch:
                success = archive_document(md_file, docs_path, reason, dry_run)
                if success:
                    stats['archived'] += 1
                else:
                    stats['errors'] += 1
            else:
                stats['skipped'] += 1

    return stats


def main():
    """Main entry point."""
    dry_run = '--dry-run' in sys.argv

    # Get base path
    args = [arg for arg in sys.argv[1:] if not arg.startswith('--')]
    if args:
        base_path = Path(args[0]).resolve()
    else:
        base_path = Path.cwd()

    docs_path = base_path / 'docs'

    if not docs_path.exists():
        print(f"❌ Error: docs/ directory not found at {docs_path}")
        sys.exit(1)

    print(f"Scanning documents in: {docs_path}")
    if dry_run:
        print("🔍 DRY RUN MODE - No files will be modified")
    print()

    # Scan and archive
    stats = scan_and_archive(docs_path, dry_run)

    print()
    print("=" * 60)
    print("Archive Summary:")
    print(f"  Documents scanned: {stats['scanned']}")
    print(f"  Documents archived: {stats['archived']}")
    print(f"  Documents skipped: {stats['skipped']}")
    print(f"  Errors: {stats['errors']}")
    print()

    if not dry_run and stats['archived'] > 0:
        print("💡 Tip: Run 'python scripts/index_docs.py' to update the documentation index")


if __name__ == '__main__':
    main()
177
skills/cyberarian/scripts/index_docs.py
Executable file
@@ -0,0 +1,177 @@
#!/usr/bin/env python3
"""
Generate and update the INDEX.md file by scanning all documents in docs/.
Reads YAML frontmatter to extract metadata and organize the index.
"""

import os
import sys
import re
from pathlib import Path
from datetime import datetime
from collections import defaultdict
import yaml


def extract_frontmatter(file_path: Path) -> dict:
    """Extract YAML frontmatter from a markdown file."""
    try:
        content = file_path.read_text()

        # Match YAML frontmatter between --- delimiters
        match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
        if not match:
            return {}

        frontmatter_text = match.group(1)
        metadata = yaml.safe_load(frontmatter_text)

        return metadata if isinstance(metadata, dict) else {}

    except Exception as e:
        print(f"⚠️ Warning: Could not parse frontmatter in {file_path}: {e}")
        return {}


def get_file_stats(file_path: Path) -> dict:
    """Get file statistics."""
    stats = file_path.stat()
    return {
        'size': stats.st_size,
        'modified': datetime.fromtimestamp(stats.st_mtime)
    }


def scan_documents(docs_path: Path) -> dict:
    """Scan all markdown documents in docs/ and extract metadata."""
    categories = defaultdict(list)

    # Skip these files/directories
    skip_files = {'README.md', 'INDEX.md', '.gitkeep'}
    skip_dirs = {'archive'}  # We'll handle archive separately

    for category_dir in docs_path.iterdir():
        if not category_dir.is_dir() or category_dir.name.startswith('.'):
            continue

        category_name = category_dir.name

        # Find all markdown files
        for md_file in category_dir.rglob('*.md'):
            if md_file.name in skip_files:
                continue

            # Extract metadata
            metadata = extract_frontmatter(md_file)
            stats = get_file_stats(md_file)

            # Build document entry
            relative_path = md_file.relative_to(docs_path)
            doc_entry = {
                'path': str(relative_path),
                'title': metadata.get('title', md_file.stem),
                'status': metadata.get('status', 'unknown'),
                'created': metadata.get('created', 'unknown'),
                'last_updated': metadata.get('last_updated', stats['modified'].strftime('%Y-%m-%d')),
                'tags': metadata.get('tags', []),
                'category': category_name,
                'file_modified': stats['modified']
            }

            categories[category_name].append(doc_entry)

    return categories
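
# Fallbacks used above when frontmatter fields are missing: 'title' falls back to
# the filename stem, 'status' and 'created' to 'unknown', and 'last_updated' to
# the file's modification date.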


def generate_index(categories: dict) -> str:
    """Generate the INDEX.md content."""
    total_docs = sum(len(docs) for docs in categories.values())

    index_lines = [
        "# Documentation Index",
        "",
        f"Auto-generated index of all documents. Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        "Run `python scripts/index_docs.py` to regenerate this index.",
        "",
        "---",
        "",
        "## Summary",
        "",
        f"Total documents: {total_docs}",
        ""
    ]

    # Add category breakdown
    if categories:
        index_lines.append("By category:")
        for category in sorted(categories.keys()):
            count = len(categories[category])
            index_lines.append(f"- **{category}**: {count} document{'s' if count != 1 else ''}")
        index_lines.append("")

    index_lines.append("---")
    index_lines.append("")

    # Add documents by category
    if not categories:
        index_lines.append("_No documents found. Add documents to the category directories and regenerate the index._")
    else:
        for category in sorted(categories.keys()):
            docs = categories[category]
            # Cast to str so YAML date objects and fallback strings sort consistently
            docs.sort(key=lambda d: str(d['last_updated']), reverse=True)

            index_lines.append(f"## {category.replace('_', ' ').title()}")
            index_lines.append("")

            for doc in docs:
                # Format: [Title](path) - status | updated: date | tags
                title_link = f"[{doc['title']}]({doc['path']})"
                status_badge = f"**{doc['status']}**"
                updated = f"updated: {doc['last_updated']}"
                tags = f"tags: [{', '.join(doc['tags'])}]" if doc['tags'] else ""

                parts = [title_link, status_badge, updated]
                if tags:
                    parts.append(tags)

                index_lines.append(f"- {' | '.join(parts)}")

            index_lines.append("")

    return '\n'.join(index_lines)
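
# Illustrative output line (hypothetical document), as produced by the loop above:
#   - [Example Spec](specs/example-spec.md) | **complete** | updated: 2024-01-01 | tags: [auth, api]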


def main():
    """Main entry point."""
    if len(sys.argv) > 1:
        base_path = Path(sys.argv[1]).resolve()
    else:
        base_path = Path.cwd()

    docs_path = base_path / 'docs'

    if not docs_path.exists():
        print(f"❌ Error: docs/ directory not found at {docs_path}")
        print("Run 'python scripts/init_docs_structure.py' first to initialize the structure.")
        sys.exit(1)

    print(f"Scanning documents in: {docs_path}")

    # Scan all documents
    categories = scan_documents(docs_path)

    # Generate index content
    index_content = generate_index(categories)

    # Write INDEX.md
    index_path = docs_path / 'INDEX.md'
    index_path.write_text(index_content)

    total_docs = sum(len(docs) for docs in categories.values())
    print(f"✅ Generated index with {total_docs} documents")
    print(f"✅ Updated: {index_path}")


if __name__ == '__main__':
    main()
156
skills/cyberarian/scripts/init_docs_structure.py
Executable file
@@ -0,0 +1,156 @@
#!/usr/bin/env python3
"""
Initialize the docs/ directory structure for document lifecycle management.
Creates all required directories and initial README.md.
"""

import os
import sys
from pathlib import Path
from datetime import datetime


DIRECTORY_STRUCTURE = {
    'ai_docs': 'Reference materials for Claude Code: SDKs, API docs, repo context',
    'specs': 'Feature and migration specifications',
    'analysis': 'Investigation outputs: bug hunting, optimization, cleanup',
    'plans': 'Implementation plans from specs, analysis, or ad-hoc tasks',
    'templates': 'Reusable document templates',
    'archive': 'Historical and completed documents'
}
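
# Resulting layout (illustrative), relative to base_path:
#   docs/
#     ai_docs/  specs/  analysis/  plans/  templates/  archive/
#     README.md  INDEX.md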


README_TEMPLATE = """# Documentation Structure

This directory contains project documentation organized by purpose and lifecycle stage.

## Directory Structure

{directory_descriptions}

## Document Lifecycle

Documents follow a lifecycle managed through YAML frontmatter:

1. **Draft** → Document is being created
2. **Active** → Document is current and relevant
3. **Complete** → Work is done, kept for reference
4. **Archived** → Moved to archive/ when no longer relevant

## Metadata Requirements

All documents should include YAML frontmatter:

```yaml
---
title: Document Title
category: specs|analysis|plans|ai_docs|templates
status: draft|active|complete|archived
created: YYYY-MM-DD
last_updated: YYYY-MM-DD
tags: [tag1, tag2]
---
```

See INDEX.md for a complete list of all documents.

## Temporary Documents

Ephemeral/scratch documents should be created in `/tmp` or system temp directories,
NOT in this docs/ directory. The docs/ directory is for persistent documentation only.

---
Last updated: {timestamp}
"""


def create_directory_structure(base_path: Path) -> None:
    """Create the docs directory structure."""
    docs_path = base_path / 'docs'

    # Create main docs directory
    docs_path.mkdir(exist_ok=True)
    print(f"✅ Created: {docs_path}")

    # Create category directories
    for directory, description in DIRECTORY_STRUCTURE.items():
        dir_path = docs_path / directory
        dir_path.mkdir(exist_ok=True)
        print(f"✅ Created: {dir_path}")

        # Create .gitkeep for empty directories
        gitkeep = dir_path / '.gitkeep'
        if not any(dir_path.iterdir()):
            gitkeep.touch()


def create_readme(base_path: Path) -> None:
    """Create the README.md file."""
    docs_path = base_path / 'docs'
    readme_path = docs_path / 'README.md'

    # Format directory descriptions
    descriptions = []
    for directory, description in DIRECTORY_STRUCTURE.items():
        descriptions.append(f"- **{directory}/** - {description}")

    readme_content = README_TEMPLATE.format(
        directory_descriptions='\n'.join(descriptions),
        timestamp=datetime.now().strftime('%Y-%m-%d')
    )

    readme_path.write_text(readme_content)
    print(f"✅ Created: {readme_path}")


def create_index(base_path: Path) -> None:
    """Create initial INDEX.md file."""
    docs_path = base_path / 'docs'
    index_path = docs_path / 'INDEX.md'

    index_content = f"""# Documentation Index

Auto-generated index of all documents. Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

Run `python scripts/index_docs.py` to regenerate this index.

---

## Summary

Total documents: 0

---

_No documents found. Add documents to the category directories and regenerate the index._
"""

    index_path.write_text(index_content)
    print(f"✅ Created: {index_path}")


def main():
    """Main entry point."""
    if len(sys.argv) > 1:
        base_path = Path(sys.argv[1]).resolve()
    else:
        base_path = Path.cwd()

    print(f"Initializing docs structure at: {base_path}")
    print()

    create_directory_structure(base_path)
    create_readme(base_path)
    create_index(base_path)

    print()
    print("🎉 Documentation structure initialized successfully!")
    print()
    print("Next steps:")
    print("1. Add documents to the category directories")
    print("2. Run 'python scripts/index_docs.py' to update the index")
    print("3. Run 'python scripts/archive_docs.py' periodically to maintain the archive")


if __name__ == '__main__':
    main()
178
skills/cyberarian/scripts/validate_doc_metadata.py
Executable file
@@ -0,0 +1,178 @@
#!/usr/bin/env python3
"""
Validate that all documents have proper YAML frontmatter metadata.
Reports documents with missing or invalid metadata.
"""

import sys
import re
from pathlib import Path
from datetime import datetime
import yaml


REQUIRED_FIELDS = ['title', 'category', 'status', 'created', 'last_updated']
VALID_STATUSES = ['draft', 'active', 'complete', 'archived']
VALID_CATEGORIES = ['ai_docs', 'specs', 'analysis', 'plans', 'templates', 'archive']
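
# Illustrative frontmatter that passes validation (values are examples only;
# the category must match the directory the file lives in, e.g. docs/specs/):
#   ---
#   title: Example Spec
#   category: specs
#   status: draft
#   created: 2024-01-01
#   last_updated: 2024-01-02
#   tags: [auth, api]
#   ---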


def extract_frontmatter(file_path: Path) -> dict:
    """Extract YAML frontmatter from a markdown file."""
    try:
        content = file_path.read_text()

        # Match YAML frontmatter between --- delimiters
        match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
        if not match:
            return None  # No frontmatter found

        frontmatter_text = match.group(1)
        metadata = yaml.safe_load(frontmatter_text)

        return metadata if isinstance(metadata, dict) else None

    except Exception as e:
        return {'_error': str(e)}


def validate_date(date_str: str) -> bool:
    """Validate date format (YYYY-MM-DD)."""
    try:
        datetime.strptime(str(date_str), '%Y-%m-%d')
        return True
    except (ValueError, TypeError):
        return False


def validate_metadata(metadata: dict, category_from_path: str) -> list[str]:
    """
    Validate metadata against requirements.
    Returns list of validation errors (empty if valid).
    """
    errors = []

    if metadata is None:
        return ["No YAML frontmatter found"]

    if '_error' in metadata:
        return [f"Failed to parse frontmatter: {metadata['_error']}"]

    # Check required fields
    for field in REQUIRED_FIELDS:
        if field not in metadata:
            errors.append(f"Missing required field: {field}")

    # Validate status
    if 'status' in metadata:
        if metadata['status'] not in VALID_STATUSES:
            errors.append(f"Invalid status '{metadata['status']}'. Must be one of: {', '.join(VALID_STATUSES)}")

    # Validate category
    if 'category' in metadata:
        if metadata['category'] not in VALID_CATEGORIES:
            errors.append(f"Invalid category '{metadata['category']}'. Must be one of: {', '.join(VALID_CATEGORIES)}")
        elif metadata['category'] != category_from_path:
            errors.append(f"Category mismatch: metadata says '{metadata['category']}' but file is in '{category_from_path}/'")

    # Validate dates
    for date_field in ['created', 'last_updated']:
        if date_field in metadata:
            if not validate_date(metadata[date_field]):
                errors.append(f"Invalid {date_field} date format. Must be YYYY-MM-DD")

    # Validate tags (optional but must be list if present)
    if 'tags' in metadata:
        if not isinstance(metadata['tags'], list):
            errors.append("Tags must be a list")

    return errors


def scan_and_validate(docs_path: Path) -> dict:
    """
    Scan all documents and validate their metadata.
    Returns validation results.
    """
    results = {
        'valid': [],
        'invalid': [],
        'no_frontmatter': [],
        'total': 0
    }

    skip_files = {'README.md', 'INDEX.md', '.gitkeep'}

    for category_dir in docs_path.iterdir():
        if not category_dir.is_dir() or category_dir.name.startswith('.'):
            continue

        category_name = category_dir.name

        # Find all markdown files
        for md_file in category_dir.rglob('*.md'):
            if md_file.name in skip_files:
                continue

            results['total'] += 1
            relative_path = md_file.relative_to(docs_path)

            # Extract and validate metadata
            metadata = extract_frontmatter(md_file)
            errors = validate_metadata(metadata, category_name)

            if not errors:
                results['valid'].append(str(relative_path))
            else:
                results['invalid'].append({
                    'path': str(relative_path),
                    'errors': errors
                })

    return results


def main():
    """Main entry point."""
    if len(sys.argv) > 1:
        base_path = Path(sys.argv[1]).resolve()
    else:
        base_path = Path.cwd()

    docs_path = base_path / 'docs'

    if not docs_path.exists():
        print(f"❌ Error: docs/ directory not found at {docs_path}")
        sys.exit(1)

    print(f"Validating documents in: {docs_path}")
    print()

    # Scan and validate
    results = scan_and_validate(docs_path)

    # Display results
    print("=" * 60)
    print("Validation Results:")
    print(f"  Total documents: {results['total']}")
    print(f"  ✅ Valid: {len(results['valid'])}")
    print(f"  ❌ Invalid: {len(results['invalid'])}")
    print()

    if results['invalid']:
        print("Invalid Documents:")
        print()
        for item in results['invalid']:
            print(f"  📄 {item['path']}")
            for error in item['errors']:
                print(f"     • {error}")
            print()

    if results['valid'] and not results['invalid']:
        print("🎉 All documents have valid metadata!")

    # Exit with error code if any invalid documents
    sys.exit(1 if results['invalid'] else 0)


if __name__ == '__main__':
    main()