Initial commit
This commit is contained in:
178
skills/cyberarian/scripts/validate_doc_metadata.py
Executable file
178
skills/cyberarian/scripts/validate_doc_metadata.py
Executable file
@@ -0,0 +1,178 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validate that all documents have proper YAML frontmatter metadata.
|
||||
Reports documents with missing or invalid metadata.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import re
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
import yaml
|
||||
|
||||
|
||||
REQUIRED_FIELDS = ['title', 'category', 'status', 'created', 'last_updated']
|
||||
VALID_STATUSES = ['draft', 'active', 'complete', 'archived']
|
||||
VALID_CATEGORIES = ['ai_docs', 'specs', 'analysis', 'plans', 'templates', 'archive']
|
||||
|
||||
|
||||
def extract_frontmatter(file_path: Path) -> dict:
|
||||
"""Extract YAML frontmatter from a markdown file."""
|
||||
try:
|
||||
content = file_path.read_text()
|
||||
|
||||
# Match YAML frontmatter between --- delimiters
|
||||
match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
|
||||
if not match:
|
||||
return None # No frontmatter found
|
||||
|
||||
frontmatter_text = match.group(1)
|
||||
metadata = yaml.safe_load(frontmatter_text)
|
||||
|
||||
return metadata if isinstance(metadata, dict) else None
|
||||
|
||||
except Exception as e:
|
||||
return {'_error': str(e)}
|
||||
|
||||
|
||||
def validate_date(date_str: str) -> bool:
|
||||
"""Validate date format (YYYY-MM-DD)."""
|
||||
try:
|
||||
datetime.strptime(str(date_str), '%Y-%m-%d')
|
||||
return True
|
||||
except (ValueError, TypeError):
|
||||
return False
|
||||
|
||||
|
||||
def validate_metadata(metadata: dict, category_from_path: str) -> list[str]:
|
||||
"""
|
||||
Validate metadata against requirements.
|
||||
Returns list of validation errors (empty if valid).
|
||||
"""
|
||||
errors = []
|
||||
|
||||
if metadata is None:
|
||||
return ["No YAML frontmatter found"]
|
||||
|
||||
if '_error' in metadata:
|
||||
return [f"Failed to parse frontmatter: {metadata['_error']}"]
|
||||
|
||||
# Check required fields
|
||||
for field in REQUIRED_FIELDS:
|
||||
if field not in metadata:
|
||||
errors.append(f"Missing required field: {field}")
|
||||
|
||||
# Validate status
|
||||
if 'status' in metadata:
|
||||
if metadata['status'] not in VALID_STATUSES:
|
||||
errors.append(f"Invalid status '{metadata['status']}'. Must be one of: {', '.join(VALID_STATUSES)}")
|
||||
|
||||
# Validate category
|
||||
if 'category' in metadata:
|
||||
if metadata['category'] not in VALID_CATEGORIES:
|
||||
errors.append(f"Invalid category '{metadata['category']}'. Must be one of: {', '.join(VALID_CATEGORIES)}")
|
||||
elif metadata['category'] != category_from_path:
|
||||
errors.append(f"Category mismatch: metadata says '{metadata['category']}' but file is in '{category_from_path}/'")
|
||||
|
||||
# Validate dates
|
||||
for date_field in ['created', 'last_updated']:
|
||||
if date_field in metadata:
|
||||
if not validate_date(metadata[date_field]):
|
||||
errors.append(f"Invalid {date_field} date format. Must be YYYY-MM-DD")
|
||||
|
||||
# Validate tags (optional but must be list if present)
|
||||
if 'tags' in metadata:
|
||||
if not isinstance(metadata['tags'], list):
|
||||
errors.append("Tags must be a list")
|
||||
|
||||
return errors
|
||||
|
||||
|
||||
def scan_and_validate(docs_path: Path) -> dict:
|
||||
"""
|
||||
Scan all documents and validate their metadata.
|
||||
Returns validation results.
|
||||
"""
|
||||
results = {
|
||||
'valid': [],
|
||||
'invalid': [],
|
||||
'no_frontmatter': [],
|
||||
'total': 0
|
||||
}
|
||||
|
||||
skip_files = {'README.md', 'INDEX.md', '.gitkeep'}
|
||||
|
||||
for category_dir in docs_path.iterdir():
|
||||
if not category_dir.is_dir() or category_dir.name.startswith('.'):
|
||||
continue
|
||||
|
||||
category_name = category_dir.name
|
||||
|
||||
# Find all markdown files
|
||||
for md_file in category_dir.rglob('*.md'):
|
||||
if md_file.name in skip_files:
|
||||
continue
|
||||
|
||||
results['total'] += 1
|
||||
relative_path = md_file.relative_to(docs_path)
|
||||
|
||||
# Extract and validate metadata
|
||||
metadata = extract_frontmatter(md_file)
|
||||
errors = validate_metadata(metadata, category_name)
|
||||
|
||||
if not errors:
|
||||
results['valid'].append(str(relative_path))
|
||||
else:
|
||||
results['invalid'].append({
|
||||
'path': str(relative_path),
|
||||
'errors': errors
|
||||
})
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def main():
|
||||
"""Main entry point."""
|
||||
if len(sys.argv) > 1:
|
||||
base_path = Path(sys.argv[1]).resolve()
|
||||
else:
|
||||
base_path = Path.cwd()
|
||||
|
||||
docs_path = base_path / 'docs'
|
||||
|
||||
if not docs_path.exists():
|
||||
print(f"❌ Error: docs/ directory not found at {docs_path}")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Validating documents in: {docs_path}")
|
||||
print()
|
||||
|
||||
# Scan and validate
|
||||
results = scan_and_validate(docs_path)
|
||||
|
||||
# Display results
|
||||
print("=" * 60)
|
||||
print("Validation Results:")
|
||||
print(f" Total documents: {results['total']}")
|
||||
print(f" ✅ Valid: {len(results['valid'])}")
|
||||
print(f" ❌ Invalid: {len(results['invalid'])}")
|
||||
print()
|
||||
|
||||
if results['invalid']:
|
||||
print("Invalid Documents:")
|
||||
print()
|
||||
for item in results['invalid']:
|
||||
print(f" 📄 {item['path']}")
|
||||
for error in item['errors']:
|
||||
print(f" • {error}")
|
||||
print()
|
||||
|
||||
if results['valid'] and not results['invalid']:
|
||||
print("🎉 All documents have valid metadata!")
|
||||
|
||||
# Exit with error code if any invalid documents
|
||||
sys.exit(1 if results['invalid'] else 0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user