Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:58:05 +08:00
commit 36a6fff8d8
20 changed files with 4237 additions and 0 deletions

View File

@@ -0,0 +1,178 @@
#!/usr/bin/env python3
"""
Validate that all documents have proper YAML frontmatter metadata.
Reports documents with missing or invalid metadata.
"""
import sys
import re
from pathlib import Path
from datetime import datetime
import yaml
REQUIRED_FIELDS = ['title', 'category', 'status', 'created', 'last_updated']
VALID_STATUSES = ['draft', 'active', 'complete', 'archived']
VALID_CATEGORIES = ['ai_docs', 'specs', 'analysis', 'plans', 'templates', 'archive']
def extract_frontmatter(file_path: Path) -> dict:
"""Extract YAML frontmatter from a markdown file."""
try:
content = file_path.read_text()
# Match YAML frontmatter between --- delimiters
match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
if not match:
return None # No frontmatter found
frontmatter_text = match.group(1)
metadata = yaml.safe_load(frontmatter_text)
return metadata if isinstance(metadata, dict) else None
except Exception as e:
return {'_error': str(e)}
def validate_date(date_str: str) -> bool:
"""Validate date format (YYYY-MM-DD)."""
try:
datetime.strptime(str(date_str), '%Y-%m-%d')
return True
except (ValueError, TypeError):
return False
def validate_metadata(metadata: dict, category_from_path: str) -> list[str]:
"""
Validate metadata against requirements.
Returns list of validation errors (empty if valid).
"""
errors = []
if metadata is None:
return ["No YAML frontmatter found"]
if '_error' in metadata:
return [f"Failed to parse frontmatter: {metadata['_error']}"]
# Check required fields
for field in REQUIRED_FIELDS:
if field not in metadata:
errors.append(f"Missing required field: {field}")
# Validate status
if 'status' in metadata:
if metadata['status'] not in VALID_STATUSES:
errors.append(f"Invalid status '{metadata['status']}'. Must be one of: {', '.join(VALID_STATUSES)}")
# Validate category
if 'category' in metadata:
if metadata['category'] not in VALID_CATEGORIES:
errors.append(f"Invalid category '{metadata['category']}'. Must be one of: {', '.join(VALID_CATEGORIES)}")
elif metadata['category'] != category_from_path:
errors.append(f"Category mismatch: metadata says '{metadata['category']}' but file is in '{category_from_path}/'")
# Validate dates
for date_field in ['created', 'last_updated']:
if date_field in metadata:
if not validate_date(metadata[date_field]):
errors.append(f"Invalid {date_field} date format. Must be YYYY-MM-DD")
# Validate tags (optional but must be list if present)
if 'tags' in metadata:
if not isinstance(metadata['tags'], list):
errors.append("Tags must be a list")
return errors
def scan_and_validate(docs_path: Path) -> dict:
"""
Scan all documents and validate their metadata.
Returns validation results.
"""
results = {
'valid': [],
'invalid': [],
'no_frontmatter': [],
'total': 0
}
skip_files = {'README.md', 'INDEX.md', '.gitkeep'}
for category_dir in docs_path.iterdir():
if not category_dir.is_dir() or category_dir.name.startswith('.'):
continue
category_name = category_dir.name
# Find all markdown files
for md_file in category_dir.rglob('*.md'):
if md_file.name in skip_files:
continue
results['total'] += 1
relative_path = md_file.relative_to(docs_path)
# Extract and validate metadata
metadata = extract_frontmatter(md_file)
errors = validate_metadata(metadata, category_name)
if not errors:
results['valid'].append(str(relative_path))
else:
results['invalid'].append({
'path': str(relative_path),
'errors': errors
})
return results
def main():
"""Main entry point."""
if len(sys.argv) > 1:
base_path = Path(sys.argv[1]).resolve()
else:
base_path = Path.cwd()
docs_path = base_path / 'docs'
if not docs_path.exists():
print(f"❌ Error: docs/ directory not found at {docs_path}")
sys.exit(1)
print(f"Validating documents in: {docs_path}")
print()
# Scan and validate
results = scan_and_validate(docs_path)
# Display results
print("=" * 60)
print("Validation Results:")
print(f" Total documents: {results['total']}")
print(f" ✅ Valid: {len(results['valid'])}")
print(f" ❌ Invalid: {len(results['invalid'])}")
print()
if results['invalid']:
print("Invalid Documents:")
print()
for item in results['invalid']:
print(f" 📄 {item['path']}")
for error in item['errors']:
print(f"{error}")
print()
if results['valid'] and not results['invalid']:
print("🎉 All documents have valid metadata!")
# Exit with error code if any invalid documents
sys.exit(1 if results['invalid'] else 0)
if __name__ == '__main__':
main()