Initial commit

Zhongwei Li
2025-11-30 08:58:05 +08:00
commit 36a6fff8d8
20 changed files with 4237 additions and 0 deletions

@@ -0,0 +1,262 @@
#!/usr/bin/env python3
"""
Automatically archive documents based on status, age, and category-specific rules.
Documents are moved to archive/ and their metadata is updated.
"""
import re
import sys
from datetime import datetime
from pathlib import Path

import yaml

# Archiving rules by category (days since last_updated)
ARCHIVING_RULES = {
    'specs': {
        'complete_after_days': 90,
        'auto_archive': True,
        'require_complete_status': True
    },
    'analysis': {
        'complete_after_days': 60,
        'auto_archive': True,
        'require_complete_status': True
    },
    'plans': {
        'complete_after_days': 30,
        'auto_archive': True,
        'require_complete_status': True
    },
    'ai_docs': {
        'auto_archive': False,  # Manual archiving only for reference docs
    },
    'templates': {
        'auto_archive': False,  # Never auto-archive templates
    }
}
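
# Illustration (not executed): how these rules read for two categories.
#   ARCHIVING_RULES['specs']['complete_after_days']    # 90 — eligible after ~3 months
#   ARCHIVING_RULES['templates'].get('auto_archive')   # False — never auto-archived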


def extract_frontmatter(file_path: Path) -> tuple[dict, str]:
    """Extract YAML frontmatter and remaining content from a markdown file."""
    try:
        content = file_path.read_text()
        # Match YAML frontmatter between --- delimiters
        match = re.match(r'^---\s*\n(.*?)\n---\s*\n(.*)', content, re.DOTALL)
        if not match:
            return {}, content
        frontmatter_text = match.group(1)
        body = match.group(2)
        metadata = yaml.safe_load(frontmatter_text)
        return (metadata if isinstance(metadata, dict) else {}), body
    except Exception as e:
        print(f"⚠️  Warning: Could not parse {file_path}: {e}")
        return {}, ""


def update_frontmatter(file_path: Path, metadata: dict) -> None:
    """Update the YAML frontmatter in a markdown file."""
    _, body = extract_frontmatter(file_path)
    frontmatter = yaml.dump(metadata, default_flow_style=False, sort_keys=False)
    new_content = f"---\n{frontmatter}---\n{body}"
    file_path.write_text(new_content)
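
# Illustration (not executed; the path is hypothetical): update_frontmatter()
# rewrites only the metadata block and keeps the body, e.g. to bump a date.
#   meta, _ = extract_frontmatter(Path('docs/specs/auth-migration.md'))
#   meta['last_updated'] = datetime.now().strftime('%Y-%m-%d')
#   update_frontmatter(Path('docs/specs/auth-migration.md'), meta)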


def should_archive(metadata: dict, category: str, file_modified: datetime) -> tuple[bool, str]:
    """
    Determine if a document should be archived based on rules.
    Returns (should_archive, reason).
    """
    # Skip if already archived
    if metadata.get('status') == 'archived':
        return False, "already archived"
    # Get category rules
    rules = ARCHIVING_RULES.get(category, {})
    # Skip if auto-archiving is disabled for this category
    if not rules.get('auto_archive', False):
        return False, f"{category} does not auto-archive"
    # Check if status is 'complete' (required for most categories)
    if rules.get('require_complete_status', False):
        if metadata.get('status') != 'complete':
            return False, "status is not 'complete'"
    # Check age-based archiving
    complete_after_days = rules.get('complete_after_days')
    if complete_after_days:
        last_updated = metadata.get('last_updated')
        if not last_updated:
            return False, "no last_updated date in metadata"
        try:
            if isinstance(last_updated, str):
                updated_date = datetime.strptime(last_updated, '%Y-%m-%d').date()
            elif isinstance(last_updated, datetime):
                updated_date = last_updated.date()
            elif hasattr(last_updated, 'year'):
                # yaml.safe_load returns datetime.date objects for bare dates
                updated_date = last_updated
            else:
                updated_date = datetime.strptime(str(last_updated), '%Y-%m-%d').date()
            days_old = (datetime.now().date() - updated_date).days
            if days_old >= complete_after_days:
                return True, f"{days_old} days old (threshold: {complete_after_days})"
        except ValueError:
            return False, "invalid last_updated date format"
    return False, "no archiving criteria met"


def archive_document(file_path: Path, docs_path: Path, reason: str, dry_run: bool = False) -> bool:
    """
    Archive a document by moving it to archive/ and updating its metadata.
    Returns True if successful.
    """
    try:
        # Read metadata
        metadata, body = extract_frontmatter(file_path)
        # Determine archive path (preserve subdirectory structure)
        relative_path = file_path.relative_to(docs_path)
        category = relative_path.parts[0]
        # Create archive subdirectory for the category
        archive_path = docs_path / 'archive' / category
        archive_path.mkdir(parents=True, exist_ok=True)
        # Build destination path
        archive_file = archive_path / file_path.name
        # Handle name conflicts
        if archive_file.exists():
            base = archive_file.stem
            suffix = archive_file.suffix
            counter = 1
            while archive_file.exists():
                archive_file = archive_path / f"{base}_{counter}{suffix}"
                counter += 1
        if dry_run:
            print(f"  [DRY RUN] Would archive: {relative_path} → archive/{category}/{archive_file.name}")
            print(f"            Reason: {reason}")
            return True
        # Update metadata
        metadata['status'] = 'archived'
        metadata['archived_date'] = datetime.now().strftime('%Y-%m-%d')
        metadata['archive_reason'] = reason
        # Write updated file to archive
        frontmatter = yaml.dump(metadata, default_flow_style=False, sort_keys=False)
        new_content = f"---\n{frontmatter}---\n{body}"
        archive_file.write_text(new_content)
        # Remove original
        file_path.unlink()
        print(f"  ✅ Archived: {relative_path} → archive/{category}/{archive_file.name}")
        print(f"     Reason: {reason}")
        return True
    except Exception as e:
        print(f"  ❌ Error archiving {file_path}: {e}")
        return False
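
# Illustration (not executed): name conflicts in archive/ are resolved by
# suffixing a counter, so a second 'notes.md' lands as 'notes_1.md', a third
# as 'notes_2.md', and so on.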


def scan_and_archive(docs_path: Path, dry_run: bool = False) -> dict:
    """
    Scan all documents and archive those that meet criteria.
    Returns statistics about the archiving operation.
    """
    stats = {
        'scanned': 0,
        'archived': 0,
        'skipped': 0,
        'errors': 0
    }
    skip_files = {'README.md', 'INDEX.md', '.gitkeep'}
    skip_dirs = {'archive'}
    for category_dir in docs_path.iterdir():
        if not category_dir.is_dir() or category_dir.name in skip_dirs or category_dir.name.startswith('.'):
            continue
        category_name = category_dir.name
        # Find all markdown files
        for md_file in category_dir.rglob('*.md'):
            if md_file.name in skip_files:
                continue
            stats['scanned'] += 1
            # Extract metadata
            metadata, _ = extract_frontmatter(md_file)
            file_stats = md_file.stat()
            file_modified = datetime.fromtimestamp(file_stats.st_mtime)
            # Check if should archive
            should_arch, reason = should_archive(metadata, category_name, file_modified)
            if should_arch:
                success = archive_document(md_file, docs_path, reason, dry_run)
                if success:
                    stats['archived'] += 1
                else:
                    stats['errors'] += 1
            else:
                stats['skipped'] += 1
    return stats
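
# Illustration (not executed; counts are hypothetical): shape of the
# returned stats.
#   {'scanned': 12, 'archived': 3, 'skipped': 9, 'errors': 0}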


def main():
    """Main entry point."""
    dry_run = '--dry-run' in sys.argv
    # Get base path
    args = [arg for arg in sys.argv[1:] if not arg.startswith('--')]
    if args:
        base_path = Path(args[0]).resolve()
    else:
        base_path = Path.cwd()
    docs_path = base_path / 'docs'
    if not docs_path.exists():
        print(f"❌ Error: docs/ directory not found at {docs_path}")
        sys.exit(1)
    print(f"Scanning documents in: {docs_path}")
    if dry_run:
        print("🔍 DRY RUN MODE - No files will be modified")
    print()
    # Scan and archive
    stats = scan_and_archive(docs_path, dry_run)
    print()
    print("=" * 60)
    print("Archive Summary:")
    print(f"  Documents scanned: {stats['scanned']}")
    print(f"  Documents archived: {stats['archived']}")
    print(f"  Documents skipped: {stats['skipped']}")
    print(f"  Errors: {stats['errors']}")
    print()
    if not dry_run and stats['archived'] > 0:
        print("💡 Tip: Run 'python scripts/index_docs.py' to update the documentation index")


if __name__ == '__main__':
    main()

@@ -0,0 +1,177 @@
#!/usr/bin/env python3
"""
Generate and update the INDEX.md file by scanning all documents in docs/.
Reads YAML frontmatter to extract metadata and organize the index.
"""
import re
import sys
from collections import defaultdict
from datetime import datetime
from pathlib import Path

import yaml


def extract_frontmatter(file_path: Path) -> dict:
    """Extract YAML frontmatter from a markdown file."""
    try:
        content = file_path.read_text()
        # Match YAML frontmatter between --- delimiters
        match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
        if not match:
            return {}
        frontmatter_text = match.group(1)
        metadata = yaml.safe_load(frontmatter_text)
        return metadata if isinstance(metadata, dict) else {}
    except Exception as e:
        print(f"⚠️  Warning: Could not parse frontmatter in {file_path}: {e}")
        return {}


def get_file_stats(file_path: Path) -> dict:
    """Get file statistics."""
    stats = file_path.stat()
    return {
        'size': stats.st_size,
        'modified': datetime.fromtimestamp(stats.st_mtime)
    }


def scan_documents(docs_path: Path) -> dict:
    """Scan all markdown documents in docs/ and extract metadata."""
    categories = defaultdict(list)
    # Skip these files/directories
    skip_files = {'README.md', 'INDEX.md', '.gitkeep'}
    skip_dirs = {'archive'}  # We'll handle archive separately
    for category_dir in docs_path.iterdir():
        if not category_dir.is_dir() or category_dir.name in skip_dirs or category_dir.name.startswith('.'):
            continue
        category_name = category_dir.name
        # Find all markdown files
        for md_file in category_dir.rglob('*.md'):
            if md_file.name in skip_files:
                continue
            # Extract metadata
            metadata = extract_frontmatter(md_file)
            stats = get_file_stats(md_file)
            # Build document entry
            relative_path = md_file.relative_to(docs_path)
            doc_entry = {
                'path': str(relative_path),
                'title': metadata.get('title', md_file.stem),
                'status': metadata.get('status', 'unknown'),
                'created': metadata.get('created', 'unknown'),
                'last_updated': metadata.get('last_updated', stats['modified'].strftime('%Y-%m-%d')),
                'tags': metadata.get('tags', []),
                'category': category_name,
                'file_modified': stats['modified']
            }
            categories[category_name].append(doc_entry)
    return categories
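
# Illustration (not executed; values are hypothetical): a typical entry
# collected for docs/specs/auth-migration.md.
#   {'path': 'specs/auth-migration.md', 'title': 'Auth Migration',
#    'status': 'complete', 'created': '2025-06-01', 'last_updated': '2025-10-01',
#    'tags': ['auth', 'db'], 'category': 'specs', 'file_modified': datetime(...)}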


def generate_index(categories: dict) -> str:
    """Generate the INDEX.md content."""
    total_docs = sum(len(docs) for docs in categories.values())
    index_lines = [
        "# Documentation Index",
        "",
        f"Auto-generated index of all documents. Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        "Run `python scripts/index_docs.py` to regenerate this index.",
        "",
        "---",
        "",
        "## Summary",
        "",
        f"Total documents: {total_docs}",
        ""
    ]
    # Add category breakdown
    if categories:
        index_lines.append("By category:")
        for category in sorted(categories.keys()):
            count = len(categories[category])
            index_lines.append(f"- **{category}**: {count} document{'s' if count != 1 else ''}")
        index_lines.append("")
    index_lines.append("---")
    index_lines.append("")
    # Add documents by category
    if not categories:
        index_lines.append("_No documents found. Add documents to the category directories and regenerate the index._")
    else:
        for category in sorted(categories.keys()):
            docs = categories[category]
            # Cast to str: last_updated may be a YAML date object or a string
            docs.sort(key=lambda d: str(d['last_updated']), reverse=True)
            index_lines.append(f"## {category.replace('_', ' ').title()}")
            index_lines.append("")
            for doc in docs:
                # Format: [Title](path) - status | updated: date | tags
                title_link = f"[{doc['title']}]({doc['path']})"
                status_badge = f"**{doc['status']}**"
                updated = f"updated: {doc['last_updated']}"
                tags = f"tags: [{', '.join(doc['tags'])}]" if doc['tags'] else ""
                parts = [title_link, status_badge, updated]
                if tags:
                    parts.append(tags)
                index_lines.append(f"- {' | '.join(parts)}")
            index_lines.append("")
    return '\n'.join(index_lines)
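
# Illustration (not executed; values are hypothetical): an index entry as
# rendered by the loop above.
#   - [Auth Migration](specs/auth-migration.md) | **complete** | updated: 2025-10-01 | tags: [auth, db]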


def main():
    """Main entry point."""
    if len(sys.argv) > 1:
        base_path = Path(sys.argv[1]).resolve()
    else:
        base_path = Path.cwd()
    docs_path = base_path / 'docs'
    if not docs_path.exists():
        print(f"❌ Error: docs/ directory not found at {docs_path}")
        print("Run 'python scripts/init_docs_structure.py' first to initialize the structure.")
        sys.exit(1)
    print(f"Scanning documents in: {docs_path}")
    # Scan all documents
    categories = scan_documents(docs_path)
    # Generate index content
    index_content = generate_index(categories)
    # Write INDEX.md
    index_path = docs_path / 'INDEX.md'
    index_path.write_text(index_content)
    total_docs = sum(len(docs) for docs in categories.values())
    print(f"✅ Generated index with {total_docs} documents")
    print(f"✅ Updated: {index_path}")


if __name__ == '__main__':
    main()

@@ -0,0 +1,156 @@
#!/usr/bin/env python3
"""
Initialize the docs/ directory structure for document lifecycle management.
Creates all required directories and initial README.md.
"""
import sys
from datetime import datetime
from pathlib import Path

DIRECTORY_STRUCTURE = {
    'ai_docs': 'Reference materials for Claude Code: SDKs, API docs, repo context',
    'specs': 'Feature and migration specifications',
    'analysis': 'Investigation outputs: bug hunting, optimization, cleanup',
    'plans': 'Implementation plans from specs, analysis, or ad-hoc tasks',
    'templates': 'Reusable document templates',
    'archive': 'Historical and completed documents'
}
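
# Resulting layout (sketch; README.md and INDEX.md are created by the
# functions below):
#   docs/
#   ├── ai_docs/
#   ├── specs/
#   ├── analysis/
#   ├── plans/
#   ├── templates/
#   ├── archive/
#   ├── README.md
#   └── INDEX.md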

README_TEMPLATE = """# Documentation Structure

This directory contains project documentation organized by purpose and lifecycle stage.

## Directory Structure

{directory_descriptions}

## Document Lifecycle

Documents follow a lifecycle managed through YAML frontmatter:

1. **Draft** → Document is being created
2. **Active** → Document is current and relevant
3. **Complete** → Work is done, kept for reference
4. **Archived** → Moved to archive/ when no longer relevant

## Metadata Requirements

All documents should include YAML frontmatter:

```yaml
---
title: Document Title
category: specs|analysis|plans|ai_docs|templates
status: draft|active|complete|archived
created: YYYY-MM-DD
last_updated: YYYY-MM-DD
tags: [tag1, tag2]
---
```

See INDEX.md for a complete list of all documents.

## Temporary Documents

Ephemeral/scratch documents should be created in `/tmp` or system temp directories,
NOT in this docs/ directory. The docs/ directory is for persistent documentation only.

---

Last updated: {timestamp}
"""


def create_directory_structure(base_path: Path) -> None:
    """Create the docs directory structure."""
    docs_path = base_path / 'docs'
    # Create main docs directory
    docs_path.mkdir(exist_ok=True)
    print(f"✅ Created: {docs_path}")
    # Create category directories
    for directory, description in DIRECTORY_STRUCTURE.items():
        dir_path = docs_path / directory
        dir_path.mkdir(exist_ok=True)
        print(f"✅ Created: {dir_path}")
        # Create .gitkeep for empty directories
        gitkeep = dir_path / '.gitkeep'
        if not any(dir_path.iterdir()):
            gitkeep.touch()


def create_readme(base_path: Path) -> None:
    """Create the README.md file."""
    docs_path = base_path / 'docs'
    readme_path = docs_path / 'README.md'
    # Format directory descriptions
    descriptions = []
    for directory, description in DIRECTORY_STRUCTURE.items():
        descriptions.append(f"- **{directory}/** - {description}")
    readme_content = README_TEMPLATE.format(
        directory_descriptions='\n'.join(descriptions),
        timestamp=datetime.now().strftime('%Y-%m-%d')
    )
    readme_path.write_text(readme_content)
    print(f"✅ Created: {readme_path}")


def create_index(base_path: Path) -> None:
    """Create initial INDEX.md file."""
    docs_path = base_path / 'docs'
    index_path = docs_path / 'INDEX.md'
    index_content = f"""# Documentation Index

Auto-generated index of all documents. Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

Run `python scripts/index_docs.py` to regenerate this index.

---

## Summary

Total documents: 0

---

_No documents found. Add documents to the category directories and regenerate the index._
"""
    index_path.write_text(index_content)
    print(f"✅ Created: {index_path}")


def main():
    """Main entry point."""
    if len(sys.argv) > 1:
        base_path = Path(sys.argv[1]).resolve()
    else:
        base_path = Path.cwd()
    print(f"Initializing docs structure at: {base_path}")
    print()
    create_directory_structure(base_path)
    create_readme(base_path)
    create_index(base_path)
    print()
    print("🎉 Documentation structure initialized successfully!")
    print()
    print("Next steps:")
    print("1. Add documents to the category directories")
    print("2. Run 'python scripts/index_docs.py' to update the index")
    print("3. Run 'python scripts/archive_docs.py' periodically to maintain the archive")


if __name__ == '__main__':
    main()

@@ -0,0 +1,178 @@
#!/usr/bin/env python3
"""
Validate that all documents have proper YAML frontmatter metadata.
Reports documents with missing or invalid metadata.
"""
import re
import sys
from datetime import datetime
from pathlib import Path

import yaml

REQUIRED_FIELDS = ['title', 'category', 'status', 'created', 'last_updated']
VALID_STATUSES = ['draft', 'active', 'complete', 'archived']
VALID_CATEGORIES = ['ai_docs', 'specs', 'analysis', 'plans', 'templates', 'archive']


def extract_frontmatter(file_path: Path) -> dict | None:
    """Extract YAML frontmatter from a markdown file. Returns None if absent."""
    try:
        content = file_path.read_text()
        # Match YAML frontmatter between --- delimiters
        match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
        if not match:
            return None  # No frontmatter found
        frontmatter_text = match.group(1)
        metadata = yaml.safe_load(frontmatter_text)
        return metadata if isinstance(metadata, dict) else None
    except Exception as e:
        return {'_error': str(e)}


def validate_date(date_str: str) -> bool:
    """Validate date format (YYYY-MM-DD)."""
    try:
        datetime.strptime(str(date_str), '%Y-%m-%d')
        return True
    except (ValueError, TypeError):
        return False
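
# Illustration (not executed):
#   validate_date('2025-01-15')   # True
#   validate_date('01/15/2025')   # False — not YYYY-MM-DD
#   validate_date(None)           # False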


def validate_metadata(metadata: dict, category_from_path: str) -> list[str]:
    """
    Validate metadata against requirements.
    Returns list of validation errors (empty if valid).
    """
    errors = []
    if metadata is None:
        return ["No YAML frontmatter found"]
    if '_error' in metadata:
        return [f"Failed to parse frontmatter: {metadata['_error']}"]
    # Check required fields
    for field in REQUIRED_FIELDS:
        if field not in metadata:
            errors.append(f"Missing required field: {field}")
    # Validate status
    if 'status' in metadata:
        if metadata['status'] not in VALID_STATUSES:
            errors.append(f"Invalid status '{metadata['status']}'. Must be one of: {', '.join(VALID_STATUSES)}")
    # Validate category
    if 'category' in metadata:
        if metadata['category'] not in VALID_CATEGORIES:
            errors.append(f"Invalid category '{metadata['category']}'. Must be one of: {', '.join(VALID_CATEGORIES)}")
        elif metadata['category'] != category_from_path:
            errors.append(f"Category mismatch: metadata says '{metadata['category']}' but file is in '{category_from_path}/'")
    # Validate dates
    for date_field in ['created', 'last_updated']:
        if date_field in metadata:
            if not validate_date(metadata[date_field]):
                errors.append(f"Invalid {date_field} date format. Must be YYYY-MM-DD")
    # Validate tags (optional, but must be a list if present)
    if 'tags' in metadata:
        if not isinstance(metadata['tags'], list):
            errors.append("Tags must be a list")
    return errors
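
# Illustration (not executed; metadata is hypothetical): a document saved
# under docs/plans/ but labelled as a spec, with a malformed date.
#   validate_metadata({'title': 'X', 'category': 'specs', 'status': 'done',
#                      'created': '2025-06-01', 'last_updated': 'June'}, 'plans')
#   # -> ["Invalid status 'done'. Must be one of: ...",
#   #     "Category mismatch: metadata says 'specs' but file is in 'plans/'",
#   #     "Invalid last_updated date format. Must be YYYY-MM-DD"]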


def scan_and_validate(docs_path: Path) -> dict:
    """
    Scan all documents and validate their metadata.
    Returns validation results.
    """
    results = {
        'valid': [],
        'invalid': [],
        'no_frontmatter': [],
        'total': 0
    }
    skip_files = {'README.md', 'INDEX.md', '.gitkeep'}
    for category_dir in docs_path.iterdir():
        if not category_dir.is_dir() or category_dir.name.startswith('.'):
            continue
        category_name = category_dir.name
        # Find all markdown files
        for md_file in category_dir.rglob('*.md'):
            if md_file.name in skip_files:
                continue
            results['total'] += 1
            relative_path = md_file.relative_to(docs_path)
            # Extract and validate metadata
            metadata = extract_frontmatter(md_file)
            if metadata is None:
                results['no_frontmatter'].append(str(relative_path))
            errors = validate_metadata(metadata, category_name)
            if not errors:
                results['valid'].append(str(relative_path))
            else:
                results['invalid'].append({
                    'path': str(relative_path),
                    'errors': errors
                })
    return results
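
# Illustration (not executed; paths are hypothetical): shape of the returned
# results dict.
#   {'valid': ['specs/auth-migration.md'],
#    'invalid': [{'path': 'plans/rollout.md', 'errors': ['No YAML frontmatter found']}],
#    'no_frontmatter': ['plans/rollout.md'],
#    'total': 2}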


def main():
    """Main entry point."""
    if len(sys.argv) > 1:
        base_path = Path(sys.argv[1]).resolve()
    else:
        base_path = Path.cwd()
    docs_path = base_path / 'docs'
    if not docs_path.exists():
        print(f"❌ Error: docs/ directory not found at {docs_path}")
        sys.exit(1)
    print(f"Validating documents in: {docs_path}")
    print()
    # Scan and validate
    results = scan_and_validate(docs_path)
    # Display results
    print("=" * 60)
    print("Validation Results:")
    print(f"  Total documents: {results['total']}")
    print(f"  ✅ Valid: {len(results['valid'])}")
    print(f"  ❌ Invalid: {len(results['invalid'])}")
    print()
    if results['invalid']:
        print("Invalid Documents:")
        print()
        for item in results['invalid']:
            print(f"  📄 {item['path']}")
            for error in item['errors']:
                print(f"     {error}")
            print()
    if results['valid'] and not results['invalid']:
        print("🎉 All documents have valid metadata!")
    # Exit with error code if any invalid documents
    sys.exit(1 if results['invalid'] else 0)


if __name__ == '__main__':
    main()