Initial commit
skills/docs.expand.glossary/glossary_expand.py (new executable file, +572 lines)
@@ -0,0 +1,572 @@
#!/usr/bin/env python3
"""
glossary_expand.py - Implementation of the docs.expand.glossary Skill

Extract undocumented terms from Betty manifests and docs, then enrich glossary.md
with new definitions.
"""

import os
import sys
import json
import yaml
import re
from typing import Dict, Any, List, Optional
from datetime import datetime, timezone
from pathlib import Path
from collections import defaultdict


from betty.config import BASE_DIR
from betty.logging_utils import setup_logger
from betty.errors import BettyError

logger = setup_logger(__name__)


# Common field names found in manifests
SKILL_FIELDS = {
    "name", "version", "description", "inputs", "outputs", "dependencies",
    "entrypoints", "status", "tags", "runtime", "handler", "permissions",
    "parameters", "command", "required", "type", "default"
}

AGENT_FIELDS = {
    "name", "version", "description", "capabilities", "skills_available",
    "reasoning_mode", "context_requirements", "workflow_pattern",
    "error_handling", "output", "status", "tags", "dependencies",
    "max_retries", "timeout_seconds", "on_validation_failure",
    "on_generation_failure", "on_compilation_failure"
}

COMMAND_FIELDS = {
    "name", "description", "execution", "parameters", "version", "status",
    "tags", "delegate_to", "workflow", "agent", "skill"
}

HOOK_FIELDS = {
    "name", "description", "event", "command", "enabled", "blocking",
    "timeout", "version", "status", "tags"
}

# Terms that are already well-documented or common
SKIP_TERMS = {
    "name", "version", "description", "true", "false", "string", "boolean",
    "integer", "array", "object", "list", "dict", "file", "path", "url",
    "id", "uuid", "timestamp", "date", "time", "json", "yaml", "xml"
}
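
# Note: the *_FIELDS sets above are not referenced elsewhere in this module;
# they appear to catalog the manifest schema surface for future filtering.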


def build_response(
    ok: bool,
    errors: Optional[List[str]] = None,
    details: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """Build standardized response."""
    response: Dict[str, Any] = {
        "ok": ok,
        "status": "success" if ok else "failed",
        "errors": errors or [],
        "timestamp": datetime.now(timezone.utc).isoformat()
    }
    if details is not None:
        response["details"] = details
    return response
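
# Illustrative shape of the envelope (the timestamp value is run-dependent):
#   build_response(True, details={"summary": {}})
#   -> {"ok": True, "status": "success", "errors": [],
#       "timestamp": "2025-01-01T00:00:00+00:00", "details": {"summary": {}}}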


def load_glossary(glossary_path: str) -> Dict[str, str]:
    """
    Load existing glossary and extract defined terms.

    Args:
        glossary_path: Path to glossary.md

    Returns:
        Dictionary mapping lowercased terms to their original headings
    """
    if not os.path.exists(glossary_path):
        logger.warning(f"Glossary not found: {glossary_path}")
        return {}

    terms = {}
    with open(glossary_path, 'r') as f:
        content = f.read()

    # Extract term headings (### Term Name)
    pattern = r'^###\s+(.+)$'
    matches = re.finditer(pattern, content, re.MULTILINE)

    for match in matches:
        term = match.group(1).strip()
        terms[term.lower()] = term

    logger.info(f"Loaded {len(terms)} existing glossary terms")
    return terms
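
# The parser above assumes one `### Term` heading per glossary entry, grouped
# under single-letter `## A` .. `## Z` sections. A hypothetical excerpt:
#
#   ## B
#
#   ### Blocking Hook
#   A hook that must complete before the triggering operation proceeds.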


def scan_yaml_files(pattern: str, base_dir: str) -> List[Dict[str, Any]]:
    """
    Scan YAML files matching pattern.

    Args:
        pattern: File pattern (e.g., "skill.yaml", "agent.yaml")
        base_dir: Base directory to search

    Returns:
        List of parsed YAML data
    """
    files = []
    for root, dirs, filenames in os.walk(base_dir):
        for filename in filenames:
            if filename == pattern:
                file_path = os.path.join(root, filename)
                try:
                    with open(file_path, 'r') as f:
                        data = yaml.safe_load(f)
                        if data:
                            data['_source_path'] = file_path
                            files.append(data)
                except Exception as e:
                    logger.warning(f"Failed to parse {file_path}: {e}")

    logger.info(f"Scanned {len(files)} {pattern} files")
    return files
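
# Example (paths illustrative): each returned manifest carries its origin file,
# so downstream reporting can point back at the source:
#   skills = scan_yaml_files("skill.yaml", "/srv/betty/skills")
#   skills[0]["_source_path"]  # -> "/srv/betty/skills/foo/skill.yaml"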


def scan_markdown_files(docs_dir: str) -> List[str]:
    """
    Scan markdown files for capitalized terms that might need definitions.

    Args:
        docs_dir: Directory containing markdown files

    Returns:
        List of potential terms found in docs
    """
    terms = set()

    for file_path in Path(docs_dir).glob("*.md"):
        try:
            with open(file_path, 'r') as f:
                content = f.read()

            # Find capitalized phrases (potential terms)
            # Look for patterns like "Breaking Change", "Blocking Hook", etc.
            pattern = r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b'
            matches = re.finditer(pattern, content)

            for match in matches:
                term = match.group(1)
                # Keep multi-word phrases; keep single words only when they
                # are not common SKIP_TERMS
                if len(term.split()) > 1 or term.lower() not in SKIP_TERMS:
                    terms.add(term)

        except Exception as e:
            logger.warning(f"Failed to scan {file_path}: {e}")

    logger.info(f"Found {len(terms)} potential terms in docs")
    return list(terms)
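
# Note: the capitalized-phrase regex is deliberately broad, and consecutive
# capitalized words merge into one match: a sentence opening "The Blocking Hook
# runs first" yields the single candidate "The Blocking Hook", not "Blocking Hook".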


def extract_terms_from_manifests(
    skills: List[Dict[str, Any]],
    agents: List[Dict[str, Any]]
) -> Dict[str, List[str]]:
    """
    Extract field names and values from manifests.

    Args:
        skills: List of skill manifests
        agents: List of agent manifests

    Returns:
        Dictionary of term categories to terms
    """
    terms = defaultdict(set)

    # Extract from skills
    for skill in skills:
        # Status values
        if 'status' in skill:
            terms['status'].add(skill['status'])

        # Runtime values
        for ep in skill.get('entrypoints', []):
            if 'runtime' in ep:
                terms['runtime'].add(ep['runtime'])
            if 'permissions' in ep:
                for perm in ep['permissions']:
                    terms['permissions'].add(perm)

        # Input/output types
        for input_def in skill.get('inputs', []):
            if isinstance(input_def, dict) and 'type' in input_def:
                terms['types'].add(input_def['type'])

        for output_def in skill.get('outputs', []):
            if isinstance(output_def, dict) and 'type' in output_def:
                terms['types'].add(output_def['type'])

    # Extract from agents
    for agent in agents:
        # Reasoning modes
        if 'reasoning_mode' in agent:
            terms['reasoning_mode'].add(agent['reasoning_mode'])

        # Status values
        if 'status' in agent:
            terms['status'].add(agent['status'])

        # Error handling strategies
        error_handling = agent.get('error_handling', {})
        for key in error_handling:
            if key.startswith('on_'):
                terms['error_handling'].add(key)

    # Convert sets to sorted lists
    return {k: sorted(v) for k, v in terms.items()}
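
# Example result shape (category keys are fixed by the code above; the values
# are illustrative):
#   {"status": ["active", "draft"],
#    "runtime": ["bash", "python"],
#    "permissions": ["filesystem:read", "network:http"],
#    "types": ["object", "string"],
#    "error_handling": ["on_validation_failure"]}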


def generate_definition(term: str, category: str, context: Dict[str, Any]) -> Optional[str]:
    """
    Generate a glossary definition for a term.

    Args:
        term: Term to define
        category: Category of the term (e.g., 'status', 'runtime')
        context: Additional context from manifests

    Returns:
        Generated definition or None if unable to generate
    """
    definitions = {
        # Status values
        'active': 'A status indicating that a component is production-ready and available for use in workflows and operations.',
        'draft': 'A status indicating that a component is under development and not yet production-ready. Draft components are excluded from production operations.',
        'deprecated': 'A status indicating that a component is no longer recommended for use and may be removed in future versions.',
        'archived': 'A status indicating that a component has been retired and is no longer maintained or available.',

        # Runtime values
        'python': 'A runtime environment for executing Python-based skills and operations.',
        'javascript': 'A runtime environment for executing JavaScript/Node.js-based skills and operations.',
        'bash': 'A runtime environment for executing shell scripts and command-line operations.',

        # Permissions
        'filesystem:read': 'Permission to read files and directories from the filesystem.',
        'filesystem:write': 'Permission to write, modify, or delete files and directories.',
        'network:http': 'Permission to make HTTP/HTTPS network requests.',
        'network:all': 'Permission to make any network connections.',

        # Reasoning modes (these may already be in the glossary; expand_glossary
        # skips any term it finds there)
        'iterative': 'A reasoning mode where an agent can retry operations based on feedback, useful for tasks requiring refinement.',
        'oneshot': 'A reasoning mode where an agent executes once without retries, suitable for deterministic tasks.',

        # Types
        'string': 'A text value type.',
        'boolean': 'A true/false value type.',
        'integer': 'A whole number value type.',
        'object': 'A structured data type containing key-value pairs.',
        'array': 'A list of values.',

        # Error handling
        'on_validation_failure': 'Error handling strategy that defines actions to take when validation fails.',
        'on_generation_failure': 'Error handling strategy that defines actions to take when generation fails.',
        'on_compilation_failure': 'Error handling strategy that defines actions to take when compilation fails.',

        # Other common terms
        'max_retries': 'The maximum number of retry attempts allowed for an operation before failing.',
        'timeout_seconds': 'The maximum time in seconds that an operation is allowed to run before being terminated.',
        'blocking': 'A property indicating that an operation must complete (or fail) before subsequent operations can proceed.',
        'fuzzy': 'A matching mode that allows approximate string matching rather than exact matches.',
        'handler': 'The script or function that implements the core logic of a skill or operation.',
        'strict': 'A validation mode where warnings are treated as errors.',
        'dry_run': 'A mode that previews an operation without actually executing it or making changes.',
        'overwrite': 'An option to replace existing content rather than preserving or merging it.',
    }

    # Return predefined definition if available
    if term.lower() in definitions:
        return definitions[term.lower()]

    # Generate contextual definitions based on category
    if category == 'permissions':
        parts = term.split(':')
        if len(parts) == 2:
            resource, action = parts
            return f"Permission to {action} {resource} resources."

    return None
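
# Example of the category fallback (the term is hypothetical): a permission such
# as "database:query" has no predefined entry, so the 'permissions' branch
# returns "Permission to query database resources."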


def update_glossary(
    glossary_path: str,
    new_terms: Dict[str, str],
    dry_run: bool = False
) -> str:
    """
    Update glossary.md with new term definitions.

    Args:
        glossary_path: Path to glossary.md
        new_terms: Dictionary mapping terms to definitions
        dry_run: If True, don't write to file

    Returns:
        Updated glossary content
    """
    # Read existing glossary (load_glossary tolerates a missing file, but an
    # update cannot proceed without one)
    if not os.path.exists(glossary_path):
        raise BettyError(f"Cannot update missing glossary: {glossary_path}")

    with open(glossary_path, 'r') as f:
        content = f.read()

    # Group terms by first letter
    terms_by_letter = defaultdict(list)
    for term, definition in sorted(new_terms.items()):
        first_letter = term[0].upper()
        terms_by_letter[first_letter].append((term, definition))

    # Find insertion points and add new terms
    lines = content.split('\n')
    new_lines = []
    current_section = None

    for line in lines:
        new_lines.append(line)

        # Detect section headers (## A, ## B, etc.)
        section_match = re.match(r'^##\s+([A-Z])\s*$', line)
        if section_match:
            current_section = section_match.group(1)

            # If we have new terms for this section, add them
            if current_section in terms_by_letter:
                # For now, insert directly below the section header rather
                # than in alphabetical position within the section
                for term, definition in terms_by_letter[current_section]:
                    new_lines.append('')
                    new_lines.append(f'### {term}')
                    new_lines.append(definition)

    new_content = '\n'.join(new_lines)

    if not dry_run:
        with open(glossary_path, 'w') as f:
            f.write(new_content)
        logger.info(f"Updated glossary with {len(new_terms)} new terms")

    return new_content
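
# Behavioral note (matches the code above): new entries land immediately below
# their letter's "## X" header, and terms whose first letter has no existing
# "## X" section in glossary.md are silently not written.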


def expand_glossary(
    glossary_path: Optional[str] = None,
    base_dir: Optional[str] = None,
    dry_run: bool = False,
    include_auto_generated: bool = True
) -> Dict[str, Any]:
    """
    Main function to expand glossary with undocumented terms.

    Args:
        glossary_path: Path to glossary.md (default: docs/glossary.md)
        base_dir: Base directory to scan (default: BASE_DIR)
        dry_run: Preview changes without writing
        include_auto_generated: Include auto-generated definitions

    Returns:
        Result with new terms and summary
    """
    # Set defaults
    if base_dir is None:
        base_dir = BASE_DIR

    if glossary_path is None:
        glossary_path = os.path.join(base_dir, "docs", "glossary.md")

    logger.info(f"Expanding glossary at {glossary_path}")

    # Load existing glossary
    existing_terms = load_glossary(glossary_path)

    # Scan manifests
    skills = scan_yaml_files("skill.yaml", os.path.join(base_dir, "skills"))
    agents = scan_yaml_files("agent.yaml", os.path.join(base_dir, "agents"))

    # Extract terms from manifests
    manifest_terms = extract_terms_from_manifests(skills, agents)

    # Scan docs for additional terms (currently only logged; doc terms do not
    # yet feed the definition pipeline below)
    docs_dir = os.path.join(base_dir, "docs")
    doc_terms = scan_markdown_files(docs_dir)

    # Find undocumented terms
    new_terms = {}
    skipped_terms = []

    for category, terms in manifest_terms.items():
        for term in terms:
            term_lower = term.lower()

            # Skip if already in glossary
            if term_lower in existing_terms:
                continue

            # Skip common terms
            if term_lower in SKIP_TERMS:
                skipped_terms.append(term)
                continue

            # Generate definition
            if include_auto_generated:
                definition = generate_definition(term, category, {
                    'category': category,
                    'skills': skills,
                    'agents': agents
                })

                if definition:
                    # Capitalize term name properly
                    term_name = term.title() if term.islower() else term
                    new_terms[term_name] = definition
                else:
                    skipped_terms.append(term)

    # Update glossary
    updated_content = None
    if new_terms:
        updated_content = update_glossary(glossary_path, new_terms, dry_run)

    # Build summary
    summary = {
        "glossary_path": glossary_path,
        "existing_terms_count": len(existing_terms),
        "new_terms_count": len(new_terms),
        "new_terms": list(new_terms.keys()),
        "skipped_terms_count": len(skipped_terms),
        "scanned_files": {
            "skills": len(skills),
            "agents": len(agents)
        },
        "dry_run": dry_run
    }

    if dry_run and updated_content:
        summary["preview"] = updated_content

    # Build detailed output
    details = {
        "summary": summary,
        "new_definitions": new_terms,
        "manifest_terms": manifest_terms,
        "skipped_terms": skipped_terms[:20]  # Limit to first 20
    }

    return build_response(ok=True, details=details)
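
# Minimal programmatic usage (a sketch; default paths resolve against BASE_DIR):
#   result = expand_glossary(dry_run=True)
#   if result["ok"]:
#       print(result["details"]["summary"]["new_terms"])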


def main():
    """Main CLI entry point."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Expand glossary.md with undocumented terms from manifests",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Expand glossary with new terms
  glossary_expand.py

  # Preview changes without writing
  glossary_expand.py --dry-run

  # Use custom glossary path
  glossary_expand.py --glossary-path /path/to/glossary.md

  # Skip auto-generated definitions (only show what's missing)
  glossary_expand.py --no-auto-generate
"""
    )

    parser.add_argument(
        "--glossary-path",
        help="Path to glossary.md file"
    )
    parser.add_argument(
        "--base-dir",
        help="Base directory to scan for manifests"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview changes without writing to glossary"
    )
    parser.add_argument(
        "--no-auto-generate",
        action="store_true",
        help="Don't auto-generate definitions, only report missing terms"
    )
    parser.add_argument(
        "--format",
        choices=["json", "summary"],
        default="summary",
        help="Output format"
    )

    args = parser.parse_args()

    try:
        result = expand_glossary(
            glossary_path=args.glossary_path,
            base_dir=args.base_dir,
            dry_run=args.dry_run,
            include_auto_generated=not args.no_auto_generate
        )

        if args.format == "json":
            print(json.dumps(result, indent=2))
        else:
            # Pretty summary output
            details = result["details"]
            summary = details["summary"]

            print("\n" + "=" * 80)
            print("GLOSSARY EXPANSION SUMMARY")
            print("=" * 80)
            print(f"\nGlossary: {summary['glossary_path']}")
            print(f"Existing terms: {summary['existing_terms_count']}")
            print(f"New terms added: {summary['new_terms_count']}")
            print(f"Scanned: {summary['scanned_files']['skills']} skills, "
                  f"{summary['scanned_files']['agents']} agents")

            if summary['new_terms_count'] > 0:
                print(f"\n{'-' * 80}")
                print("NEW TERMS:")
                print(f"{'-' * 80}")
                for term in summary['new_terms']:
                    definition = details['new_definitions'][term]
                    print(f"\n### {term}")
                    print(definition)
                print(f"\n{'-' * 80}")

            if summary['dry_run']:
                print("\n[DRY RUN] No changes written to glossary")
            else:
                print("\nGlossary updated successfully!")

            print("\n" + "=" * 80 + "\n")

        sys.exit(0 if result['ok'] else 1)

    except BettyError as e:
        logger.error(f"Failed to expand glossary: {e}")
        result = build_response(ok=False, errors=[str(e)])
        print(json.dumps(result, indent=2))
        sys.exit(1)

    except Exception as e:
        logger.error(f"Unexpected error: {e}", exc_info=True)
        result = build_response(ok=False, errors=[f"Unexpected error: {str(e)}"])
        print(json.dumps(result, indent=2))
        sys.exit(1)


if __name__ == "__main__":
    main()