#!/usr/bin/env python3
"""
glossary_expand.py - Implementation of the docs.expand.glossary Skill

Extract undocumented terms from Betty manifests and docs, then enrich
glossary.md with new definitions.
"""

import os
import sys
import json
import yaml
import re
from typing import Dict, Any, List, Optional
from datetime import datetime, timezone
from pathlib import Path
from collections import defaultdict

from betty.config import BASE_DIR
from betty.logging_utils import setup_logger
from betty.errors import BettyError

logger = setup_logger(__name__)

# Common field names found in manifests
SKILL_FIELDS = {
    "name", "version", "description", "inputs", "outputs", "dependencies",
    "entrypoints", "status", "tags", "runtime", "handler", "permissions",
    "parameters", "command", "required", "type", "default"
}

AGENT_FIELDS = {
    "name", "version", "description", "capabilities", "skills_available",
    "reasoning_mode", "context_requirements", "workflow_pattern",
    "error_handling", "output", "status", "tags", "dependencies",
    "max_retries", "timeout_seconds", "on_validation_failure",
    "on_generation_failure", "on_compilation_failure"
}

COMMAND_FIELDS = {
    "name", "description", "execution", "parameters", "version", "status",
    "tags", "delegate_to", "workflow", "agent", "skill"
}

HOOK_FIELDS = {
    "name", "description", "event", "command", "enabled", "blocking",
    "timeout", "version", "status", "tags"
}

# Terms that are already well-documented or common
SKIP_TERMS = {
    "name", "version", "description", "true", "false", "string", "boolean",
    "integer", "array", "object", "list", "dict", "file", "path", "url",
    "id", "uuid", "timestamp", "date", "time", "json", "yaml", "xml"
}


def build_response(
    ok: bool,
    errors: Optional[List[str]] = None,
    details: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """Build a standardized response envelope."""
    response: Dict[str, Any] = {
        "ok": ok,
        "status": "success" if ok else "failed",
        "errors": errors or [],
        "timestamp": datetime.now(timezone.utc).isoformat()
    }
    if details is not None:
        response["details"] = details
    return response


def load_glossary(glossary_path: str) -> Dict[str, str]:
    """
    Load the existing glossary and extract defined terms.

    Args:
        glossary_path: Path to glossary.md

    Returns:
        Dictionary mapping lowercased term names to their original headings
    """
    if not os.path.exists(glossary_path):
        logger.warning(f"Glossary not found: {glossary_path}")
        return {}

    terms = {}
    with open(glossary_path, 'r') as f:
        content = f.read()

    # Extract term headings (### Term Name)
    pattern = r'^###\s+(.+)$'
    matches = re.finditer(pattern, content, re.MULTILINE)
    for match in matches:
        term = match.group(1).strip()
        terms[term.lower()] = term

    logger.info(f"Loaded {len(terms)} existing glossary terms")
    return terms


def scan_yaml_files(pattern: str, base_dir: str) -> List[Dict[str, Any]]:
    """
    Scan for YAML files whose name matches exactly.

    Args:
        pattern: Exact filename to match (e.g., "skill.yaml", "agent.yaml")
        base_dir: Base directory to search recursively

    Returns:
        List of parsed YAML documents, each annotated with a '_source_path' key
    """
    files = []
    for root, _dirs, filenames in os.walk(base_dir):
        for filename in filenames:
            if filename == pattern:
                file_path = os.path.join(root, filename)
                try:
                    with open(file_path, 'r') as f:
                        data = yaml.safe_load(f)
                    if data:
                        data['_source_path'] = file_path
                        files.append(data)
                except Exception as e:
                    logger.warning(f"Failed to parse {file_path}: {e}")

    logger.info(f"Scanned {len(files)} {pattern} files")
    return files
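
# Illustrative usage sketch (not part of the Skill contract): how the scanner
# above is typically driven. The path below is a placeholder assumption.
#
#     skills = scan_yaml_files("skill.yaml", "/path/to/betty/skills")
#     for manifest in skills:
#         print(manifest["_source_path"], manifest.get("status", "unknown"))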


def scan_markdown_files(docs_dir: str) -> List[str]:
    """
    Scan markdown files for capitalized terms that might need definitions.

    Args:
        docs_dir: Directory containing markdown files

    Returns:
        List of potential terms found in docs
    """
    terms = set()

    for file_path in Path(docs_dir).glob("*.md"):
        try:
            with open(file_path, 'r') as f:
                content = f.read()

            # Find capitalized phrases (potential terms)
            # Look for patterns like "Breaking Change", "Blocking Hook", etc.
            pattern = r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b'
            matches = re.finditer(pattern, content)
            for match in matches:
                term = match.group(1)
                # Keep multi-word phrases, plus single words not in SKIP_TERMS
                if len(term.split()) > 1 or term.lower() not in SKIP_TERMS:
                    terms.add(term)
        except Exception as e:
            logger.warning(f"Failed to scan {file_path}: {e}")

    logger.info(f"Found {len(terms)} potential terms in docs")
    return list(terms)


def extract_terms_from_manifests(
    skills: List[Dict[str, Any]],
    agents: List[Dict[str, Any]]
) -> Dict[str, List[str]]:
    """
    Extract field names and values from manifests.

    Args:
        skills: List of skill manifests
        agents: List of agent manifests

    Returns:
        Dictionary mapping term categories to sorted lists of terms
    """
    terms = defaultdict(set)

    # Extract from skills
    for skill in skills:
        # Status values
        if 'status' in skill:
            terms['status'].add(skill['status'])

        # Runtime values and permissions
        for ep in skill.get('entrypoints', []):
            if 'runtime' in ep:
                terms['runtime'].add(ep['runtime'])
            if 'permissions' in ep:
                for perm in ep['permissions']:
                    terms['permissions'].add(perm)

        # Input/output types
        for input_def in skill.get('inputs', []):
            if isinstance(input_def, dict) and 'type' in input_def:
                terms['types'].add(input_def['type'])
        for output_def in skill.get('outputs', []):
            if isinstance(output_def, dict) and 'type' in output_def:
                terms['types'].add(output_def['type'])

    # Extract from agents
    for agent in agents:
        # Reasoning modes
        if 'reasoning_mode' in agent:
            terms['reasoning_mode'].add(agent['reasoning_mode'])

        # Status values
        if 'status' in agent:
            terms['status'].add(agent['status'])

        # Error handling strategies
        error_handling = agent.get('error_handling', {})
        for key in error_handling:
            if key.startswith('on_'):
                terms['error_handling'].add(key)

    # Convert sets to sorted lists
    return {k: sorted(v) for k, v in terms.items()}
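
# For orientation, a hypothetical return value of extract_terms_from_manifests
# (the actual categories and values depend entirely on the manifests scanned):
#
#     {
#         "status": ["active", "draft"],
#         "runtime": ["python"],
#         "permissions": ["filesystem:read", "network:http"],
#         "types": ["object", "string"],
#     }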


def generate_definition(term: str, category: str, context: Dict[str, Any]) -> Optional[str]:
    """
    Generate a glossary definition for a term.

    Args:
        term: Term to define
        category: Category of the term (e.g., 'status', 'runtime')
        context: Additional context from manifests

    Returns:
        Generated definition or None if unable to generate
    """
    definitions = {
        # Status values
        'active': 'A status indicating that a component is production-ready and available for use in workflows and operations.',
        'draft': 'A status indicating that a component is under development and not yet production-ready. Draft components are excluded from production operations.',
        'deprecated': 'A status indicating that a component is no longer recommended for use and may be removed in future versions.',
        'archived': 'A status indicating that a component has been retired and is no longer maintained or available.',

        # Runtime values
        'python': 'A runtime environment for executing Python-based skills and operations.',
        'javascript': 'A runtime environment for executing JavaScript/Node.js-based skills and operations.',
        'bash': 'A runtime environment for executing shell scripts and command-line operations.',

        # Permissions
        'filesystem:read': 'Permission to read files and directories from the filesystem.',
        'filesystem:write': 'Permission to write, modify, or delete files and directories.',
        'network:http': 'Permission to make HTTP/HTTPS network requests.',
        'network:all': 'Permission to make any network connections.',

        # Reasoning modes (already in glossary but we can check)
        'iterative': 'A reasoning mode where an agent can retry operations based on feedback, useful for tasks requiring refinement.',
        'oneshot': 'A reasoning mode where an agent executes once without retries, suitable for deterministic tasks.',

        # Types
        'string': 'A text value type.',
        'boolean': 'A true/false value type.',
        'integer': 'A whole number value type.',
        'object': 'A structured data type containing key-value pairs.',
        'array': 'A list of values.',

        # Error handling
        'on_validation_failure': 'Error handling strategy that defines actions to take when validation fails.',
        'on_generation_failure': 'Error handling strategy that defines actions to take when generation fails.',
        'on_compilation_failure': 'Error handling strategy that defines actions to take when compilation fails.',

        # Other common terms
        'max_retries': 'The maximum number of retry attempts allowed for an operation before failing.',
        'timeout_seconds': 'The maximum time in seconds that an operation is allowed to run before being terminated.',
        'blocking': 'A property indicating that an operation must complete (or fail) before subsequent operations can proceed.',
        'fuzzy': 'A matching mode that allows approximate string matching rather than exact matches.',
        'handler': 'The script or function that implements the core logic of a skill or operation.',
        'strict': 'A validation mode where warnings are treated as errors.',
        'dry_run': 'A mode that previews an operation without actually executing it or making changes.',
        'overwrite': 'An option to replace existing content rather than preserving or merging it.',
    }

    # Return predefined definition if available
    if term.lower() in definitions:
        return definitions[term.lower()]

    # Generate contextual definitions based on category
    if category == 'permissions':
        parts = term.split(':')
        if len(parts) == 2:
            resource, action = parts
            return f"Permission to {action} {resource} resources."

    return None
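
# Example of the category fallback above (illustrative; "database:query" is a
# made-up permission, not one Betty defines):
#
#     generate_definition("database:query", "permissions", {})
#     # -> "Permission to query database resources."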


def update_glossary(
    glossary_path: str,
    new_terms: Dict[str, str],
    dry_run: bool = False
) -> str:
    """
    Update glossary.md with new term definitions.

    Args:
        glossary_path: Path to glossary.md
        new_terms: Dictionary mapping terms to definitions
        dry_run: If True, don't write to file

    Returns:
        Updated glossary content
    """
    # Fail cleanly rather than with a bare traceback if the glossary is missing
    if not os.path.exists(glossary_path):
        raise BettyError(f"Cannot update missing glossary: {glossary_path}")

    # Read existing glossary
    with open(glossary_path, 'r') as f:
        content = f.read()

    # Group terms by first letter
    terms_by_letter = defaultdict(list)
    for term, definition in sorted(new_terms.items()):
        first_letter = term[0].upper()
        terms_by_letter[first_letter].append((term, definition))

    # Walk the file and add new terms under their letter sections
    lines = content.split('\n')
    new_lines = []

    for line in lines:
        new_lines.append(line)

        # Detect section headers (## A, ## B, etc.)
        section_match = re.match(r'^##\s+([A-Z])\s*$', line)
        if section_match:
            letter = section_match.group(1)

            # Append new terms directly after the section header. (A smarter
            # version would insert alphabetically within the section; terms
            # whose letter has no section are silently skipped.)
            if letter in terms_by_letter:
                for term, definition in terms_by_letter[letter]:
                    new_lines.append('')
                    new_lines.append(f'### {term}')
                    new_lines.append(definition)

    new_content = '\n'.join(new_lines)

    if not dry_run:
        with open(glossary_path, 'w') as f:
            f.write(new_content)
        logger.info(f"Updated glossary with {len(new_terms)} new terms")

    return new_content
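
# update_glossary assumes glossary.md is organized into per-letter "## X"
# sections containing "### Term" entries. A minimal sketch of the expected
# layout (the real file's content will differ):
#
#     ## A
#
#     ### Active
#     A status indicating that a component is production-ready ...
#
#     ## B
#     ...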


def expand_glossary(
    glossary_path: Optional[str] = None,
    base_dir: Optional[str] = None,
    dry_run: bool = False,
    include_auto_generated: bool = True
) -> Dict[str, Any]:
    """
    Main function to expand the glossary with undocumented terms.

    Args:
        glossary_path: Path to glossary.md (default: docs/glossary.md)
        base_dir: Base directory to scan (default: BASE_DIR)
        dry_run: Preview changes without writing
        include_auto_generated: Include auto-generated definitions

    Returns:
        Result with new terms and summary
    """
    # Set defaults
    if base_dir is None:
        base_dir = BASE_DIR
    if glossary_path is None:
        glossary_path = os.path.join(base_dir, "docs", "glossary.md")

    logger.info(f"Expanding glossary at {glossary_path}")

    # Load existing glossary
    existing_terms = load_glossary(glossary_path)

    # Scan manifests
    skills = scan_yaml_files("skill.yaml", os.path.join(base_dir, "skills"))
    agents = scan_yaml_files("agent.yaml", os.path.join(base_dir, "agents"))

    # Extract terms from manifests
    manifest_terms = extract_terms_from_manifests(skills, agents)

    # Scan docs for additional terms (informational only for now; doc-derived
    # terms are counted in the summary but not auto-defined)
    docs_dir = os.path.join(base_dir, "docs")
    doc_terms = scan_markdown_files(docs_dir)

    # Find undocumented terms
    new_terms = {}
    skipped_terms = []

    for category, terms in manifest_terms.items():
        for term in terms:
            term_lower = term.lower()

            # Skip if already in glossary
            if term_lower in existing_terms:
                continue

            # Skip common terms
            if term_lower in SKIP_TERMS:
                skipped_terms.append(term)
                continue

            # Generate definition
            if include_auto_generated:
                definition = generate_definition(term, category, {
                    'category': category,
                    'skills': skills,
                    'agents': agents
                })
                if definition:
                    # Title-case simple lowercase words; keep identifiers like
                    # 'filesystem:read' or 'on_validation_failure' verbatim
                    term_name = term.title() if term.isalpha() and term.islower() else term
                    new_terms[term_name] = definition
                else:
                    skipped_terms.append(term)

    # Update glossary
    updated_content = None
    if new_terms:
        updated_content = update_glossary(glossary_path, new_terms, dry_run)

    # Build summary
    summary = {
        "glossary_path": glossary_path,
        "existing_terms_count": len(existing_terms),
        "new_terms_count": len(new_terms),
        "new_terms": list(new_terms.keys()),
        "skipped_terms_count": len(skipped_terms),
        "doc_terms_count": len(doc_terms),
        "scanned_files": {
            "skills": len(skills),
            "agents": len(agents)
        },
        "dry_run": dry_run
    }
    if dry_run and updated_content:
        summary["preview"] = updated_content

    # Build detailed output
    details = {
        "summary": summary,
        "new_definitions": new_terms,
        "manifest_terms": manifest_terms,
        "skipped_terms": skipped_terms[:20]  # Limit to the first 20
    }

    return build_response(ok=True, details=details)


def main():
    """Main CLI entry point."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Expand glossary.md with undocumented terms from manifests",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Expand glossary with new terms
  glossary_expand.py

  # Preview changes without writing
  glossary_expand.py --dry-run

  # Use custom glossary path
  glossary_expand.py --glossary-path /path/to/glossary.md

  # Skip auto-generated definitions (only show what's missing)
  glossary_expand.py --no-auto-generate
"""
    )
    parser.add_argument(
        "--glossary-path",
        help="Path to glossary.md file"
    )
    parser.add_argument(
        "--base-dir",
        help="Base directory to scan for manifests"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview changes without writing to glossary"
    )
    parser.add_argument(
        "--no-auto-generate",
        action="store_true",
        help="Don't auto-generate definitions, only report missing terms"
    )
    parser.add_argument(
        "--format",
        choices=["json", "summary"],
        default="summary",
        help="Output format"
    )

    args = parser.parse_args()

    try:
        result = expand_glossary(
            glossary_path=args.glossary_path,
            base_dir=args.base_dir,
            dry_run=args.dry_run,
            include_auto_generated=not args.no_auto_generate
        )

        if args.format == "json":
            print(json.dumps(result, indent=2))
        else:
            # Pretty summary output
            details = result["details"]
            summary = details["summary"]

            print("\n" + "=" * 80)
            print("GLOSSARY EXPANSION SUMMARY")
            print("=" * 80)
            print(f"\nGlossary: {summary['glossary_path']}")
            print(f"Existing terms: {summary['existing_terms_count']}")
            print(f"New terms added: {summary['new_terms_count']}")
            print(f"Scanned: {summary['scanned_files']['skills']} skills, "
                  f"{summary['scanned_files']['agents']} agents")

            if summary['new_terms_count'] > 0:
                print("\n" + "-" * 80)
                print("NEW TERMS:")
                print("-" * 80)
                for term in summary['new_terms']:
                    definition = details['new_definitions'][term]
                    print(f"\n### {term}")
                    print(definition)
                print("\n" + "-" * 80)

            if summary['dry_run']:
                print("\n[DRY RUN] No changes written to glossary")
            else:
                print("\nGlossary updated successfully!")

            print("\n" + "=" * 80 + "\n")

        sys.exit(0 if result['ok'] else 1)

    except BettyError as e:
        logger.error(f"Failed to expand glossary: {e}")
        result = build_response(ok=False, errors=[str(e)])
        print(json.dumps(result, indent=2))
        sys.exit(1)
    except Exception as e:
        logger.error(f"Unexpected error: {e}", exc_info=True)
        result = build_response(ok=False, errors=[f"Unexpected error: {str(e)}"])
        print(json.dumps(result, indent=2))
        sys.exit(1)


if __name__ == "__main__":
    main()
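
# Abridged, illustrative shape of the envelope printed with --format json
# (real counts, terms, and timestamps depend on the repository scanned):
#
#     {
#         "ok": true,
#         "status": "success",
#         "errors": [],
#         "timestamp": "2025-01-01T00:00:00+00:00",
#         "details": {
#             "summary": {"new_terms_count": 2, "dry_run": true, "...": "..."},
#             "new_definitions": {"Oneshot": "A reasoning mode where ..."}
#         }
#     }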