gh-epieczko-betty/skills/docs.validate.skilldocs/skill_docs_validate.py
#!/usr/bin/env python3
"""
Validate SKILL.md documentation against skill.yaml manifests.
This skill ensures that SKILL.md files contain all required sections and that
documented fields match the corresponding skill.yaml manifest.
"""
import json
import sys
import re
from pathlib import Path
from typing import Any, Dict, List, Optional
import yaml
from betty.errors import BettyError, SkillValidationError
from betty.logging_utils import setup_logger
from betty.validation import validate_path, ValidationError
from betty.telemetry_integration import telemetry_tracked
logger = setup_logger(__name__)
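# Example invocations (paths are illustrative, not real skills in this repo). The
# skill prints either the JSON envelope built by build_response() below or, with
# --summary, a plain-text table, and exits 0 when validation passes, 1 otherwise:
#
#   python skill_docs_validate.py skills/my.skill
#   python skill_docs_validate.py skills/ --summary
#   python skill_docs_validate.py skills/my.skill --no-headers --no-manifest-parity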
# Required headers in SKILL.md
REQUIRED_HEADERS = [
    "Overview",
    "Purpose",
    "Usage",
    "Inputs",
    "Outputs",
    "Examples",
    "Integration",
    "Errors"
]

# Alternative header variations that are acceptable
HEADER_VARIATIONS = {
    "Overview": ["Overview", "## Overview", "Description"],
    "Purpose": ["Purpose", "## Purpose"],
    "Usage": ["Usage", "## Usage", "How to Use", "## How to Use"],
    "Inputs": ["Inputs", "## Inputs", "Parameters", "## Parameters"],
    "Outputs": ["Outputs", "## Outputs", "Output", "## Output"],
    "Examples": ["Examples", "## Examples", "Example", "## Example"],
    "Integration": ["Integration", "## Integration", "Integration with Hooks",
                    "## Integration with Hooks", "Use in Workflows", "## Use in Workflows"],
    "Errors": ["Errors", "## Errors", "Error Codes", "## Error Codes",
               "Exit Codes", "## Exit Codes", "Common Errors", "## Common Errors"]
}
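# A minimal SKILL.md skeleton (illustrative only; the skill name is hypothetical)
# that would pass the header check, since each required section title appears as a
# markdown heading:
#
#   # my.skill
#
#   ## Overview
#   ## Purpose
#   ## Usage
#   ## Inputs
#   ## Outputs
#   ## Examples
#   ## Integration
#   ## Errors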

class ValidationIssue:
    """Represents a validation issue (error or warning)."""

    def __init__(self, issue_type: str, message: str, severity: str = "error",
                 file_path: Optional[str] = None, suggestion: Optional[str] = None):
        self.issue_type = issue_type
        self.message = message
        self.severity = severity
        self.file_path = file_path
        self.suggestion = suggestion

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        result = {
            "type": self.issue_type,
            "message": self.message,
            "severity": self.severity
        }
        if self.file_path:
            result["file"] = self.file_path
        if self.suggestion:
            result["suggestion"] = self.suggestion
        return result
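# Example of a serialized issue produced by to_dict() (values illustrative, but the
# message and suggestion match what this module emits for a missing header):
#   {"type": "missing_header",
#    "message": "Required header 'Usage' not found in SKILL.md",
#    "severity": "error",
#    "file": "SKILL.md",
#    "suggestion": "Add a '## Usage' section to SKILL.md"}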

class SkillDocsValidator:
    """Validates SKILL.md documentation against skill.yaml manifest."""

    def __init__(self, skill_path: str, check_headers: bool = True,
                 check_manifest_parity: bool = True):
        self.skill_path = Path(skill_path)
        self.check_headers = check_headers
        self.check_manifest_parity = check_manifest_parity
        self.errors: List[ValidationIssue] = []
        self.warnings: List[ValidationIssue] = []
        self.manifest: Optional[Dict[str, Any]] = None
        self.skill_md_content: Optional[str] = None

    def validate(self) -> Dict[str, Any]:
        """Run all validation checks."""
        try:
            # Load manifest and documentation
            self._load_files()

            # Run checks
            if self.check_headers:
                self._validate_headers()
            if self.check_manifest_parity:
                self._validate_manifest_parity()

            # Build response
            return {
                "valid": len(self.errors) == 0,
                "skill_name": self.manifest.get("name", "unknown") if self.manifest else "unknown",
                "skill_path": str(self.skill_path),
                "errors": [e.to_dict() for e in self.errors],
                "warnings": [w.to_dict() for w in self.warnings],
                "checks_run": {
                    "headers": self.check_headers,
                    "manifest_parity": self.check_manifest_parity
                }
            }
        except Exception as exc:
            logger.error("Validation failed: %s", exc)
            self.errors.append(ValidationIssue(
                "validation_error",
                str(exc),
                severity="error"
            ))
            return {
                "valid": False,
                "skill_name": "unknown",
                "skill_path": str(self.skill_path),
                "errors": [e.to_dict() for e in self.errors],
                "warnings": [w.to_dict() for w in self.warnings],
                "checks_run": {
                    "headers": self.check_headers,
                    "manifest_parity": self.check_manifest_parity
                }
            }

    def _load_files(self) -> None:
        """Load skill.yaml and SKILL.md files."""
        # Validate skill path
        try:
            validate_path(str(self.skill_path), must_exist=True)
        except ValidationError as exc:
            raise SkillValidationError(f"Invalid skill path: {exc}") from exc
        if not self.skill_path.is_dir():
            raise SkillValidationError(f"Skill path is not a directory: {self.skill_path}")

        # Load skill.yaml
        manifest_path = self.skill_path / "skill.yaml"
        if not manifest_path.exists():
            raise SkillValidationError(f"skill.yaml not found at {manifest_path}")
        try:
            with open(manifest_path, 'r', encoding='utf-8') as f:
                self.manifest = yaml.safe_load(f)
        except Exception as exc:
            raise SkillValidationError(f"Failed to parse skill.yaml: {exc}") from exc

        # Load SKILL.md
        skill_md_path = self.skill_path / "SKILL.md"
        if not skill_md_path.exists():
            self.errors.append(ValidationIssue(
                "missing_file",
                "SKILL.md file not found",
                severity="error",
                file_path=str(skill_md_path),
                suggestion="Create SKILL.md documentation file"
            ))
            self.skill_md_content = ""
            return
        try:
            with open(skill_md_path, 'r', encoding='utf-8') as f:
                self.skill_md_content = f.read()
        except Exception as exc:
            raise SkillValidationError(f"Failed to read SKILL.md: {exc}") from exc

    def _validate_headers(self) -> None:
        """Validate that SKILL.md contains all required headers."""
        if not self.skill_md_content:
            return

        # Extract all headers from the markdown
        header_pattern = re.compile(r'^#{1,6}\s+(.+)$', re.MULTILINE)
        found_headers = set()
        for match in header_pattern.finditer(self.skill_md_content):
            header_text = match.group(1).strip()
            found_headers.add(header_text)

        # Check each required header
        for required_header in REQUIRED_HEADERS:
            variations = HEADER_VARIATIONS.get(required_header, [required_header])
            # Check if any variation exists
            found = False
            for variation in variations:
                # Remove markdown prefix for comparison
                clean_variation = variation.replace("#", "").strip()
                if any(clean_variation.lower() in h.lower() for h in found_headers):
                    found = True
                    break
            if not found:
                self.errors.append(ValidationIssue(
                    "missing_header",
                    f"Required header '{required_header}' not found in SKILL.md",
                    severity="error",
                    file_path="SKILL.md",
                    suggestion=f"Add a '## {required_header}' section to SKILL.md"
                ))

    def _validate_manifest_parity(self) -> None:
        """Validate that SKILL.md matches skill.yaml manifest fields."""
        if not self.manifest or not self.skill_md_content:
            return

        # Check skill name
        skill_name = self.manifest.get("name", "")
        if skill_name and skill_name not in self.skill_md_content:
            self.warnings.append(ValidationIssue(
                "missing_skill_name",
                f"Skill name '{skill_name}' not found in SKILL.md",
                severity="warning",
                suggestion=f"Include the skill name '{skill_name}' in the documentation"
            ))

        # Check inputs
        manifest_inputs = self.manifest.get("inputs", [])
        if manifest_inputs:
            self._validate_inputs_documented(manifest_inputs)

        # Check outputs
        manifest_outputs = self.manifest.get("outputs", [])
        if manifest_outputs:
            self._validate_outputs_documented(manifest_outputs)

        # Check status
        status = self.manifest.get("status", "")
        if status and status not in ["active", "deprecated", "experimental"]:
            self.warnings.append(ValidationIssue(
                "invalid_status",
                f"Manifest status '{status}' is not a standard value",
                severity="warning",
                suggestion="Use 'active', 'deprecated', or 'experimental'"
            ))

        # Check tags
        tags = self.manifest.get("tags", [])
        if not tags:
            self.warnings.append(ValidationIssue(
                "missing_tags",
                "No tags defined in skill.yaml manifest",
                severity="warning",
                suggestion="Add relevant tags to improve skill discoverability"
            ))
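
    # Illustrative skill.yaml fragment showing the manifest fields read above; the
    # input/output names and tags are hypothetical examples, not a real manifest:
    #
    #   name: docs.validate.skill_docs
    #   status: active
    #   tags: [docs, validation]
    #   inputs:
    #     - name: skill_path
    #   outputs:
    #     - name: validation_report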

    def _validate_inputs_documented(self, inputs: List[Any]) -> None:
        """Validate that all manifest inputs are documented in SKILL.md."""
        for input_spec in inputs:
            # Handle both string format and dict format
            if isinstance(input_spec, str):
                # Simple string format: "input_name (optional)"
                input_name = input_spec.split("(")[0].strip()
            elif isinstance(input_spec, dict):
                # Dictionary format with name, type, description
                input_name = input_spec.get("name", "")
            else:
                logger.warning("Unexpected input format: %s", type(input_spec))
                continue
            if not input_name:
                continue

            # Check if input is mentioned in the documentation
            # Look for various patterns: input_name, --input-name, `input_name`, etc.
            patterns = [
                input_name,
                f"`{input_name}`",
                f"--{input_name.replace('_', '-')}",
                f"`--{input_name.replace('_', '-')}`"
            ]
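            # For example (illustrative name), an input called "skill_path" counts as
            # documented if SKILL.md mentions any of: skill_path, `skill_path`,
            # --skill-path, or `--skill-path`.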
            found = any(pattern in self.skill_md_content for pattern in patterns)
            if not found:
                self.errors.append(ValidationIssue(
                    "undocumented_input",
                    f"Input '{input_name}' from manifest not documented in SKILL.md",
                    severity="error",
                    file_path="SKILL.md",
                    suggestion=f"Document the '{input_name}' input in the Inputs section"
                ))

    def _validate_outputs_documented(self, outputs: List[Any]) -> None:
        """Validate that all manifest outputs are documented in SKILL.md."""
        for output_spec in outputs:
            # Handle both string format and dict format
            if isinstance(output_spec, str):
                # Simple string format: "output_name.json" or just "output_name"
                output_name = output_spec.split(".")[0].strip()
            elif isinstance(output_spec, dict):
                # Dictionary format with name, type, description
                output_name = output_spec.get("name", "")
            else:
                logger.warning("Unexpected output format: %s", type(output_spec))
                continue
            if not output_name:
                continue

            # Check if output is mentioned in the documentation
            patterns = [
                output_name,
                f"`{output_name}`",
                f'"{output_name}"'
            ]
            found = any(pattern in self.skill_md_content for pattern in patterns)
            if not found:
                self.warnings.append(ValidationIssue(
                    "undocumented_output",
                    f"Output '{output_name}' from manifest not documented in SKILL.md",
                    severity="warning",
                    file_path="SKILL.md",
                    suggestion=f"Document the '{output_name}' output in the Outputs section"
                ))

def build_response(ok: bool, skill_path: str, errors: Optional[List[Dict[str, Any]]] = None,
                   warnings: Optional[List[Dict[str, Any]]] = None,
                   details: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Build standard response format."""
    return {
        "ok": ok,
        "status": "valid" if ok else "invalid",
        "skill_path": skill_path,
        "errors": errors or [],
        "warnings": warnings or [],
        "details": details or {}
    }
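# Example response envelope produced by build_response() (values illustrative; the
# skill path is a hypothetical example):
#   {"ok": false, "status": "invalid", "skill_path": "skills/my.skill",
#    "errors": [...], "warnings": [...], "details": {...}}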

def print_summary_table(results: List[Dict[str, Any]]) -> None:
    """Print a summary table of validation results."""
    print("\n" + "=" * 80)
    print("SKILL DOCUMENTATION VALIDATION SUMMARY")
    print("=" * 80)
    if not results:
        print("No skills validated.")
        return

    # Calculate column widths
    max_name_len = max(len(r.get("skill_name") or "") for r in results)
    max_name_len = max(max_name_len, len("Skill Name"))

    # Print header
    print(f"{'Skill Name':<{max_name_len}} {'Status':<8} {'Errors':<8} {'Warnings':<8}")
    print("-" * 80)

    # Print each result
    total_errors = 0
    total_warnings = 0
    valid_count = 0
    for result in results:
        skill_name = result.get("skill_name") or "unknown"
        valid = result.get("valid", False)
        error_count = len(result.get("errors", []))
        warning_count = len(result.get("warnings", []))
        status = "VALID" if valid else "INVALID"
        print(f"{skill_name:<{max_name_len}} {status:<8} {error_count:<8} {warning_count:<8}")
        total_errors += error_count
        total_warnings += warning_count
        if valid:
            valid_count += 1

    # Print summary
    print("-" * 80)
    print(f"Total: {len(results)} skills | Valid: {valid_count} | "
          f"Total Errors: {total_errors} | Total Warnings: {total_warnings}")
    print("=" * 80 + "\n")

@telemetry_tracked(skill_name="docs.validate.skill_docs", caller="cli")
def main(argv: Optional[List[str]] = None) -> int:
    """Entry point for CLI execution."""
    argv = sys.argv[1:] if argv is None else argv

    # Parse arguments
    if len(argv) == 0:
        response = build_response(
            False,
            "",
            errors=[{
                "type": "usage_error",
                "message": "Usage: skill_docs_validate.py <skill_path> [--summary] [--no-headers] [--no-manifest-parity]",
                "severity": "error"
            }]
        )
        print(json.dumps(response, indent=2))
        return 1

    skill_path = argv[0]
    summary_mode = "--summary" in argv
    check_headers = "--no-headers" not in argv
    check_manifest_parity = "--no-manifest-parity" not in argv
    try:
        # Handle batch validation of multiple skills if path is parent directory
        skill_dir = Path(skill_path)

        # Check if this is a single skill or batch validation
        if (skill_dir / "skill.yaml").exists():
            # Single skill validation
            validator = SkillDocsValidator(
                skill_path,
                check_headers=check_headers,
                check_manifest_parity=check_manifest_parity
            )
            result = validator.validate()

            if summary_mode:
                print_summary_table([result])
            else:
                response = build_response(
                    result["valid"],
                    skill_path,
                    errors=result["errors"],
                    warnings=result["warnings"],
                    details=result
                )
                print(json.dumps(response, indent=2))
            return 0 if result["valid"] else 1
        else:
            # Batch validation - check if directory contains skill subdirectories
            if not skill_dir.is_dir():
                response = build_response(
                    False,
                    skill_path,
                    errors=[{
                        "type": "invalid_path",
                        "message": f"Path is not a directory: {skill_path}",
                        "severity": "error"
                    }]
                )
                print(json.dumps(response, indent=2))
                return 1

            # Find all skill directories
            results = []
            for subdir in sorted(skill_dir.iterdir()):
                if subdir.is_dir() and (subdir / "skill.yaml").exists():
                    validator = SkillDocsValidator(
                        str(subdir),
                        check_headers=check_headers,
                        check_manifest_parity=check_manifest_parity
                    )
                    result = validator.validate()
                    results.append(result)

            if not results:
                response = build_response(
                    False,
                    skill_path,
                    errors=[{
                        "type": "no_skills_found",
                        "message": f"No skills found in directory: {skill_path}",
                        "severity": "error"
                    }]
                )
                print(json.dumps(response, indent=2))
                return 1

            # Output results
            if summary_mode:
                print_summary_table(results)
            else:
                # Print full JSON for each skill
                for result in results:
                    response = build_response(
                        result["valid"],
                        result["skill_path"],
                        errors=result["errors"],
                        warnings=result["warnings"],
                        details=result
                    )
                    print(json.dumps(response, indent=2))
                    print()  # Blank line between results

            # Return error if any skill is invalid
            any_invalid = any(not r["valid"] for r in results)
            return 1 if any_invalid else 0
    except Exception as exc:
        logger.error("Validation failed: %s", exc, exc_info=True)
        response = build_response(
            False,
            skill_path,
            errors=[{
                "type": "validation_error",
                "message": str(exc),
                "severity": "error"
            }]
        )
        print(json.dumps(response, indent=2))
        return 1

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))