#!/usr/bin/env python3
"""
Workflow executor for Tailscale SSH Sync Agent.
Common multi-machine workflow automation.
"""
import sys
from pathlib import Path
from typing import Dict, List
import time
import logging

# Add utils to path
sys.path.insert(0, str(Path(__file__).parent))

from utils.helpers import format_duration, get_timestamp
from sshsync_wrapper import execute_on_group, execute_on_host, push_to_hosts

logger = logging.getLogger(__name__)


def deploy_workflow(code_path: str,
                    staging_group: str,
                    prod_group: str,
                    run_tests: bool = True) -> Dict:
    """
    Full deployment pipeline: staging → test → production.

    Args:
        code_path: Path to code to deploy
        staging_group: Staging server group
        prod_group: Production server group
        run_tests: Whether to run tests on staging

    Returns:
        Dict with deployment results

    Example:
        >>> result = deploy_workflow("./dist", "staging", "production")
        >>> result['success']
        True
        >>> result['duration']
        '12m 45s'
    """
    start_time = time.time()
    results = {
        'stages': {},
        'success': False,
        'start_time': get_timestamp()
    }

    try:
        # Stage 1: Deploy to staging
        logger.info("Stage 1: Deploying to staging...")
        stage1 = push_to_hosts(
            local_path=code_path,
            remote_path="/var/www/app",
            group=staging_group,
            recurse=True
        )
        results['stages']['staging_deploy'] = stage1
        if not stage1.get('success'):
            results['error'] = 'Staging deployment failed'
            return results

        # Build on staging
        logger.info("Building on staging...")
        build_result = execute_on_group(
            staging_group,
            "cd /var/www/app && npm run build",
            timeout=300
        )
        results['stages']['staging_build'] = build_result
        if not build_result.get('success'):
            results['error'] = 'Staging build failed'
            return results

        # Stage 2: Run tests (if enabled)
        if run_tests:
            logger.info("Stage 2: Running tests...")
            test_result = execute_on_group(
                staging_group,
                "cd /var/www/app && npm test",
                timeout=600
            )
            results['stages']['tests'] = test_result
            if not test_result.get('success'):
                results['error'] = 'Tests failed on staging'
                return results

        # Stage 3: Validation
        logger.info("Stage 3: Validating staging...")
        health_result = execute_on_group(
            staging_group,
            "curl -f http://localhost:3000/health || echo 'Health check failed'",
            timeout=10
        )
        results['stages']['staging_validation'] = health_result
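        # Note: the "|| echo" fallback makes this command exit 0 even when curl
        # fails, so staging validation is recorded for inspection but never
        # blocks the production deploy.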

        # Stage 4: Deploy to production
        logger.info("Stage 4: Deploying to production...")
        prod_deploy = push_to_hosts(
            local_path=code_path,
            remote_path="/var/www/app",
            group=prod_group,
            recurse=True
        )
        results['stages']['production_deploy'] = prod_deploy
        if not prod_deploy.get('success'):
            results['error'] = 'Production deployment failed'
            return results

        # Build and restart on production
        logger.info("Building and restarting production...")
        prod_build = execute_on_group(
            prod_group,
            "cd /var/www/app && npm run build && pm2 restart app",
            timeout=300
        )
        results['stages']['production_build'] = prod_build
        if not prod_build.get('success'):
            results['error'] = 'Production build failed'
            return results

        # Stage 5: Production verification
        logger.info("Stage 5: Verifying production...")
        prod_health = execute_on_group(
            prod_group,
            "curl -f http://localhost:3000/health",
            timeout=15
        )
        results['stages']['production_verification'] = prod_health

        # Success only if production answers its health check
        results['success'] = prod_health.get('success', False)
        if not results['success']:
            results['error'] = 'Production verification failed'
        results['duration'] = format_duration(time.time() - start_time)
        return results
    except Exception as e:
        logger.error(f"Deployment workflow error: {e}")
        results['error'] = str(e)
        results['duration'] = format_duration(time.time() - start_time)
        return results


def backup_workflow(hosts: List[str],
                    backup_paths: List[str],
                    destination: str) -> Dict:
    """
    Backup files from multiple hosts.

    Args:
        hosts: List of hosts to backup from
        backup_paths: Paths to backup on each host
        destination: Local destination directory

    Returns:
        Dict with backup results

    Example:
        >>> result = backup_workflow(
        ...     ["db-01", "db-02"],
        ...     ["/var/lib/mysql"],
        ...     "./backups"
        ... )
        >>> result['backed_up_hosts']
        2
    """
    from sshsync_wrapper import pull_from_host
    start_time = time.time()
    # One timestamp per run, so every path for a host lands under the same
    # backup directory instead of getting a fresh one per path
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    results = {
        'hosts': {},
        'success': True,
        'backed_up_hosts': 0
    }
    for host in hosts:
        host_results = []
        for backup_path in backup_paths:
            # Separate each path by its basename so multiple paths backed up
            # in the same run cannot clobber each other
            host_dest = f"{destination}/{host}_{timestamp}/{Path(backup_path).name}"
            result = pull_from_host(
                host=host,
                remote_path=backup_path,
                local_path=host_dest,
                recurse=True
            )
            host_results.append(result)
            if not result.get('success'):
                results['success'] = False
        results['hosts'][host] = host_results
        if all(r.get('success') for r in host_results):
            results['backed_up_hosts'] += 1
    results['duration'] = format_duration(time.time() - start_time)
    return results


def sync_workflow(source_host: str,
                  target_group: str,
                  paths: List[str]) -> Dict:
    """
    Sync files from one host to many.

    Args:
        source_host: Host to pull from
        target_group: Group to push to
        paths: Paths to sync

    Returns:
        Dict with sync results

    Example:
        >>> result = sync_workflow(
        ...     "master-db",
        ...     "replica-dbs",
        ...     ["/var/lib/mysql/config"]
        ... )
        >>> result['success']
        True
    """
    from sshsync_wrapper import pull_from_host
    import tempfile
    start_time = time.time()
    results = {'paths': {}, 'success': True}
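    # Staging strategy: each path is pulled from the source into a local temp
    # directory, then pushed out to the whole target group; the context manager
    # below removes the temp directory automatically when the sync finishes.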
    with tempfile.TemporaryDirectory() as temp_dir:
        for path in paths:
            # Pull from source
            pull_result = pull_from_host(
                host=source_host,
                remote_path=path,
                local_path=f"{temp_dir}/{Path(path).name}",
                recurse=True
            )
            if not pull_result.get('success'):
                results['paths'][path] = {
                    'success': False,
                    'error': 'Pull from source failed'
                }
                results['success'] = False
                continue

            # Push to targets
            push_result = push_to_hosts(
                local_path=f"{temp_dir}/{Path(path).name}",
                remote_path=path,
                group=target_group,
                recurse=True
            )
            results['paths'][path] = {
                'pull': pull_result,
                'push': push_result,
                'success': push_result.get('success', False)
            }
            if not push_result.get('success'):
                results['success'] = False
    results['duration'] = format_duration(time.time() - start_time)
    return results


def rolling_restart(group: str,
                    service_name: str,
                    wait_between: int = 30) -> Dict:
    """
    Zero-downtime rolling restart of a service across a group.

    Args:
        group: Group to restart
        service_name: Service name (e.g., "nginx", "app")
        wait_between: Seconds to wait between restarts

    Returns:
        Dict with restart results

    Example:
        >>> result = rolling_restart("web-servers", "nginx")
        >>> result['restarted_count']
        3
    """
    from utils.helpers import parse_sshsync_config
    start_time = time.time()
    groups_config = parse_sshsync_config()
    hosts = groups_config.get(group, [])
    if not hosts:
        return {
            'success': False,
            'error': f'Group {group} not found or empty'
        }
    results = {
        'hosts': {},
        'restarted_count': 0,
        'failed_count': 0,
        'success': True
    }
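    # Hosts are restarted strictly one at a time so the rest of the group keeps
    # serving traffic; each restart is followed by a health probe before the
    # loop moves on to the next host.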
    for host in hosts:
        logger.info(f"Restarting {service_name} on {host}...")
        # Restart service
        restart_result = execute_on_host(
            host,
            f"sudo systemctl restart {service_name} || sudo service {service_name} restart",
            timeout=30
        )

        # Health check
        time.sleep(5)  # Wait for service to start
        health_result = execute_on_host(
            host,
            f"sudo systemctl is-active {service_name} || sudo service {service_name} status",
            timeout=10
        )

        success = restart_result.get('success') and health_result.get('success')
        results['hosts'][host] = {
            'restart': restart_result,
            'health': health_result,
            'success': success
        }
        if success:
            results['restarted_count'] += 1
            logger.info(f"{host} restarted successfully")
        else:
            results['failed_count'] += 1
            results['success'] = False
            logger.error(f"{host} restart failed")

        # Wait before next restart (except after the last host)
        if host != hosts[-1]:
            time.sleep(wait_between)
    results['duration'] = format_duration(time.time() - start_time)
    return results


def health_check_workflow(group: str,
                          endpoint: str = "/health",
                          timeout: int = 10) -> Dict:
    """
    Check a health endpoint across a group.

    Args:
        group: Group to check
        endpoint: Health endpoint path
        timeout: Request timeout in seconds

    Returns:
        Dict with health check results

    Example:
        >>> result = health_check_workflow("production", "/health")
        >>> result['healthy_count']
        3
    """
    from utils.helpers import parse_sshsync_config
    groups_config = parse_sshsync_config()
    hosts = groups_config.get(group, [])
    if not hosts:
        return {
            'success': False,
            'error': f'Group {group} not found or empty'
        }
    results = {
        'hosts': {},
        'healthy_count': 0,
        'unhealthy_count': 0
    }
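    # curl's -w '%{http_code}' prints only the HTTP status code (the doubled
    # braces below escape the f-string); port 3000 matches the app port used
    # in deploy_workflow.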
    for host in hosts:
        health_result = execute_on_host(
            host,
            f"curl -f -s -o /dev/null -w '%{{http_code}}' http://localhost:3000{endpoint}",
            timeout=timeout
        )
        is_healthy = (
            health_result.get('success') and
            '200' in health_result.get('stdout', '')
        )
        results['hosts'][host] = {
            'healthy': is_healthy,
            'response': health_result.get('stdout', '').strip()
        }
        if is_healthy:
            results['healthy_count'] += 1
        else:
            results['unhealthy_count'] += 1
    results['success'] = results['unhealthy_count'] == 0
    return results


def main():
    """Test workflow executor functions."""
    print("Testing workflow executor...\n")
    print("Note: Workflow executor requires configured hosts and groups.")
    print("Tests would execute real operations, so no live workflows are run here.\n")
    print("✅ Workflow executor ready")


if __name__ == "__main__":
    main()