#!/usr/bin/env python3
"""
Workflow executor for Tailscale SSH Sync Agent.

Common multi-machine workflow automation: deployment, backup, one-to-many
sync, rolling restarts, and health checks.
"""

import sys
from pathlib import Path
from typing import Dict, List, Optional
import time
import logging

# Add utils to path
sys.path.insert(0, str(Path(__file__).parent))

from utils.helpers import format_duration, get_timestamp
from sshsync_wrapper import execute_on_group, execute_on_host, push_to_hosts
from load_balancer import get_group_capacity

logger = logging.getLogger(__name__)


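# Note on the sshsync_wrapper contract assumed below: push_to_hosts,
# execute_on_group, execute_on_host and pull_from_host are treated as
# returning dicts carrying at least a 'success' key (and, for execute_*,
# a 'stdout' key); nothing else about their return values is relied on.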
def deploy_workflow(code_path: str,
                    staging_group: str,
                    prod_group: str,
                    run_tests: bool = True) -> Dict:
    """
    Full deployment pipeline: staging → test → production.

    Args:
        code_path: Path to code to deploy
        staging_group: Staging server group
        prod_group: Production server group
        run_tests: Whether to run tests on staging

    Returns:
        Dict with deployment results

    Example:
        >>> result = deploy_workflow("./dist", "staging", "production")
        >>> result['success']
        True
        >>> result['duration']
        "12m 45s"
    """
    start_time = time.time()
    results = {
        'stages': {},
        'success': False,
        'start_time': get_timestamp()
    }

    try:
        # Stage 1: Deploy to staging
        logger.info("Stage 1: Deploying to staging...")
        stage1 = push_to_hosts(
            local_path=code_path,
            remote_path="/var/www/app",
            group=staging_group,
            recurse=True
        )

        results['stages']['staging_deploy'] = stage1

        if not stage1.get('success'):
            results['error'] = 'Staging deployment failed'
            return results

        # Build on staging
        logger.info("Building on staging...")
        build_result = execute_on_group(
            staging_group,
            "cd /var/www/app && npm run build",
            timeout=300
        )

        results['stages']['staging_build'] = build_result

        if not build_result.get('success'):
            results['error'] = 'Staging build failed'
            return results

        # Stage 2: Run tests (if enabled)
        if run_tests:
            logger.info("Stage 2: Running tests...")
            test_result = execute_on_group(
                staging_group,
                "cd /var/www/app && npm test",
                timeout=600
            )

            results['stages']['tests'] = test_result

            if not test_result.get('success'):
                results['error'] = 'Tests failed on staging'
                return results

        # Stage 3: Validate staging health before touching production
        logger.info("Stage 3: Validating staging...")
        health_result = execute_on_group(
            staging_group,
            "curl -f http://localhost:3000/health",
            timeout=10
        )

        results['stages']['staging_validation'] = health_result

        if not health_result.get('success'):
            results['error'] = 'Staging health check failed'
            return results

        # Stage 4: Deploy to production
        logger.info("Stage 4: Deploying to production...")
        prod_deploy = push_to_hosts(
            local_path=code_path,
            remote_path="/var/www/app",
            group=prod_group,
            recurse=True
        )

        results['stages']['production_deploy'] = prod_deploy

        if not prod_deploy.get('success'):
            results['error'] = 'Production deployment failed'
            return results

        # Build and restart on production
        logger.info("Building and restarting production...")
        prod_build = execute_on_group(
            prod_group,
            "cd /var/www/app && npm run build && pm2 restart app",
            timeout=300
        )

        results['stages']['production_build'] = prod_build

        if not prod_build.get('success'):
            results['error'] = 'Production build/restart failed'
            return results

        # Stage 5: Production verification
        logger.info("Stage 5: Verifying production...")
        prod_health = execute_on_group(
            prod_group,
            "curl -f http://localhost:3000/health",
            timeout=15
        )

        results['stages']['production_verification'] = prod_health

        if not prod_health.get('success'):
            results['error'] = 'Production health check failed'
            return results

        # Success!
        results['success'] = True
        results['duration'] = format_duration(time.time() - start_time)

        return results

    except Exception as e:
        logger.error(f"Deployment workflow error: {e}")
        results['error'] = str(e)
        results['duration'] = format_duration(time.time() - start_time)
        return results


def backup_workflow(hosts: List[str],
                    backup_paths: List[str],
                    destination: str) -> Dict:
    """
    Backup files from multiple hosts.

    Args:
        hosts: List of hosts to backup from
        backup_paths: Paths to backup on each host
        destination: Local destination directory

    Returns:
        Dict with backup results

    Example:
        >>> result = backup_workflow(
        ...     ["db-01", "db-02"],
        ...     ["/var/lib/mysql"],
        ...     "./backups"
        ... )
        >>> result['backed_up_hosts']
        2
    """
    from sshsync_wrapper import pull_from_host

    start_time = time.time()
    results = {
        'hosts': {},
        'success': True,
        'backed_up_hosts': 0
    }

    for host in hosts:
        host_results = []

        # One timestamped backup directory per host per run
        timestamp = time.strftime("%Y%m%d_%H%M%S")

        for backup_path in backup_paths:
            # Each remote path gets its own subdirectory so multiple paths
            # from the same host cannot overwrite each other
            host_dest = f"{destination}/{host}_{timestamp}/{Path(backup_path).name}"

            result = pull_from_host(
                host=host,
                remote_path=backup_path,
                local_path=host_dest,
                recurse=True
            )

            host_results.append(result)

            if not result.get('success'):
                results['success'] = False

        results['hosts'][host] = host_results

        if host_results and all(r.get('success') for r in host_results):
            results['backed_up_hosts'] += 1

    results['duration'] = format_duration(time.time() - start_time)

    return results


def sync_workflow(source_host: str,
                  target_group: str,
                  paths: List[str]) -> Dict:
    """
    Sync files from one host to many.

    Args:
        source_host: Host to pull from
        target_group: Group to push to
        paths: Paths to sync

    Returns:
        Dict with sync results

    Example:
        >>> result = sync_workflow(
        ...     "master-db",
        ...     "replica-dbs",
        ...     ["/var/lib/mysql/config"]
        ... )
        >>> result['success']
        True
    """
    from sshsync_wrapper import pull_from_host
    import tempfile

    start_time = time.time()
    results = {'paths': {}, 'success': True}

    # Stage files in a temporary directory between pull and push
    with tempfile.TemporaryDirectory() as temp_dir:
        for path in paths:
            staging_path = f"{temp_dir}/{Path(path).name}"

            # Pull from source
            pull_result = pull_from_host(
                host=source_host,
                remote_path=path,
                local_path=staging_path,
                recurse=True
            )

            if not pull_result.get('success'):
                results['paths'][path] = {
                    'success': False,
                    'error': 'Pull from source failed'
                }
                results['success'] = False
                continue

            # Push to targets
            push_result = push_to_hosts(
                local_path=staging_path,
                remote_path=path,
                group=target_group,
                recurse=True
            )

            results['paths'][path] = {
                'pull': pull_result,
                'push': push_result,
                'success': push_result.get('success', False)
            }

            if not push_result.get('success'):
                results['success'] = False

    results['duration'] = format_duration(time.time() - start_time)

    return results


def rolling_restart(group: str,
                    service_name: str,
                    wait_between: int = 30) -> Dict:
    """
    Zero-downtime rolling restart of a service across a group.

    Args:
        group: Group to restart
        service_name: Service name (e.g., "nginx", "app")
        wait_between: Seconds to wait between restarts

    Returns:
        Dict with restart results

    Example:
        >>> result = rolling_restart("web-servers", "nginx")
        >>> result['restarted_count']
        3
    """
    from utils.helpers import parse_sshsync_config

    start_time = time.time()
    groups_config = parse_sshsync_config()
    hosts = groups_config.get(group, [])

    if not hosts:
        return {
            'success': False,
            'error': f'Group {group} not found or empty'
        }

    results = {
        'hosts': {},
        'restarted_count': 0,
        'failed_count': 0,
        'success': True
    }

    for host in hosts:
        logger.info(f"Restarting {service_name} on {host}...")

        # Restart service (systemd first, SysV init as fallback)
        restart_result = execute_on_host(
            host,
            f"sudo systemctl restart {service_name} || sudo service {service_name} restart",
            timeout=30
        )

        # Health check
        time.sleep(5)  # Wait for the service to come up

        health_result = execute_on_host(
            host,
            f"sudo systemctl is-active {service_name} || sudo service {service_name} status",
            timeout=10
        )

        success = restart_result.get('success') and health_result.get('success')

        results['hosts'][host] = {
            'restart': restart_result,
            'health': health_result,
            'success': success
        }

        if success:
            results['restarted_count'] += 1
            logger.info(f"✓ {host} restarted successfully")
        else:
            results['failed_count'] += 1
            results['success'] = False
            logger.error(f"✗ {host} restart failed")

        # Wait before the next restart (skip after the last host)
        if host != hosts[-1]:
            time.sleep(wait_between)

    results['duration'] = format_duration(time.time() - start_time)

    return results


def health_check_workflow(group: str,
                          endpoint: str = "/health",
                          timeout: int = 10) -> Dict:
    """
    Check a health endpoint across a group.

    Args:
        group: Group to check
        endpoint: Health endpoint path
        timeout: Request timeout in seconds

    Returns:
        Dict with health check results

    Example:
        >>> result = health_check_workflow("production", "/health")
        >>> result['healthy_count']
        3
    """
    from utils.helpers import parse_sshsync_config

    groups_config = parse_sshsync_config()
    hosts = groups_config.get(group, [])

    if not hosts:
        return {
            'success': False,
            'error': f'Group {group} not found or empty'
        }

    results = {
        'hosts': {},
        'healthy_count': 0,
        'unhealthy_count': 0
    }

    for host in hosts:
        # curl prints only the HTTP status code; -f makes HTTP errors exit non-zero
        health_result = execute_on_host(
            host,
            f"curl -f -s -o /dev/null -w '%{{http_code}}' http://localhost:3000{endpoint}",
            timeout=timeout
        )

        is_healthy = (
            health_result.get('success') and
            '200' in health_result.get('stdout', '')
        )

        results['hosts'][host] = {
            'healthy': is_healthy,
            'response': health_result.get('stdout', '').strip()
        }

        if is_healthy:
            results['healthy_count'] += 1
        else:
            results['unhealthy_count'] += 1

    results['success'] = results['unhealthy_count'] == 0

    return results


def main():
    """Test workflow executor functions."""
    print("Testing workflow executor...\n")

    print("Note: Workflow executor requires configured hosts and groups.")
    print("Running the workflows here would execute real SSH operations, "
          "so only example invocations are shown below.\n")

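    # Example invocations (illustrative only -- host and group names are
    # taken from the docstring examples above, not from a real sshsync config):
    examples = [
        'deploy_workflow("./dist", "staging", "production")',
        'backup_workflow(["db-01", "db-02"], ["/var/lib/mysql"], "./backups")',
        'sync_workflow("master-db", "replica-dbs", ["/var/lib/mysql/config"])',
        'rolling_restart("web-servers", "nginx", wait_between=30)',
        'health_check_workflow("production", "/health")',
    ]
    for example in examples:
        print(f"  {example}")
    print()
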
    print("✅ Workflow executor ready")


if __name__ == "__main__":
    main()