From 14c678ceac362b99ae37a448a6859dfa1227898b Mon Sep 17 00:00:00 2001 From: Zhongwei Li Date: Sat, 29 Nov 2025 18:47:40 +0800 Subject: [PATCH] Initial commit --- .claude-plugin/plugin.json | 12 + CHANGELOG.md | 163 +++ DECISIONS.md | 458 +++++++ INSTALLATION.md | 707 ++++++++++ README.md | 3 + SKILL.md | 1204 +++++++++++++++++ VERSION | 1 + plugin.lock.json | 117 ++ references/sshsync-guide.md | 466 +++++++ references/tailscale-integration.md | 468 +++++++ scripts/load_balancer.py | 378 ++++++ scripts/sshsync_wrapper.py | 409 ++++++ scripts/tailscale_manager.py | 426 ++++++ scripts/utils/helpers.py | 628 +++++++++ scripts/utils/validators/__init__.py | 43 + .../utils/validators/connection_validator.py | 275 ++++ scripts/utils/validators/host_validator.py | 232 ++++ .../utils/validators/parameter_validator.py | 363 +++++ scripts/workflow_executor.py | 445 ++++++ tests/test_helpers.py | 180 +++ tests/test_integration.py | 346 +++++ tests/test_validation.py | 177 +++ 22 files changed, 7501 insertions(+) create mode 100644 .claude-plugin/plugin.json create mode 100644 CHANGELOG.md create mode 100644 DECISIONS.md create mode 100644 INSTALLATION.md create mode 100644 README.md create mode 100644 SKILL.md create mode 100644 VERSION create mode 100644 plugin.lock.json create mode 100644 references/sshsync-guide.md create mode 100644 references/tailscale-integration.md create mode 100644 scripts/load_balancer.py create mode 100644 scripts/sshsync_wrapper.py create mode 100644 scripts/tailscale_manager.py create mode 100644 scripts/utils/helpers.py create mode 100644 scripts/utils/validators/__init__.py create mode 100644 scripts/utils/validators/connection_validator.py create mode 100644 scripts/utils/validators/host_validator.py create mode 100644 scripts/utils/validators/parameter_validator.py create mode 100644 scripts/workflow_executor.py create mode 100644 tests/test_helpers.py create mode 100644 tests/test_integration.py create mode 100644 tests/test_validation.py diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..f61a7ec --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,12 @@ +{ + "name": "tailscale-sshsync-agent", + "description": "Manages distributed workloads and file sharing across Tailscale SSH-connected machines. Automates remote command execution, intelligent load balancing, file synchronization workflows, host health monitoring, and multi-machine orchestration using sshsync.", + "version": "0.0.0-2025.11.28", + "author": { + "name": "William VanSickle III", + "email": "noreply@humanfrontierlabs.com" + }, + "skills": [ + "./" + ] +} \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..281b1f4 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,163 @@ +# Changelog + +All notable changes to Tailscale SSH Sync Agent will be documented here. + +Format based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +Versioning follows [Semantic Versioning](https://semver.org/). 
+ +## [1.0.0] - 2025-10-19 + +### Added + +**Core Functionality:** +- `sshsync_wrapper.py`: Python interface to sshsync CLI operations + - `get_host_status()`: Check online/offline status of hosts + - `execute_on_all()`: Run commands on all configured hosts + - `execute_on_group()`: Run commands on specific groups + - `execute_on_host()`: Run commands on single host + - `push_to_hosts()`: Push files to multiple hosts (with groups support) + - `pull_from_host()`: Pull files from hosts + - `list_hosts()`: List all configured hosts + - `get_groups()`: Get group configuration + +- `tailscale_manager.py`: Tailscale-specific operations + - `get_tailscale_status()`: Get complete network status + - `check_connectivity()`: Ping hosts via Tailscale + - `get_peer_info()`: Get detailed peer information + - `list_online_machines()`: List all online Tailscale machines + - `validate_tailscale_ssh()`: Check if Tailscale SSH works for a host + - `get_network_summary()`: Human-readable network summary + +- `load_balancer.py`: Intelligent task distribution + - `get_machine_load()`: Get CPU, memory, disk metrics for a machine + - `select_optimal_host()`: Pick best host based on current load + - `get_group_capacity()`: Get aggregate capacity of a group + - `distribute_tasks()`: Distribute multiple tasks optimally across hosts + - `format_load_report()`: Format load metrics as human-readable report + +- `workflow_executor.py`: Common multi-machine workflows + - `deploy_workflow()`: Full deployment pipeline (staging → test → production) + - `backup_workflow()`: Backup files from multiple hosts + - `sync_workflow()`: Sync files from one host to many + - `rolling_restart()`: Zero-downtime service restart across group + - `health_check_workflow()`: Check health endpoints across group + +**Utilities:** +- `utils/helpers.py`: Common formatting and parsing functions + - Byte formatting (`format_bytes`) + - Duration formatting (`format_duration`) + - Percentage formatting (`format_percentage`) + - SSH config parsing (`parse_ssh_config`) + - sshsync config parsing (`parse_sshsync_config`) + - System metrics parsing (`parse_disk_usage`, `parse_memory_usage`, `parse_cpu_load`) + - Load score calculation (`calculate_load_score`) + - Status classification (`classify_load_status`, `classify_latency`) + - Safe command execution (`run_command`, `safe_execute`) + +- `utils/validators/`: Comprehensive validation system + - `parameter_validator.py`: Input validation (hosts, groups, paths, timeouts, commands) + - `host_validator.py`: Host configuration and availability validation + - `connection_validator.py`: SSH and Tailscale connection validation + +**Testing:** +- `tests/test_integration.py`: 11 end-to-end integration tests +- `tests/test_helpers.py`: 11 helper function tests +- `tests/test_validation.py`: 7 validation tests +- **Total: 29 tests** covering all major functionality + +**Documentation:** +- `SKILL.md`: Complete skill documentation (6,000+ words) + - When to use this skill + - How it works + - Data sources (sshsync CLI, Tailscale) + - Detailed workflows for each operation type + - Available scripts and functions + - Error handling and validations + - Performance and caching strategies + - Usage examples +- `references/sshsync-guide.md`: Complete sshsync CLI reference +- `references/tailscale-integration.md`: Tailscale integration guide +- `README.md`: Installation and quick start guide +- `INSTALLATION.md`: Detailed setup tutorial +- `DECISIONS.md`: Architecture decisions and rationale + +### Data Sources + 
+**sshsync CLI:** +- Installation: `pip install sshsync` +- Configuration: `~/.config/sshsync/config.yaml` +- SSH config integration: `~/.ssh/config` +- Group-based host management +- Remote command execution with timeouts +- File push/pull operations (single or recursive) +- Status checking and connectivity validation + +**Tailscale:** +- Zero-config VPN with WireGuard encryption +- MagicDNS for easy host addressing +- Built-in SSH capabilities +- Seamless integration with standard SSH +- Peer-to-peer connections +- Works across NATs and firewalls + +### Coverage + +**Operations:** +- Host status monitoring and availability checks +- Intelligent load-based task distribution +- Multi-host command execution (all hosts, groups, individual) +- File synchronization workflows (push/pull) +- Deployment pipelines (staging → production) +- Backup and sync workflows +- Rolling restarts with zero downtime +- Health checking across services + +**Geographic Coverage:** All hosts in Tailscale network (global) + +**Temporal Coverage:** Real-time status and operations + +### Known Limitations + +**v1.0.0:** +- sshsync must be installed separately (`pip install sshsync`) +- Tailscale must be configured separately +- SSH keys must be set up manually on each host +- Load balancing uses simple metrics (CPU, memory, disk) +- No built-in monitoring dashboards (terminal output only) +- No persistence of operation history (logs only) +- Requires SSH config and sshsync config to be manually maintained + +### Planned for v2.0 + +**Enhanced Features:** +- Automated SSH key distribution across hosts +- Built-in operation history and logging database +- Web dashboard for monitoring and operations +- Advanced load balancing with custom metrics +- Scheduled operations and cron integration +- Operation rollback capabilities +- Integration with configuration management tools (Ansible, Terraform) +- Cost tracking for cloud resources +- Performance metrics collection and visualization +- Alert system for failed operations +- Multi-tenancy support for team environments + +**Integrations:** +- Prometheus metrics export +- Grafana dashboard templates +- Slack/Discord notifications +- CI/CD pipeline integration +- Container orchestration support (Docker, Kubernetes) + +## [Unreleased] + +### Planned + +- Add support for Windows hosts (PowerShell remoting) +- Improve performance for large host groups (100+) +- Add SSH connection pooling for faster operations +- Implement operation queueing for long-running tasks +- Add support for custom validation plugins +- Expand coverage to Docker containers via SSH +- Add retry strategies with exponential backoff +- Implement circuit breaker pattern for failing hosts diff --git a/DECISIONS.md b/DECISIONS.md new file mode 100644 index 0000000..2fd49ea --- /dev/null +++ b/DECISIONS.md @@ -0,0 +1,458 @@ +# Architecture Decisions + +Documentation of all technical decisions made for Tailscale SSH Sync Agent. 
+ +## Tool Selection + +### Selected Tool: sshsync + +**Justification:** + +✅ **Advantages:** +- **Ready-to-use**: Available via `pip install sshsync` +- **Group management**: Built-in support for organizing hosts into groups +- **Integration**: Works with existing SSH config (`~/.ssh/config`) +- **Simple API**: Easy-to-wrap CLI interface +- **Parallel execution**: Commands run concurrently across hosts +- **File operations**: Push/pull with recursive support +- **Timeout handling**: Per-command timeouts for reliability +- **Active maintenance**: Regular updates and bug fixes +- **Python-based**: Easy to extend and integrate + +✅ **Coverage:** +- All SSH-accessible hosts +- Works with any SSH server (Linux, macOS, BSD, etc.) +- Platform-agnostic (runs on any OS with Python) + +✅ **Cost:** +- Free and open-source +- No API keys or subscriptions required +- No rate limits + +✅ **Documentation:** +- Clear command-line interface +- PyPI documentation available +- GitHub repository with examples + +**Alternatives Considered:** + +❌ **Fabric (Python library)** +- Pros: Pure Python, very flexible +- Cons: Requires writing more code, no built-in group management +- **Rejected because**: sshsync provides ready-made functionality + +❌ **Ansible** +- Pros: Industry standard, very powerful +- Cons: Requires learning YAML playbooks, overkill for simple operations +- **Rejected because**: Too heavyweight for ad-hoc commands and file transfers + +❌ **pssh (parallel-ssh)** +- Pros: Simple parallel SSH +- Cons: No group management, no file transfer built-in, less actively maintained +- **Rejected because**: sshsync has better group management and file operations + +❌ **Custom SSH wrapper** +- Pros: Full control +- Cons: Reinventing the wheel, maintaining parallel execution logic +- **Rejected because**: sshsync already provides what we need + +**Conclusion:** + +sshsync is the best tool for this use case because it: +1. Provides group-based host management out of the box +2. Handles parallel execution automatically +3. Integrates with existing SSH configuration +4. Supports both command execution and file transfers +5. 
Requires minimal wrapper code + +## Integration: Tailscale + +**Decision**: Integrate with Tailscale for network connectivity + +**Justification:** + +✅ **Why Tailscale:** +- **Zero-config VPN**: No manual firewall/NAT configuration +- **Secure by default**: WireGuard encryption +- **Works everywhere**: Coffee shop, home, office, cloud +- **MagicDNS**: Easy addressing (machine-name.tailnet.ts.net) +- **Standard SSH**: Works with all SSH tools including sshsync +- **No overhead**: Uses regular SSH protocol over Tailscale network + +✅ **Integration approach:** +- Tailscale provides the network layer +- Standard SSH works over Tailscale +- sshsync operates normally using Tailscale hostnames/IPs +- No Tailscale-specific code needed in core operations +- Tailscale status checking for diagnostics + +**Alternatives:** + +❌ **Direct public internet + port forwarding** +- Cons: Complex firewall setup, security risks, doesn't work on mobile/restricted networks +- **Rejected because**: Requires too much configuration and has security concerns + +❌ **Other VPNs (WireGuard, OpenVPN, ZeroTier)** +- Cons: More manual configuration, less zero-config +- **Rejected because**: Tailscale is easier to set up and use + +**Conclusion:** + +Tailscale + standard SSH is the optimal combination: +- Secure connectivity without configuration +- Works with existing SSH tools +- No vendor lock-in (can use other VPNs if needed) + +## Architecture + +### Structure: Modular Scripts + Utilities + +**Decision**: Separate concerns into focused modules + +``` +scripts/ +├── sshsync_wrapper.py # sshsync CLI interface +├── tailscale_manager.py # Tailscale operations +├── load_balancer.py # Task distribution logic +├── workflow_executor.py # Common workflows +└── utils/ + ├── helpers.py # Formatting, parsing + └── validators/ # Input validation +``` + +**Justification:** + +✅ **Modularity:** +- Each script has single responsibility +- Easy to test independently +- Easy to extend without breaking others + +✅ **Reusability:** +- Helpers used across all scripts +- Validators prevent duplicate validation logic +- Workflows compose lower-level operations + +✅ **Maintainability:** +- Clear file organization +- Easy to locate specific functionality +- Separation of concerns + +**Alternatives:** + +❌ **Monolithic single script** +- Cons: Hard to test, hard to maintain, becomes too large +- **Rejected because**: Doesn't scale well + +❌ **Over-engineered class hierarchy** +- Cons: Unnecessary complexity for this use case +- **Rejected because**: Simple functions are sufficient + +**Conclusion:** + +Modular functional approach provides good balance of simplicity and maintainability. + +### Validation Strategy: Multi-Layer + +**Decision**: Validate at multiple layers + +**Layers:** + +1. **Parameter validation** (`parameter_validator.py`) + - Validates user inputs before any operations + - Prevents invalid hosts, groups, paths, etc. + +2. **Host validation** (`host_validator.py`) + - Validates SSH configuration exists + - Checks host reachability + - Validates group membership + +3. 
**Connection validation** (`connection_validator.py`) + - Tests actual SSH connectivity + - Verifies Tailscale status + - Checks SSH key authentication + +**Justification:** + +✅ **Early failure:** +- Catch errors before expensive operations +- Clear error messages at each layer + +✅ **Comprehensive:** +- Multiple validation points catch different issues +- Reduces runtime failures + +✅ **User-friendly:** +- Helpful error messages with suggestions +- Clear indication of what went wrong + +**Conclusion:** + +Multi-layer validation provides robust error handling and great user experience. + +## Load Balancing Strategy + +### Decision: Simple Composite Score + +**Formula:** +```python +score = (cpu_pct * 0.4) + (mem_pct * 0.3) + (disk_pct * 0.3) +``` + +**Weights:** +- CPU: 40% (most important for compute tasks) +- Memory: 30% (important for data processing) +- Disk: 30% (important for I/O operations) + +**Justification:** + +✅ **Simple and effective:** +- Easy to understand +- Fast to calculate +- Works well for most workloads + +✅ **Balanced:** +- Considers multiple resource types +- No single metric dominates + +**Alternatives:** + +❌ **CPU only** +- Cons: Ignores memory-bound and I/O-bound tasks +- **Rejected because**: Too narrow + +❌ **Complex ML-based prediction** +- Cons: Overkill, slow, requires training data +- **Rejected because**: Unnecessary complexity + +❌ **Fixed round-robin** +- Cons: Doesn't consider actual load +- **Rejected because**: Can overload already-busy hosts + +**Conclusion:** + +Simple weighted score provides good balance without complexity. + +## Error Handling Philosophy + +### Decision: Graceful Degradation + Clear Messages + +**Principles:** + +1. **Fail early with validation**: Catch errors before operations +2. **Isolate failures**: One host failure doesn't stop others +3. **Clear messages**: Tell user exactly what went wrong and how to fix +4. **Automatic retry**: Retry transient errors (network, timeout) +5. **Dry-run support**: Preview operations before execution + +**Implementation:** + +```python +# Example error handling pattern +try: + validate_host(host) + validate_ssh_connection(host) + result = execute_command(host, command) +except ValidationError as e: + return {'error': str(e), 'suggestion': 'Fix: ...'} +except ConnectionError as e: + return {'error': str(e), 'diagnostics': get_diagnostics(host)} +``` + +**Justification:** + +✅ **Better UX:** +- Users know exactly what's wrong +- Suggestions help fix issues quickly + +✅ **Reliability:** +- Automatic retry handles transient issues +- Dry-run prevents mistakes + +✅ **Debugging:** +- Clear error messages speed up troubleshooting +- Diagnostics provide actionable information + +**Conclusion:** + +Graceful degradation with helpful messages creates better user experience. 
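+
+To make principle 4 concrete, here is a minimal sketch of the retry-with-exponential-backoff pattern, assuming transient failures surface as `ConnectionError` or `TimeoutError`; the attempt count and delays are illustrative defaults, not values the scripts are guaranteed to use.
+
+```python
+import time
+
+def retry_with_backoff(func, *args, attempts=3, base_delay=1.0, **kwargs):
+    """Retry a call that may fail transiently, backing off exponentially.
+
+    Validation errors are deliberately not caught here: they should fail
+    fast, per the fail-early principle above.
+    """
+    for attempt in range(1, attempts + 1):
+        try:
+            return func(*args, **kwargs)
+        except (ConnectionError, TimeoutError):
+            if attempt == attempts:
+                raise  # transient error persisted; surface it to the caller
+            time.sleep(base_delay * 2 ** (attempt - 1))  # 1s, 2s, 4s, ...
+
+# Example usage with a remote call:
+# result = retry_with_backoff(execute_on_host, "web-01", "uptime")
+```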
+ +## Caching Strategy + +**Decision**: Minimal caching for real-time accuracy + +**What we cache:** +- Nothing (v1.0.0) + +**Why no caching:** +- Host status changes frequently +- Load metrics change constantly +- Operations need real-time data +- Cache invalidation is complex + +**Future consideration (v2.0):** +- Cache Tailscale status (60s TTL) +- Cache group configuration (5min TTL) +- Cache SSH config parsing (5min TTL) + +**Justification:** + +✅ **Simplicity:** +- No cache invalidation logic needed +- No stale data issues + +✅ **Accuracy:** +- Always get current state +- No surprises from cached data + +**Trade-off:** +- Slightly slower repeated operations +- More network calls + +**Conclusion:** + +For v1.0.0, simplicity and accuracy outweigh performance concerns. Real-time data is more valuable than speed. + +## Testing Strategy + +### Decision: Comprehensive Unit + Integration Tests + +**Coverage:** + +- **29 tests total:** + - 11 integration tests (end-to-end workflows) + - 11 helper tests (formatting, parsing, calculations) + - 7 validation tests (input validation, safety checks) + +**Test Philosophy:** + +1. **Test real functionality**: Integration tests use actual functions +2. **Test edge cases**: Validation tests cover error conditions +3. **Test helpers**: Ensure formatting/parsing works correctly +4. **Fast execution**: All tests run in < 10 seconds +5. **No external dependencies**: Tests don't require Tailscale or sshsync to be running + +**Justification:** + +✅ **Confidence:** +- Tests verify code works as expected +- Catches regressions when modifying code + +✅ **Documentation:** +- Tests show how to use functions +- Examples of expected behavior + +✅ **Reliability:** +- Production-ready code from v1.0.0 + +**Conclusion:** + +Comprehensive testing ensures reliable code from the start. + +## Performance Considerations + +### Parallel Execution + +**Decision**: Leverage sshsync's built-in parallelization + +- sshsync runs commands concurrently across hosts automatically +- No need to implement custom threading/multiprocessing +- Timeout applies per-host independently + +**Trade-offs:** + +✅ **Pros:** +- Simple to use +- Fast for large host groups +- No concurrency bugs + +⚠️ **Cons:** +- Less control over parallelism level +- Can overwhelm network with too many concurrent connections + +**Conclusion:** + +Built-in parallelization is sufficient for most use cases. Custom control can be added in v2.0 if needed. + +## Security Considerations + +### SSH Key Authentication + +**Decision**: Require SSH keys (no password auth) + +**Justification:** + +✅ **Security:** +- Keys are more secure than passwords +- Can't be brute-forced +- Can be revoked per-host + +✅ **Automation:** +- Non-interactive (no password prompts) +- Works in scripts and CI/CD + +**Implementation:** +- Validators check SSH key auth works +- Clear error messages guide users to set up keys +- Documentation explains SSH key setup + +### Command Safety + +**Decision**: Validate dangerous commands + +**Dangerous patterns blocked:** +- `rm -rf /` (root deletion) +- `mkfs.*` (filesystem formatting) +- `dd.*of=/dev/` (direct disk writes) +- Fork bombs +- Direct disk writes + +**Override**: Use `allow_dangerous=True` to bypass + +**Justification:** + +✅ **Safety:** +- Prevents accidental destructive operations +- Dry-run provides preview + +✅ **Flexibility:** +- Can still run dangerous commands if explicitly allowed + +**Conclusion:** + +Safety by default with escape hatch for advanced users. 
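+
+As an illustration of this decision, here is a minimal sketch of the pattern-based safety check with the `allow_dangerous` escape hatch; the regexes shown are simplified stand-ins for the real blocklist in `parameter_validator.py`, and the exact signature is an assumption.
+
+```python
+import re
+
+# Simplified stand-ins for the blocked patterns listed above
+DANGEROUS_PATTERNS = [
+    r"rm\s+-rf\s+/(\s|$)",    # root deletion
+    r"\bmkfs(\.\w+)?\b",      # filesystem formatting
+    r"\bdd\b.*\bof=/dev/",    # direct disk writes
+    r":\(\)\s*\{.*\};\s*:",   # classic fork bomb
+]
+
+def validate_command(command: str, allow_dangerous: bool = False) -> str:
+    """Reject commands matching known-destructive patterns unless overridden."""
+    if allow_dangerous:
+        return command
+    for pattern in DANGEROUS_PATTERNS:
+        if re.search(pattern, command):
+            raise ValueError(
+                f"Blocked potentially destructive command (matched {pattern!r}); "
+                "pass allow_dangerous=True to override"
+            )
+    return command
+```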
+ +## Decisions Summary + +| Decision | Choice | Rationale | +|----------|--------|-----------| +| **CLI Tool** | sshsync | Best balance of features, ease of use, and maintenance | +| **Network** | Tailscale | Zero-config secure VPN, works everywhere | +| **Architecture** | Modular scripts | Clear separation of concerns, maintainable | +| **Validation** | Multi-layer | Catch errors early with helpful messages | +| **Load Balancing** | Composite score | Simple, effective, considers multiple resources | +| **Caching** | None (v1.0) | Simplicity and real-time accuracy | +| **Testing** | 29 tests | Comprehensive coverage for reliability | +| **Security** | SSH keys + validation | Secure and automation-friendly | + +## Trade-offs Accepted + +1. **No caching** → Slightly slower, but always accurate +2. **sshsync dependency** → External tool, but saves development time +3. **SSH key requirement** → Setup needed, but more secure +4. **Simple load balancing** → Less sophisticated, but fast and easy to understand +5. **Terminal UI only** → No web dashboard, but simpler to develop and maintain + +## Future Improvements + +### v2.0 Considerations + +1. **Add caching** for frequently-accessed data (Tailscale status, groups) +2. **Web dashboard** for visualization and monitoring +3. **Operation history** database for audit trail +4. **Advanced load balancing** with custom metrics +5. **Automated SSH key distribution** across hosts +6. **Integration with config management** tools (Ansible, Terraform) +7. **Container support** via SSH to Docker containers +8. **Custom validation plugins** for domain-specific checks + +All decisions prioritize **simplicity**, **security**, and **maintainability** for v1.0.0. diff --git a/INSTALLATION.md b/INSTALLATION.md new file mode 100644 index 0000000..4592bdd --- /dev/null +++ b/INSTALLATION.md @@ -0,0 +1,707 @@ +# Installation Guide + +Complete step-by-step tutorial for setting up Tailscale SSH Sync Agent. + +## Table of Contents + +1. [Prerequisites](#prerequisites) +2. [Step 1: Install Tailscale](#step-1-install-tailscale) +3. [Step 2: Install sshsync](#step-2-install-sshsync) +4. [Step 3: Configure SSH](#step-3-configure-ssh) +5. [Step 4: Configure sshsync Groups](#step-4-configure-sshsync-groups) +6. [Step 5: Install Agent](#step-5-install-agent) +7. [Step 6: Test Installation](#step-6-test-installation) +8. [Troubleshooting](#troubleshooting) + +## Prerequisites + +Before you begin, ensure you have: + +- **Operating System**: macOS, Linux, or BSD +- **Python**: Version 3.10 or higher +- **pip**: Python package installer +- **Claude Code**: Installed and running +- **Remote machines**: At least one machine you want to manage +- **SSH access**: Ability to SSH to remote machines + +**Check Python version**: +```bash +python3 --version +# Should show: Python 3.10.x or higher +``` + +**Check pip**: +```bash +pip3 --version +# Should show: pip xx.x.x from ... +``` + +## Step 1: Install Tailscale + +Tailscale provides secure networking between your machines. 
+
+### macOS
+
+```bash
+# Install via Homebrew
+brew install tailscale
+
+# Start the tailscaled daemon
+sudo brew services start tailscale
+
+# Authenticate and connect
+sudo tailscale up
+
+# Follow authentication link in terminal
+# This will open browser to log in
+```
+
+### Linux (Ubuntu/Debian)
+
+```bash
+# Install Tailscale
+curl -fsSL https://tailscale.com/install.sh | sh
+
+# Start and authenticate
+sudo tailscale up
+
+# Follow authentication link
+```
+
+### Linux (Fedora/RHEL)
+
+```bash
+# Add repository
+sudo dnf config-manager --add-repo https://pkgs.tailscale.com/stable/fedora/tailscale.repo
+
+# Install
+sudo dnf install tailscale
+
+# Enable and start
+sudo systemctl enable --now tailscaled
+sudo tailscale up
+```
+
+### Verify Installation
+
+```bash
+# Check Tailscale status
+tailscale status
+
+# Should show list of machines in your tailnet
+# Example output:
+# 100.64.1.10   homelab-1   user@   linux   -
+# 100.64.1.11   laptop      user@   macOS   -
+```
+
+**Important**: Install and authenticate Tailscale on **all machines** you want to manage.
+
+## Step 2: Install sshsync
+
+sshsync is the CLI tool for managing SSH operations across multiple hosts.
+
+```bash
+# Install via pip
+pip3 install sshsync
+
+# Or use pipx for isolated installation
+pipx install sshsync
+```
+
+### Verify Installation
+
+```bash
+# Check version
+sshsync --version
+
+# Should show: sshsync, version x.x.x
+```
+
+### Common Installation Issues
+
+**Issue**: `pip3: command not found`
+
+**Solution**:
+```bash
+# macOS
+brew install python3
+
+# Linux (Ubuntu/Debian)
+sudo apt install python3-pip
+
+# Linux (Fedora/RHEL)
+sudo dnf install python3-pip
+```
+
+**Issue**: Permission denied during install
+
+**Solution**:
+```bash
+# Install for current user only
+pip3 install --user sshsync
+
+# Or use pipx
+pip3 install --user pipx
+pipx install sshsync
+```
+
+## Step 3: Configure SSH
+
+SSH configuration defines how to connect to each machine.
+ +### Step 3.1: Generate SSH Keys (if you don't have them) + +```bash +# Generate ed25519 key (recommended) +ssh-keygen -t ed25519 -C "your_email@example.com" + +# Press Enter to use default location (~/.ssh/id_ed25519) +# Enter passphrase (or leave empty for no passphrase) +``` + +**Output**: +``` +Your identification has been saved in /Users/you/.ssh/id_ed25519 +Your public key has been saved in /Users/you/.ssh/id_ed25519.pub +``` + +### Step 3.2: Copy Public Key to Remote Machines + +For each remote machine: + +```bash +# Copy SSH key to remote +ssh-copy-id user@machine-hostname + +# Example: +ssh-copy-id admin@100.64.1.10 +``` + +**Manual method** (if ssh-copy-id doesn't work): + +```bash +# Display public key +cat ~/.ssh/id_ed25519.pub + +# SSH to remote machine +ssh user@remote-host + +# On remote machine: +mkdir -p ~/.ssh +chmod 700 ~/.ssh +echo "your-public-key-here" >> ~/.ssh/authorized_keys +chmod 600 ~/.ssh/authorized_keys +exit +``` + +### Step 3.3: Test SSH Connection + +```bash +# Test connection (should not ask for password) +ssh user@remote-host "hostname" + +# If successful, should print remote hostname +``` + +### Step 3.4: Create SSH Config File + +Edit `~/.ssh/config`: + +```bash +vim ~/.ssh/config +``` + +**Add host entries**: + +``` +# Production servers +Host prod-web-01 + HostName prod-web-01.tailnet.ts.net + User deploy + IdentityFile ~/.ssh/id_ed25519 + Port 22 + +Host prod-web-02 + HostName 100.64.1.21 + User deploy + IdentityFile ~/.ssh/id_ed25519 + +Host prod-db-01 + HostName 100.64.1.30 + User deploy + IdentityFile ~/.ssh/id_ed25519 + +# Development +Host dev-laptop + HostName dev-laptop.tailnet.ts.net + User developer + IdentityFile ~/.ssh/id_ed25519 + +Host dev-desktop + HostName 100.64.1.40 + User developer + IdentityFile ~/.ssh/id_ed25519 + +# Homelab +Host homelab-1 + HostName 100.64.1.10 + User admin + IdentityFile ~/.ssh/id_ed25519 + +Host homelab-2 + HostName 100.64.1.11 + User admin + IdentityFile ~/.ssh/id_ed25519 +``` + +**Important fields**: +- **Host**: Alias you'll use (e.g., "homelab-1") +- **HostName**: Actual hostname or IP (Tailscale hostname or IP) +- **User**: SSH username on remote machine +- **IdentityFile**: Path to SSH private key + +### Step 3.5: Set Correct Permissions + +```bash +# SSH config should be readable only by you +chmod 600 ~/.ssh/config + +# SSH directory permissions +chmod 700 ~/.ssh + +# Private key permissions +chmod 600 ~/.ssh/id_ed25519 + +# Public key permissions +chmod 644 ~/.ssh/id_ed25519.pub +``` + +### Step 3.6: Verify All Hosts + +Test each host in your config: + +```bash +# Test each host +ssh homelab-1 "echo 'Connection successful'" +ssh prod-web-01 "echo 'Connection successful'" +ssh dev-laptop "echo 'Connection successful'" + +# Should connect without asking for password +``` + +## Step 4: Configure sshsync Groups + +Groups organize your hosts for easy management. + +### Step 4.1: Initialize sshsync Configuration + +```bash +# Sync hosts and create groups +sshsync sync +``` + +**What this does**: +1. Reads all hosts from `~/.ssh/config` +2. Prompts you to assign hosts to groups +3. Creates `~/.config/sshsync/config.yaml` + +### Step 4.2: Follow Interactive Prompts + +``` +Found 7 ungrouped hosts: +1. homelab-1 +2. homelab-2 +3. prod-web-01 +4. prod-web-02 +5. prod-db-01 +6. dev-laptop +7. dev-desktop + +Assign groups now? 
[Y/n]: Y
+
+Enter group name for homelab-1 (or skip): homelab
+Enter group name for homelab-2 (or skip): homelab
+Enter group name for prod-web-01 (or skip): production,web
+Enter group name for prod-web-02 (or skip): production,web
+Enter group name for prod-db-01 (or skip): production,database
+Enter group name for dev-laptop (or skip): development
+Enter group name for dev-desktop (or skip): development
+```
+
+**Tips**:
+- Hosts can belong to multiple groups (separate with commas)
+- Use meaningful group names (production, development, web, database, homelab)
+- Skip hosts you don't want to group yet
+
+### Step 4.3: Verify Configuration
+
+```bash
+# View generated config
+cat ~/.config/sshsync/config.yaml
+```
+
+**Expected output**:
+```yaml
+groups:
+  production:
+    - prod-web-01
+    - prod-web-02
+    - prod-db-01
+  web:
+    - prod-web-01
+    - prod-web-02
+  database:
+    - prod-db-01
+  development:
+    - dev-laptop
+    - dev-desktop
+  homelab:
+    - homelab-1
+    - homelab-2
+```
+
+### Step 4.4: Test sshsync
+
+```bash
+# List hosts
+sshsync ls
+
+# List with status
+sshsync ls --with-status
+
+# Test command execution
+sshsync all "hostname"
+
+# Test group execution
+sshsync group homelab "uptime"
+```
+
+## Step 5: Install Agent
+
+### Step 5.1: Navigate to Agent Directory
+
+```bash
+cd /path/to/tailscale-sshsync-agent
+```
+
+### Step 5.2: Verify Agent Structure
+
+```bash
+# List files
+ls -la
+
+# Should see:
+# .claude-plugin/
+# scripts/
+# tests/
+# references/
+# SKILL.md
+# README.md
+# VERSION
+# CHANGELOG.md
+# etc.
+```
+
+### Step 5.3: Validate plugin.json
+
+```bash
+# Check JSON is valid
+python3 -c "import json; json.load(open('.claude-plugin/plugin.json')); print('✅ Valid JSON')"
+
+# Should output: ✅ Valid JSON
+```
+
+### Step 5.4: Install via Claude Code
+
+In Claude Code:
+
+```
+/plugin marketplace add /absolute/path/to/tailscale-sshsync-agent
+```
+
+**Example**:
+```
+/plugin marketplace add /Users/you/tailscale-sshsync-agent
+```
+
+**Expected output**:
+```
+✓ Plugin installed successfully
+✓ Skill: tailscale-sshsync-agent
+✓ Description: Manages distributed workloads and file sharing...
+```
+
+### Step 5.5: Verify Installation
+
+In Claude Code:
+
+```
+"Which of my machines are online?"
+```
+
+**Expected response**: Agent should activate and check your Tailscale network.
+
+## Step 6: Test Installation
+
+### Test 1: Host Status
+
+**Query**:
+```
+"Which of my machines are online?"
+```
+
+**Expected**: List of hosts with online/offline status
+
+### Test 2: List Groups
+
+**Query**:
+```
+"What groups do I have configured?"
+```
+
+**Expected**: List of your sshsync groups
+
+### Test 3: Execute Command
+
+**Query**:
+```
+"Check disk space on homelab machines"
+```
+
+**Expected**: Disk usage for hosts in homelab group
+
+### Test 4: Dry-Run
+
+**Query**:
+```
+"Show me what would happen if I ran 'uptime' on all machines (dry-run)"
+```
+
+**Expected**: Preview without execution
+
+### Test 5: Run Test Suite
+
+```bash
+cd /path/to/tailscale-sshsync-agent
+
+# Run all tests
+python3 tests/test_integration.py
+
+# Should show:
+# Results: 11/11 passed
+# 🎉 All tests passed!
+```
+
+## Troubleshooting
+
+### Agent Not Activating
+
+**Symptoms**: Agent doesn't respond to queries about machines/hosts
+
+**Solutions**:
+
+1. **Check installation**:
+   ```
+   /plugin list
+   ```
+   Should show `tailscale-sshsync-agent` in list.
+
+2. **Reinstall**:
+   ```
+   /plugin remove tailscale-sshsync-agent
+   /plugin marketplace add /path/to/tailscale-sshsync-agent
+   ```
+
+3. **Check plugin.json**:
+   ```bash
+   cat .claude-plugin/plugin.json
+   # Verify "description" field matches SKILL.md frontmatter
+   ```
+
+### SSH Connection Fails
+
+**Symptoms**: "Permission denied" or "Connection refused"
+
+**Solutions**:
+
+1. **Check SSH key**:
+   ```bash
+   ssh-add -l
+   # Should list your SSH key
+   ```
+
+   If not listed:
+   ```bash
+   ssh-add ~/.ssh/id_ed25519
+   ```
+
+2. **Test SSH directly**:
+   ```bash
+   ssh -v hostname
+   # -v shows verbose debug info
+   ```
+
+3. **Verify authorized_keys on remote**:
+   ```bash
+   ssh hostname "cat ~/.ssh/authorized_keys"
+   # Should contain your public key
+   ```
+
+### Tailscale Connection Issues
+
+**Symptoms**: Hosts show as offline in Tailscale
+
+**Solutions**:
+
+1. **Check Tailscale status**:
+   ```bash
+   tailscale status
+   ```
+
+2. **Restart Tailscale**:
+   ```bash
+   # macOS
+   brew services restart tailscale
+
+   # Linux
+   sudo systemctl restart tailscaled
+   ```
+
+3. **Re-authenticate**:
+   ```bash
+   sudo tailscale up
+   ```
+
+### sshsync Errors
+
+**Symptoms**: "sshsync: command not found"
+
+**Solutions**:
+
+1. **Reinstall sshsync**:
+   ```bash
+   pip3 install --upgrade sshsync
+   ```
+
+2. **Check PATH**:
+   ```bash
+   which sshsync
+   # Should show path to sshsync
+   ```
+
+   If not found, add to PATH:
+   ```bash
+   echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc
+   source ~/.bashrc
+   ```
+
+### Config File Issues
+
+**Symptoms**: "Group not found" or "Host not found"
+
+**Solutions**:
+
+1. **Verify SSH config**:
+   ```bash
+   cat ~/.ssh/config
+   # Check host aliases are correct
+   ```
+
+2. **Verify sshsync config**:
+   ```bash
+   cat ~/.config/sshsync/config.yaml
+   # Check groups are defined
+   ```
+
+3. **Re-sync**:
+   ```bash
+   sshsync sync
+   ```
+
+### Test Failures
+
+**Symptoms**: Tests fail with errors
+
+**Solutions**:
+
+1. **Check dependencies**:
+   ```bash
+   pip3 list | grep -E "sshsync|pyyaml"
+   ```
+
+2. **Check Python version**:
+   ```bash
+   python3 --version
+   # Must be 3.10+
+   ```
+
+3. **Run tests individually**:
+   ```bash
+   python3 tests/test_helpers.py
+   python3 tests/test_validation.py
+   python3 tests/test_integration.py
+   ```
+
+## Post-Installation
+
+### Recommended Next Steps
+
+1. **Create more groups** for better organization:
+   ```bash
+   sshsync gadd staging
+   sshsync gadd backup-servers
+   ```
+
+2. **Test file operations**:
+   ```
+   "Push test file to homelab machines (dry-run)"
+   ```
+
+3. **Set up automation**:
+   - Create scripts for common tasks
+   - Schedule backups
+   - Automate deployments
+
+4. **Review documentation**:
+   - Read `references/sshsync-guide.md` for advanced sshsync usage
+   - Read `references/tailscale-integration.md` for Tailscale tips
+
+### Security Checklist
+
+- ✅ SSH keys are password-protected
+- ✅ SSH config has correct permissions (600)
+- ✅ Private keys have correct permissions (600)
+- ✅ Tailscale ACLs configured (if using teams)
+- ✅ Only necessary hosts have SSH access
+- ✅ Regularly review connected devices in Tailscale
+
+## Summary
+
+You now have:
+
+1. ✅ Tailscale installed and connected
+2. ✅ sshsync installed and configured
+3. ✅ SSH keys set up on all machines
+4. ✅ SSH config with all hosts
+5. ✅ sshsync groups organized
+6. ✅ Agent installed in Claude Code
+7. ✅ Tests passing
+
+**Start using**:
+
+```
+"Which machines are online?"
+"Run this on the least loaded machine" +"Push files to production servers" +"Deploy to staging then production" +``` + +For more examples, see README.md and SKILL.md. + +## Support + +If you encounter issues: + +1. Check this troubleshooting section +2. Review references/ for detailed guides +3. Check DECISIONS.md for architecture rationale +4. Run tests to verify installation + +Happy automating! 🚀 diff --git a/README.md b/README.md new file mode 100644 index 0000000..d2a2afc --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# tailscale-sshsync-agent + +Manages distributed workloads and file sharing across Tailscale SSH-connected machines. Automates remote command execution, intelligent load balancing, file synchronization workflows, host health monitoring, and multi-machine orchestration using sshsync. diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..71b00d4 --- /dev/null +++ b/SKILL.md @@ -0,0 +1,1204 @@ +--- +name: tailscale-sshsync-agent +description: Manages distributed workloads and file sharing across Tailscale SSH-connected machines. Automates remote command execution, intelligent load balancing, file synchronization workflows, host health monitoring, and multi-machine orchestration using sshsync. Activates when discussing remote machines, Tailscale SSH, workload distribution, file sharing, or multi-host operations. +--- + +# Tailscale SSH Sync Agent + +## When to Use This Skill + +This skill automatically activates when you need to: + +✅ **Distribute workloads** across multiple machines +- "Run this on my least loaded machine" +- "Execute this task on the machine with most resources" +- "Balance work across my Tailscale network" + +✅ **Share files** between Tailscale-connected hosts +- "Push this directory to all my development machines" +- "Sync code across my homelab servers" +- "Deploy configuration to production group" + +✅ **Execute commands** remotely across host groups +- "Run system updates on all servers" +- "Check disk space across web-servers group" +- "Restart services on database hosts" + +✅ **Monitor machine availability** and health +- "Which machines are online?" +- "Show status of my Tailscale network" +- "Check connectivity to remote hosts" + +✅ **Automate multi-machine workflows** +- "Deploy to staging, test, then production" +- "Backup files from all machines" +- "Synchronize development environment across laptops" + +## How It Works + +This agent provides intelligent workload distribution and file sharing management across Tailscale SSH-connected machines using the `sshsync` CLI tool. + +**Core Architecture**: + +1. **SSH Sync Wrapper**: Python interface to sshsync CLI operations +2. **Tailscale Manager**: Tailscale-specific connectivity and status management +3. **Load Balancer**: Intelligent task distribution based on machine resources +4. **Workflow Executor**: Common multi-machine workflow automation +5. **Validators**: Parameter, host, and connection validation +6. **Helpers**: Temporal context, formatting, and utilities + +**Key Features**: + +- **Automatic host discovery** via Tailscale and SSH config +- **Intelligent load balancing** based on CPU, memory, and current load +- **Group-based operations** (execute on all web servers, databases, etc.) 
+- **Dry-run mode** for preview before execution +- **Parallel execution** across multiple hosts +- **Comprehensive error handling** and retry logic +- **Connection validation** before operations +- **Progress tracking** for long-running operations + +## Data Sources + +### sshsync CLI Tool + +**What is sshsync?** + +sshsync is a Python CLI tool for managing SSH connections and executing operations across multiple hosts. It provides: + +- Group-based host management +- Remote command execution with timeouts +- File push/pull operations (single or recursive) +- Integration with existing SSH config (~/.ssh/config) +- Status checking and connectivity validation + +**Installation**: +```bash +pip install sshsync +``` + +**Configuration**: + +sshsync uses two configuration sources: + +1. **SSH Config** (`~/.ssh/config`): Host connection details +2. **sshsync Config** (`~/.config/sshsync/config.yaml`): Group assignments + +**Example SSH Config**: +``` +Host homelab-1 + HostName 100.64.1.10 + User admin + IdentityFile ~/.ssh/id_ed25519 + +Host prod-web-01 + HostName 100.64.1.20 + User deploy + Port 22 +``` + +**Example sshsync Config**: +```yaml +groups: + homelab: + - homelab-1 + - homelab-2 + production: + - prod-web-01 + - prod-web-02 + - prod-db-01 + development: + - dev-laptop + - dev-desktop +``` + +**sshsync Commands Used**: + +| Command | Purpose | Example | +|---------|---------|---------| +| `sshsync all` | Execute on all hosts | `sshsync all "df -h"` | +| `sshsync group` | Execute on group | `sshsync group web "systemctl status nginx"` | +| `sshsync push` | Push files to hosts | `sshsync push --group prod ./app /var/www/` | +| `sshsync pull` | Pull files from hosts | `sshsync pull --host db /var/log/mysql ./logs/` | +| `sshsync ls` | List hosts | `sshsync ls --with-status` | +| `sshsync sync` | Sync ungrouped hosts | `sshsync sync` | + +### Tailscale Integration + +**What is Tailscale?** + +Tailscale is a zero-config VPN that creates a secure network between your devices. It provides: + +- **Automatic peer-to-peer connections** via WireGuard +- **Magic DNS** for easy host addressing (e.g., `machine-name.tailnet-name.ts.net`) +- **SSH capabilities** built-in to Tailscale CLI +- **ACLs** for access control + +**Tailscale SSH**: + +Tailscale includes SSH functionality that works seamlessly with standard SSH: + +```bash +# Standard SSH via Tailscale +ssh user@machine-name + +# Tailscale-specific SSH command +tailscale ssh machine-name +``` + +**Integration with sshsync**: + +Since Tailscale SSH uses standard SSH protocol, it works perfectly with sshsync. Just configure your SSH config with Tailscale hostnames: + +``` +Host homelab-1 + HostName homelab-1.tailnet.ts.net + User admin +``` + +**Tailscale Commands Used**: + +| Command | Purpose | Example | +|---------|---------|---------| +| `tailscale status` | Show network status | Lists all connected machines | +| `tailscale ping` | Check connectivity | `tailscale ping machine-name` | +| `tailscale ssh` | SSH to machine | `tailscale ssh user@machine` | + +## Workflows + +### 1. Host Health Monitoring + +**User Query**: "Which of my machines are online?" + +**Workflow**: + +1. Load SSH config and sshsync groups +2. Execute `sshsync ls --with-status` +3. Parse connectivity results +4. Query Tailscale status for additional context +5. 
Return formatted health report with: + - Online/offline status per host + - Group memberships + - Tailscale connection state + - Last seen timestamp + +**Implementation**: `scripts/sshsync_wrapper.py` → `get_host_status()` + +**Output Format**: +``` +🟢 homelab-1 (homelab) - Online - Tailscale: Connected +🟢 prod-web-01 (production, web-servers) - Online - Tailscale: Connected +🔴 dev-laptop (development) - Offline - Last seen: 2h ago +🟢 prod-db-01 (production, databases) - Online - Tailscale: Connected + +Summary: 3/4 hosts online (75%) +``` + +### 2. Intelligent Load Balancing + +**User Query**: "Run this task on the least loaded machine" + +**Workflow**: + +1. Get list of candidate hosts (from group or all) +2. For each online host, check: + - CPU load (via `uptime` or `top`) + - Memory usage (via `free` or `vm_stat`) + - Disk space (via `df`) +3. Calculate composite load score +4. Select host with lowest score +5. Execute task on selected host +6. Return result with performance metrics + +**Implementation**: `scripts/load_balancer.py` → `select_optimal_host()` + +**Load Score Calculation**: +``` +score = (cpu_pct * 0.4) + (mem_pct * 0.3) + (disk_pct * 0.3) +``` + +Lower score = better candidate for task execution. + +**Output Format**: +``` +✓ Selected host: prod-web-02 + Reason: Lowest load score (0.32) + - CPU: 15% (vs avg 45%) + - Memory: 30% (vs avg 60%) + - Disk: 40% (vs avg 55%) + +Executing: npm run build +[Task output...] + +✓ Completed in 2m 15s +``` + +### 3. File Synchronization Workflows + +**User Query**: "Sync my code to all development machines" + +**Workflow**: + +1. Validate source path exists locally +2. Identify target group ("development") +3. Check connectivity to all group members +4. Show dry-run preview (files to be synced, sizes) +5. Execute parallel push to all hosts +6. Validate successful transfer on each host +7. Return summary with per-host status + +**Implementation**: `scripts/sshsync_wrapper.py` → `push_to_group()` + +**Supported Operations**: + +- **Push to all**: Sync files to every configured host +- **Push to group**: Sync to specific group (dev, prod, etc.) +- **Pull from host**: Retrieve files from single host +- **Pull from group**: Collect files from multiple hosts +- **Recursive sync**: Entire directory trees with `--recurse` + +**Output Format**: +``` +📤 Syncing: ~/projects/myapp → /var/www/myapp +Group: development (3 hosts) + +Preview (dry-run): + - dev-laptop: 145 files, 12.3 MB + - dev-desktop: 145 files, 12.3 MB + - dev-server: 145 files, 12.3 MB + +Execute? [Proceeding...] + +✓ dev-laptop: Synced 145 files in 8s +✓ dev-desktop: Synced 145 files in 6s +✓ dev-server: Synced 145 files in 10s + +Summary: 3/3 successful (435 files, 36.9 MB total) +``` + +### 4. Remote Command Orchestration + +**User Query**: "Check disk space on all web servers" + +**Workflow**: + +1. Identify target group ("web-servers") +2. Validate group exists and has members +3. Check connectivity to group members +4. Execute command in parallel across group +5. Collect and parse outputs +6. 
Format results with per-host breakdown + +**Implementation**: `scripts/sshsync_wrapper.py` → `execute_on_group()` + +**Features**: + +- **Parallel execution**: Commands run simultaneously on all hosts +- **Timeout handling**: Configurable per-command timeout (default 10s) +- **Error isolation**: Failure on one host doesn't stop others +- **Output aggregation**: Collect and correlate all outputs +- **Dry-run mode**: Preview what would execute without running + +**Output Format**: +``` +🔧 Executing on group 'web-servers': df -h /var/www + +web-01: + Filesystem: /dev/sda1 + Size: 100G, Used: 45G, Available: 50G (45% used) + +web-02: + Filesystem: /dev/sda1 + Size: 100G, Used: 67G, Available: 28G (67% used) ⚠️ + +web-03: + Filesystem: /dev/sda1 + Size: 100G, Used: 52G, Available: 43G (52% used) + +⚠️ Alert: web-02 is above 60% disk usage +``` + +### 5. Multi-Stage Deployment Workflow + +**User Query**: "Deploy to staging, test, then production" + +**Workflow**: + +1. **Stage 1 - Staging Deploy**: + - Push code to staging group + - Run build process + - Execute automated tests + - If tests fail: STOP and report error + +2. **Stage 2 - Validation**: + - Check staging health endpoints + - Validate database migrations + - Run smoke tests + +3. **Stage 3 - Production Deploy**: + - Push to production group (one at a time for zero-downtime) + - Restart services gracefully + - Verify each host before proceeding to next + +4. **Stage 4 - Verification**: + - Check production health + - Monitor for errors + - Rollback if issues detected + +**Implementation**: `scripts/workflow_executor.py` → `deploy_workflow()` + +**Output Format**: +``` +🚀 Multi-Stage Deployment Workflow + +Stage 1: Staging Deployment + ✓ Pushed code to staging-01 + ✓ Build completed (2m 15s) + ✓ Tests passed (145/145) + +Stage 2: Validation + ✓ Health check passed + ✓ Database migration OK + ✓ Smoke tests passed (12/12) + +Stage 3: Production Deployment + ✓ prod-web-01: Deployed & verified + ✓ prod-web-02: Deployed & verified + ✓ prod-web-03: Deployed & verified + +Stage 4: Verification + ✓ All health checks passed + ✓ No errors in logs (5min window) + +✅ Deployment completed successfully in 12m 45s +``` + +## Available Scripts + +### scripts/sshsync_wrapper.py + +**Purpose**: Python wrapper around sshsync CLI for programmatic access + +**Functions**: + +- `get_host_status(group=None)`: Get online/offline status of hosts +- `execute_on_all(command, timeout=10, dry_run=False)`: Run command on all hosts +- `execute_on_group(group, command, timeout=10, dry_run=False)`: Run on specific group +- `execute_on_host(host, command, timeout=10)`: Run on single host +- `push_to_hosts(local_path, remote_path, hosts=None, group=None, recurse=False, dry_run=False)`: Push files +- `pull_from_host(host, remote_path, local_path, recurse=False, dry_run=False)`: Pull files +- `list_hosts(with_status=True)`: List all configured hosts +- `get_groups()`: Get all defined groups and their members +- `add_hosts_to_group(group, hosts)`: Add hosts to a group + +**Usage Example**: +```python +from sshsync_wrapper import execute_on_group, push_to_hosts + +# Execute command +result = execute_on_group( + group="web-servers", + command="systemctl status nginx", + timeout=15 +) + +# Push files +push_to_hosts( + local_path="./dist", + remote_path="/var/www/app", + group="production", + recurse=True +) +``` + +### scripts/tailscale_manager.py + +**Purpose**: Tailscale-specific operations and status management + +**Functions**: + +- `get_tailscale_status()`: Get Tailscale 
network status (all peers) +- `check_connectivity(host)`: Ping host via Tailscale +- `get_peer_info(hostname)`: Get detailed info about peer +- `list_online_machines()`: List all online Tailscale machines +- `get_machine_ip(hostname)`: Get Tailscale IP for machine +- `validate_tailscale_ssh(host)`: Check if Tailscale SSH is working + +**Usage Example**: +```python +from tailscale_manager import get_tailscale_status, check_connectivity + +# Get network status +status = get_tailscale_status() +print(f"Online machines: {status['online_count']}") + +# Check specific host +is_online = check_connectivity("homelab-1") +``` + +### scripts/load_balancer.py + +**Purpose**: Intelligent task distribution based on machine resources + +**Functions**: + +- `get_machine_load(host)`: Get CPU, memory, disk metrics +- `calculate_load_score(metrics)`: Calculate composite load score +- `select_optimal_host(candidates, prefer_group=None)`: Pick best host +- `get_group_capacity()`: Get aggregate capacity of group +- `distribute_tasks(tasks, hosts)`: Distribute multiple tasks optimally + +**Usage Example**: +```python +from load_balancer import select_optimal_host + +# Find best machine for task +best_host = select_optimal_host( + candidates=["web-01", "web-02", "web-03"], + prefer_group="production" +) + +# Execute on selected host +execute_on_host(best_host, "npm run build") +``` + +### scripts/workflow_executor.py + +**Purpose**: Common multi-machine workflow automation + +**Functions**: + +- `deploy_workflow(code_path, staging_group, prod_group)`: Full deployment pipeline +- `backup_workflow(hosts, backup_paths, destination)`: Backup from multiple hosts +- `sync_workflow(source_host, target_group, paths)`: Sync from one to many +- `rolling_restart(group, service_name)`: Zero-downtime service restart +- `health_check_workflow(group, endpoint)`: Check health across group + +**Usage Example**: +```python +from workflow_executor import deploy_workflow, backup_workflow + +# Deploy with testing +deploy_workflow( + code_path="./dist", + staging_group="staging", + prod_group="production" +) + +# Backup from all databases +backup_workflow( + hosts=["db-01", "db-02"], + backup_paths=["/var/lib/mysql"], + destination="./backups" +) +``` + +### scripts/utils/helpers.py + +**Purpose**: Common utilities and formatting functions + +**Functions**: + +- `format_bytes(bytes)`: Human-readable byte formatting (1.2 GB) +- `format_duration(seconds)`: Human-readable duration (2m 15s) +- `parse_ssh_config()`: Parse ~/.ssh/config for host details +- `parse_sshsync_config()`: Parse sshsync group configuration +- `get_timestamp()`: Get ISO timestamp for logging +- `safe_execute(func, *args, **kwargs)`: Execute with error handling +- `validate_path(path)`: Check if path exists and is accessible + +### scripts/utils/validators/parameter_validator.py + +**Purpose**: Validate user inputs and parameters + +**Functions**: + +- `validate_host(host, valid_hosts=None)`: Validate host exists +- `validate_group(group, valid_groups=None)`: Validate group exists +- `validate_path_exists(path)`: Check local path exists +- `validate_timeout(timeout)`: Ensure timeout is reasonable +- `validate_command(command)`: Basic command safety validation + +### scripts/utils/validators/host_validator.py + +**Purpose**: Validate host configuration and availability + +**Functions**: + +- `validate_ssh_config(host)`: Check host has SSH config entry +- `validate_host_reachable(host, timeout=5)`: Check host is reachable +- `validate_group_members(group)`: Ensure 
group has valid members +- `get_invalid_hosts(hosts)`: Find hosts without valid config + +### scripts/utils/validators/connection_validator.py + +**Purpose**: Validate SSH and Tailscale connections + +**Functions**: + +- `validate_ssh_connection(host)`: Test SSH connection works +- `validate_tailscale_connection(host)`: Test Tailscale connectivity +- `validate_ssh_key(host)`: Check SSH key authentication +- `get_connection_diagnostics(host)`: Comprehensive connection testing + +## Available Analyses + +### 1. Host Availability Analysis + +**Function**: `analyze_host_availability(group=None)` + +**Objective**: Determine which machines are online and accessible + +**Inputs**: +- `group` (optional): Specific group to check (None = all hosts) + +**Outputs**: +```python +{ + 'total_hosts': 10, + 'online_hosts': 8, + 'offline_hosts': 2, + 'availability_pct': 80.0, + 'by_group': { + 'production': {'online': 3, 'total': 3, 'pct': 100.0}, + 'development': {'online': 2, 'total': 3, 'pct': 66.7}, + 'homelab': {'online': 3, 'total': 4, 'pct': 75.0} + }, + 'offline_hosts_details': [ + {'host': 'dev-laptop', 'last_seen': '2h ago', 'groups': ['development']}, + {'host': 'homelab-4', 'last_seen': '1d ago', 'groups': ['homelab']} + ] +} +``` + +**Interpretation**: +- **> 90%**: Excellent availability +- **70-90%**: Good availability, monitor offline hosts +- **< 70%**: Poor availability, investigate issues + +### 2. Load Distribution Analysis + +**Function**: `analyze_load_distribution(group=None)` + +**Objective**: Understand resource usage across machines + +**Inputs**: +- `group` (optional): Specific group to analyze + +**Outputs**: +```python +{ + 'hosts': [ + { + 'host': 'web-01', + 'cpu_pct': 45, + 'mem_pct': 60, + 'disk_pct': 40, + 'load_score': 0.49, + 'status': 'moderate' + }, + # ... more hosts + ], + 'aggregate': { + 'avg_cpu': 35, + 'avg_mem': 55, + 'avg_disk': 45, + 'total_capacity': 1200 # GB + }, + 'recommendations': [ + { + 'host': 'web-02', + 'issue': 'High CPU usage (85%)', + 'action': 'Consider migrating workloads' + } + ] +} +``` + +**Load Status**: +- **Low** (score < 0.4): Good capacity for more work +- **Moderate** (0.4-0.7): Normal operation +- **High** (> 0.7): May need to offload work + +### 3. File Sync Status Analysis + +**Function**: `analyze_sync_status(local_path, remote_path, group)` + +**Objective**: Compare local files with remote versions + +**Inputs**: +- `local_path`: Local directory to compare +- `remote_path`: Remote directory path +- `group`: Group to check + +**Outputs**: +```python +{ + 'local_files': 145, + 'local_size': 12582912, # bytes + 'hosts': [ + { + 'host': 'web-01', + 'status': 'in_sync', + 'files_match': 145, + 'files_different': 0, + 'missing_files': 0 + }, + { + 'host': 'web-02', + 'status': 'out_of_sync', + 'files_match': 140, + 'files_different': 3, + 'missing_files': 2, + 'details': ['config.json modified', 'index.html modified', ...] + } + ], + 'sync_percentage': 96.7, + 'recommended_action': 'Push to web-02' +} +``` + +### 4. 
Network Latency Analysis + +**Function**: `analyze_network_latency(hosts=None)` + +**Objective**: Measure connection latency to hosts + +**Inputs**: +- `hosts` (optional): Specific hosts to test (None = all) + +**Outputs**: +```python +{ + 'hosts': [ + {'host': 'web-01', 'latency_ms': 15, 'status': 'excellent'}, + {'host': 'web-02', 'latency_ms': 45, 'status': 'good'}, + {'host': 'db-01', 'latency_ms': 150, 'status': 'fair'} + ], + 'avg_latency': 70, + 'min_latency': 15, + 'max_latency': 150, + 'recommendations': [ + {'host': 'db-01', 'issue': 'High latency', 'action': 'Check network path'} + ] +} +``` + +**Latency Classification**: +- **Excellent** (< 50ms): Ideal for interactive tasks +- **Good** (50-100ms): Suitable for most operations +- **Fair** (100-200ms): May impact interactive workflows +- **Poor** (> 200ms): Investigate network issues + +### 5. Comprehensive Infrastructure Report + +**Function**: `comprehensive_infrastructure_report(group=None)` + +**Objective**: One-stop function for complete infrastructure overview + +**Inputs**: +- `group` (optional): Limit to specific group (None = all) + +**Outputs**: +```python +{ + 'report_timestamp': '2025-10-19T19:43:41Z', + 'group': 'production', # or 'all' + 'metrics': { + 'availability': {...}, # from analyze_host_availability + 'load_distribution': {...}, # from analyze_load_distribution + 'network_latency': {...}, # from analyze_network_latency + 'tailscale_status': {...} # from Tailscale integration + }, + 'summary': "Production infrastructure: 3/3 hosts online, avg load 45%, network latency 35ms", + 'alerts': [ + "⚠ web-02: High CPU usage (85%)", + "⚠ db-01: Elevated latency (150ms)" + ], + 'recommendations': [ + "Consider rebalancing workload from web-02", + "Investigate network path to db-01" + ], + 'overall_health': 'good' # excellent | good | fair | poor +} +``` + +**Overall Health Classification**: +- **Excellent**: All metrics green, no alerts +- **Good**: Most metrics healthy, minor alerts +- **Fair**: Some concerning metrics, action recommended +- **Poor**: Critical issues, immediate action required + +## Error Handling + +### Connection Errors + +**Error**: Cannot connect to host + +**Causes**: +- Host is offline +- Tailscale not connected +- SSH key missing/invalid +- Firewall blocking connection + +**Handling**: +```python +try: + execute_on_host("web-01", "ls") +except ConnectionError as e: + # Try Tailscale ping first + if not check_connectivity("web-01"): + return { + 'error': 'Host unreachable', + 'suggestion': 'Check Tailscale connection', + 'diagnostics': get_connection_diagnostics("web-01") + } + # Then check SSH + if not validate_ssh_connection("web-01"): + return { + 'error': 'SSH authentication failed', + 'suggestion': 'Check SSH keys: ssh-add -l' + } +``` + +### Timeout Errors + +**Error**: Operation timed out + +**Causes**: +- Command taking too long +- Network latency +- Host overloaded + +**Handling**: +- Automatic retry with exponential backoff (3 attempts) +- Increase timeout for known slow operations +- Fall back to alternative host if available + +### File Transfer Errors + +**Error**: File sync failed + +**Causes**: +- Insufficient disk space +- Permission denied +- Path doesn't exist + +**Handling**: +- Pre-check disk space on target +- Validate permissions before transfer +- Create directories if needed +- Partial transfer recovery + +### Validation Errors + +**Error**: Invalid parameter + +**Examples**: +- Unknown host +- Non-existent group +- Invalid path + +**Handling**: +- Validate all inputs 
before execution +- Provide suggestions for similar valid options +- Clear error messages with corrective actions + +## Mandatory Validations + +### Before Any Operation + +1. **Parameter Validation**: + ```python + host = validate_host(host, valid_hosts=get_all_hosts()) + group = validate_group(group, valid_groups=get_groups()) + timeout = validate_timeout(timeout) + ``` + +2. **Connection Validation**: + ```python + if not validate_host_reachable(host, timeout=5): + raise ConnectionError(f"Host {host} is not reachable") + ``` + +3. **Path Validation** (for file operations): + ```python + if not validate_path_exists(local_path): + raise ValueError(f"Path does not exist: {local_path}") + ``` + +### During Operation + +1. **Timeout Monitoring**: Every operation has configurable timeout +2. **Progress Tracking**: Long operations show progress +3. **Error Isolation**: Failure on one host doesn't stop others + +### After Operation + +1. **Result Validation**: + ```python + report = validate_operation_result(result) + if report.has_critical_issues(): + raise OperationError(report.get_summary()) + ``` + +2. **State Verification**: Confirm operation succeeded +3. **Logging**: Record all operations for audit trail + +## Performance and Caching + +### Caching Strategy + +**Host Status Cache**: +- **TTL**: 60 seconds +- **Why**: Host status doesn't change rapidly +- **Invalidation**: Manual invalidate when connectivity changes + +**Load Metrics Cache**: +- **TTL**: 30 seconds +- **Why**: Load changes frequently +- **Invalidation**: Automatic on timeout + +**Group Configuration Cache**: +- **TTL**: 5 minutes +- **Why**: Group membership rarely changes +- **Invalidation**: Manual when groups modified + +### Performance Optimizations + +1. **Parallel Execution**: + - Commands execute concurrently across hosts + - ThreadPoolExecutor with max 10 workers + - Prevents sequential bottleneck + +2. **Connection Pooling**: + - Reuse SSH connections when possible + - ControlMaster in SSH config + +3. **Lazy Loading**: + - Only fetch data when needed + - Don't load all host status unless required + +4. **Progressive Results**: + - Stream results as they complete + - Don't wait for slowest host + +## Automatic Detection Keywords + +This skill automatically activates when you mention: + +**Hosts & Machines**: +- remote machine, remote host, remote server +- Tailscale machine, Tailscale host +- SSH host, SSH server +- homelab, servers, cluster + +**Tools**: +- sshsync, ssh sync +- Tailscale, tailnet +- SSH, secure shell + +**Operations**: +- distribute workload, load balance +- file sharing, file sync, file transfer +- push files, pull files, sync files +- remote execution, execute remotely +- deploy to, deploy across +- backup from, sync to + +**Groups**: +- host group, machine group, server group +- production servers, development machines +- web servers, database servers +- staging environment, production environment + +**Actions**: +- check status, check connectivity +- run on all, execute on group +- sync across, distribute to +- deploy to, push to, pull from + +**Monitoring**: +- which machines are online +- host availability, server status +- machine health, connectivity check +- network status, latency check + +**Workflows**: +- multi-stage deployment +- rolling restart, zero-downtime +- backup workflow, sync workflow +- distributed task, parallel execution + +## Usage Examples + +### Example 1: Check Host Status + +**User**: "Which of my Tailscale machines are online?" 
+ +**Agent Response**: +``` +Checking Tailscale network status... + +🟢 homelab-1 (homelab) - Online - Tailscale: Connected + IP: 100.64.1.10 | Latency: 15ms + +🟢 homelab-2 (homelab) - Online - Tailscale: Connected + IP: 100.64.1.11 | Latency: 18ms + +🔴 dev-laptop (development) - Offline + Last seen: 2 hours ago | Tailscale: Not connected + +🟢 prod-web-01 (production, web-servers) - Online - Tailscale: Connected + IP: 100.64.1.20 | Latency: 25ms + +🟢 prod-db-01 (production, databases) - Online - Tailscale: Connected + IP: 100.64.1.30 | Latency: 30ms + +Summary: 4/5 hosts online (80%) +Average latency: 22ms +``` + +### Example 2: Execute on Group + +**User**: "Check disk space on all web servers" + +**Agent Response**: +``` +Executing on group 'web-servers': df -h /var/www + +🔧 web-01: + Filesystem: /dev/sda1 + Size: 100G | Used: 45G | Available: 50G (45% used) + Status: ✓ Healthy + +🔧 web-02: + Filesystem: /dev/sda1 + Size: 100G | Used: 67G | Available: 28G (67% used) + Status: ⚠ Warning - Above 60% + +🔧 web-03: + Filesystem: /dev/sda1 + Size: 100G | Used: 52G | Available: 43G (52% used) + Status: ✓ Healthy + +⚠️ Alert: web-02 is above 60% disk usage +Recommendation: Clear old logs or increase disk space +``` + +### Example 3: Load-Balanced Execution + +**User**: "Run this npm build on the machine with most available resources" + +**Agent Response**: +``` +Analyzing resource availability across 5 machines... + +Load Analysis: + web-01: CPU 45%, Mem 60%, Disk 40% → Score: 0.49 (moderate) + web-02: CPU 85%, Mem 70%, Disk 65% → Score: 0.75 (high load) + web-03: CPU 20%, Mem 35%, Disk 30% → Score: 0.28 (low load) ✓ + +✓ Selected: web-03 + Reason: Lowest load score (0.28) + Available: CPU 80%, Memory 65%, Disk 70% + +Executing on web-03: npm run build + +[Build output...] + ✓ Build completed successfully + Duration: 2m 15s + Output: ./dist (145 files, 12.3 MB) + +Final load: CPU 65%, Mem 50% (still below average) +``` + +### Example 4: File Sync + +**User**: "Push my app directory to all production servers" + +**Agent Response**: +``` +📤 Syncing: ~/projects/myapp → /var/www/myapp +Target group: production (3 hosts) + +Analyzing source... + 145 files, 12.3 MB total + +Preview (dry-run): + prod-web-01: 145 files → /var/www/myapp + prod-web-02: 145 files → /var/www/myapp + prod-db-01: Skipped (not in web-servers group) + +Checking target capacity... + ✓ prod-web-01: 50 GB available + ✓ prod-web-02: 28 GB available + +Executing push... + +✓ prod-web-01: Synced 145 files in 8.2s + Modified: 12 files + Added: 3 files + +✓ prod-web-02: Synced 145 files in 6.5s + Modified: 12 files + Added: 3 files + +Summary: 2/2 successful (290 files, 24.6 MB total) +``` + +### Example 5: Multi-Stage Deployment + +**User**: "Deploy this to staging, run tests, then push to production" + +**Agent Response**: +``` +🚀 Multi-Stage Deployment Workflow + +Stage 1: Staging Deployment + 📤 Pushing to staging-01... + ✓ Code synced (145 files, 12.3 MB) - 8s + + 🔨 Running build... + $ npm run build + ✓ Build completed - 2m 15s + + 🧪 Running tests... + $ npm test + ✓ All tests passed (145/145) - 45s + +Stage 2: Staging Validation + ✓ Health endpoint: 200 OK + ✓ Database migrations: Up to date + ✓ Smoke tests: 12/12 passed + +Stage 3: Production Deployment + Rolling deployment to 3 hosts... 
+ + ⏳ prod-web-01: + ✓ Code synced + ✓ Build completed + ✓ Service restarted gracefully + ✓ Health check passed + Duration: 3m 10s + + ⏳ prod-web-02: + ✓ Code synced + ✓ Build completed + ✓ Service restarted gracefully + ✓ Health check passed + Duration: 3m 5s + + ⏳ prod-web-03: + ✓ Code synced + ✓ Build completed + ✓ Service restarted gracefully + ✓ Health check passed + Duration: 3m 12s + +Stage 4: Production Verification + ✓ All health checks passed (3/3) + ✓ No errors in logs (5min window) + ✓ Response times normal (avg 45ms) + +✅ Deployment completed successfully! + Total duration: 12m 45s + Hosts updated: 3 + Zero downtime: ✓ +``` + +### Example 6: Comprehensive Infrastructure Report + +**User**: "Give me a complete status report of my infrastructure" + +**Agent Response**: +``` +📊 Infrastructure Report +Generated: 2025-10-19 19:43:41 + +=== AVAILABILITY === +Total Hosts: 10 +Online: 8 (80%) +Offline: 2 + +By Group: + production: 3/3 (100%) ✓ + development: 2/3 (66.7%) ⚠ + homelab: 3/4 (75%) + +Offline Hosts: + • dev-laptop (development) - Last seen: 2h ago + • homelab-4 (homelab) - Last seen: 1d ago + +=== LOAD DISTRIBUTION === +Average Load: 45% CPU, 55% Memory, 45% Disk + +Top 3 Loaded Hosts: + 1. web-02: 85% CPU, 70% Mem, 65% Disk (Score: 0.75) ⚠ + 2. db-01: 60% CPU, 75% Mem, 55% Disk (Score: 0.65) + 3. web-01: 45% CPU, 60% Mem, 40% Disk (Score: 0.49) + +Top 3 Available Hosts: + 1. web-03: 20% CPU, 35% Mem, 30% Disk (Score: 0.28) ✓ + 2. homelab-1: 25% CPU, 40% Mem, 35% Disk (Score: 0.33) + 3. homelab-2: 30% CPU, 45% Mem, 40% Disk (Score: 0.38) + +=== NETWORK LATENCY === +Average: 35ms +Range: 15ms - 150ms + +Excellent (< 50ms): 6 hosts +Good (50-100ms): 1 host +Fair (100-200ms): 1 host (db-01: 150ms) ⚠ + +=== TAILSCALE STATUS === +Network: Connected +Peers Online: 8/10 +Exit Node: None +MagicDNS: Enabled + +=== ALERTS === +⚠ web-02: High CPU usage (85%) - Consider load balancing +⚠ db-01: Elevated latency (150ms) - Check network path +⚠ dev-laptop: Offline for 2 hours - May need attention + +=== RECOMMENDATIONS === +1. Rebalance workload from web-02 to web-03 +2. Investigate network latency to db-01 +3. Check status of dev-laptop and homelab-4 +4. Consider scheduling maintenance for web-02 + +Overall Health: GOOD ✓ +``` + +## Installation + +See INSTALLATION.md for detailed setup instructions. + +Quick start: +```bash +# 1. Install sshsync +pip install sshsync + +# 2. Configure SSH hosts +vim ~/.ssh/config + +# 3. Sync host groups +sshsync sync + +# 4. Install agent +/plugin marketplace add ./tailscale-sshsync-agent + +# 5. Test +"Which of my machines are online?" +``` + +## Version + +Current version: 1.0.0 + +See CHANGELOG.md for release history. + +## Architecture Decisions + +See DECISIONS.md for detailed rationale behind tool selection, architecture choices, and trade-offs considered. 
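+
+## Programmatic Usage
+
+The documented scripts can also be driven directly from Python when composing custom workflows. Below is a minimal sketch, assuming the plugin's `scripts/` directory has been added to `sys.path` (the install path shown is hypothetical; point it at your actual checkout):
+
+```python
+import sys
+from pathlib import Path
+
+# Hypothetical install location; adjust to wherever the plugin lives.
+sys.path.insert(0, str(Path.home() / "tailscale-sshsync-agent" / "scripts"))
+
+from sshsync_wrapper import get_host_status, push_to_hosts
+from load_balancer import select_optimal_host, format_load_report
+
+# 1. Check availability before doing anything destructive.
+status = get_host_status(group="production")
+if not status.get("error"):
+    print(f"{status['online_count']}/{status['total_count']} production hosts online")
+
+# 2. Pick the least-loaded host for a one-off job.
+host, metrics = select_optimal_host(["web-01", "web-02", "web-03"])
+if host:
+    print(format_load_report(metrics))
+
+# 3. Preview a file push with dry_run=True before running it for real.
+preview = push_to_hosts("./dist", "/var/www/app", group="production",
+                        recurse=True, dry_run=True)
+print(preview["message"])
+```
+
+Each call returns a plain dict (or dataclass) and reports failures via an `error` key where possible, so the sketch degrades gracefully when hosts are unreachable.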
diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..3eefcb9 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +1.0.0 diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..6ab8644 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,117 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:Human-Frontier-Labs-Inc/human-frontier-labs-marketplace:plugins/tailscale-sshsync-agent", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "3a7cbe9632f245c6b9a4c4bf2731da65c857a7f4", + "treeHash": "832bc62ce02c782663e60a2eb97932166fef39c681a9ca01b9d5dc170860b805", + "generatedAt": "2025-11-28T10:11:41.356928Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "tailscale-sshsync-agent", + "description": "Manages distributed workloads and file sharing across Tailscale SSH-connected machines. Automates remote command execution, intelligent load balancing, file synchronization workflows, host health monitoring, and multi-machine orchestration using sshsync.", + "version": null + }, + "content": { + "files": [ + { + "path": "CHANGELOG.md", + "sha256": "74dbda933868b7cab410144a831b43e4f1ae6161f2402edcb068a8232c50bfe4" + }, + { + "path": "README.md", + "sha256": "470f165d8ac61a8942e6fb3568c49febb7f803bfa0f4010d14e09f807c34c88e" + }, + { + "path": "VERSION", + "sha256": "59854984853104df5c353e2f681a15fc7924742f9a2e468c29af248dce45ce03" + }, + { + "path": "SKILL.md", + "sha256": "31c8f237f9b3617c32c6ff381ae83d427b50eb0877d3763d9826e00ece6618f1" + }, + { + "path": "INSTALLATION.md", + "sha256": "9313ea1bbb0a03e4c078c41b207f3febe800cd38eb57b7205c7b5188238ca46a" + }, + { + "path": "DECISIONS.md", + "sha256": "59549e84aaa8e32d4bdf64d46855714f5cde7f061906e1c74976658883472c82" + }, + { + "path": "references/tailscale-integration.md", + "sha256": "6553b3ceeaca5118a7b005368223ea4b3ab70eb2492ccaf5c2b7f7758b65dd42" + }, + { + "path": "references/sshsync-guide.md", + "sha256": "697ce0b56eda258732a0b924f821e9e24eb6b977934153bdd2045be961e58de2" + }, + { + "path": "tests/test_validation.py", + "sha256": "716ae0d2e86f0e6657903aef6bb714fbd3b5b72d3b109fab4da3f75f90cc2c0a" + }, + { + "path": "tests/test_helpers.py", + "sha256": "3be88e30825414eb3ade048b766c84995dc98a01cb7236ce75201716179279a8" + }, + { + "path": "tests/test_integration.py", + "sha256": "12f7cb857fda23531a9c74caf072cf73b739672b1e99c55f42a2ef8e11238523" + }, + { + "path": "scripts/load_balancer.py", + "sha256": "9d87476562ac848a026e42116e381f733d520e9330da33de3d905585af14398d" + }, + { + "path": "scripts/tailscale_manager.py", + "sha256": "4b75ebb9423d221b9788eb9352b274e0256c101185de11064a7b4cb00684016e" + }, + { + "path": "scripts/workflow_executor.py", + "sha256": "9f23f3bb421e940766e65949e6efa485a313115e297d4c5f1088589155a7bac1" + }, + { + "path": "scripts/sshsync_wrapper.py", + "sha256": "fc2062ebbc72e3ddc6c6bfb5f22019b23050f5c2ed9ac35c315018a96871fb19" + }, + { + "path": "scripts/utils/helpers.py", + "sha256": "b01979ee56ab92037b8f8054a883124d600b8337cf461855092b866091aed24a" + }, + { + "path": "scripts/utils/validators/connection_validator.py", + "sha256": "9ac82108e69690b74d9aa89ca51f7d06fe860e880aaa1983d08242d7199d1601" + }, + { + "path": "scripts/utils/validators/parameter_validator.py", + "sha256": 
"157dfcb7f1937df88344647a37a124d52e1de1b992b72c9b9e69d3b717ca0195" + }, + { + "path": "scripts/utils/validators/__init__.py", + "sha256": "2d109ad1b5d253578a095c8354159fdf9318154b4f62d9b16eaa1a88a422382d" + }, + { + "path": "scripts/utils/validators/host_validator.py", + "sha256": "79cab42587435a799349ba8a562c4ec0f3d54f3f2790562c894c6289beade6d6" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "0ec7466bbf2e8dc2fe1607feff0cc0ef0ebebf44ff54f17dcce96255e2c21215" + } + ], + "dirSha256": "832bc62ce02c782663e60a2eb97932166fef39c681a9ca01b9d5dc170860b805" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/references/sshsync-guide.md b/references/sshsync-guide.md new file mode 100644 index 0000000..55fb541 --- /dev/null +++ b/references/sshsync-guide.md @@ -0,0 +1,466 @@ +# sshsync CLI Tool Guide + +Complete reference for using sshsync with Tailscale SSH Sync Agent. + +## Table of Contents + +1. [Installation](#installation) +2. [Configuration](#configuration) +3. [Core Commands](#core-commands) +4. [Advanced Usage](#advanced-usage) +5. [Troubleshooting](#troubleshooting) + +## Installation + +### Via pip + +```bash +pip install sshsync +``` + +### Verify Installation + +```bash +sshsync --version +``` + +## Configuration + +### 1. SSH Config Setup + +sshsync uses your existing SSH configuration. Edit `~/.ssh/config`: + +``` +# Example host entries +Host homelab-1 + HostName 100.64.1.10 + User admin + IdentityFile ~/.ssh/id_ed25519 + Port 22 + +Host prod-web-01 + HostName 100.64.1.20 + User deploy + IdentityFile ~/.ssh/id_rsa + Port 22 + +Host dev-laptop + HostName 100.64.1.30 + User developer +``` + +**Important Notes**: +- sshsync uses the **Host alias** (e.g., "homelab-1"), not the actual hostname +- Ensure SSH key authentication is configured +- Test each host with `ssh host-alias` before using with sshsync + +### 2. Initialize sshsync Configuration + +First run: + +```bash +sshsync sync +``` + +This will: +1. Read all hosts from your SSH config +2. Prompt you to assign hosts to groups +3. Create `~/.config/sshsync/config.yaml` + +### 3. 
sshsync Config File + +Location: `~/.config/sshsync/config.yaml` + +Structure: +```yaml +groups: + production: + - prod-web-01 + - prod-web-02 + - prod-db-01 + development: + - dev-laptop + - dev-desktop + homelab: + - homelab-1 + - homelab-2 +``` + +**Manual Editing**: +- Groups are arbitrary labels (use what makes sense for you) +- Hosts can belong to multiple groups +- Use consistent host aliases from SSH config + +## Core Commands + +### List Hosts + +```bash +# List all configured hosts +sshsync ls + +# List with online/offline status +sshsync ls --with-status +``` + +**Output Example**: +``` +Host Status +homelab-1 online +homelab-2 offline +prod-web-01 online +dev-laptop online +``` + +### Execute Commands + +#### On All Hosts + +```bash +# Execute on all configured hosts +sshsync all "df -h" + +# With custom timeout (default: 10s) +sshsync all --timeout 20 "systemctl status nginx" + +# Dry-run (preview without executing) +sshsync all --dry-run "reboot" +``` + +#### On Specific Group + +```bash +# Execute on group +sshsync group production "uptime" + +# With timeout +sshsync group web-servers --timeout 30 "npm run build" + +# Filter with regex +sshsync group production --regex "web-.*" "df -h" +``` + +**Regex Filtering**: +- Filters group members by alias matching pattern +- Uses Python regex syntax +- Example: `--regex "web-0[1-3]"` matches web-01, web-02, web-03 + +### File Transfer + +#### Push Files + +```bash +# Push to specific host +sshsync push --host web-01 ./app /var/www/app + +# Push to group +sshsync push --group production ./dist /var/www/app + +# Push to all hosts +sshsync push --all ./config.yml /etc/app/config.yml + +# Recursive push (directory with contents) +sshsync push --group web --recurse ./app /var/www/app + +# Dry-run +sshsync push --group production --dry-run ./dist /var/www/app +``` + +**Important**: +- Local path comes first, remote path second +- Use `--recurse` for directories +- Dry-run shows what would be transferred without executing + +#### Pull Files + +```bash +# Pull from specific host +sshsync pull --host db-01 /var/log/mysql/error.log ./logs/ + +# Pull from group (creates separate directories per host) +sshsync pull --group databases /var/backups ./backups/ + +# Recursive pull +sshsync pull --host web-01 --recurse /var/www/app ./backup/ +``` + +**Pull Behavior**: +- When pulling from groups, creates subdirectory per host +- Use `--recurse` to pull entire directory trees +- Destination directory created if doesn't exist + +### Group Management + +#### Add Hosts to Group + +```bash +# Interactive: prompts to select hosts +sshsync gadd production + +# Follow prompts to select which hosts to add +``` + +#### Add Host to SSH Config + +```bash +# Interactive host addition +sshsync hadd + +# Follow prompts for: +# - Host alias +# - Hostname/IP +# - Username +# - Port (optional) +# - Identity file (optional) +``` + +#### Sync Ungrouped Hosts + +```bash +# Assign groups to hosts not yet in any group +sshsync sync +``` + +## Advanced Usage + +### Parallel Execution + +sshsync automatically executes commands in parallel across hosts: + +```bash +# This runs simultaneously on all hosts in group +sshsync group web-servers "npm run build" +``` + +**Performance**: +- Commands execute concurrently +- Results collected as they complete +- Timeout applies per-host independently + +### Timeout Strategies + +Different operations need different timeouts: + +```bash +# Quick checks (5-10s) +sshsync all --timeout 5 "hostname" + +# Moderate operations (30-60s) +sshsync 
group web --timeout 60 "npm install"
+
+# Long-running tasks (300s+)
+sshsync group build --timeout 300 "docker build ."
+```
+
+**Timeout Best Practices**:
+- Set timeout 20-30% longer than expected duration
+- Use dry-run first to estimate timing
+- Increase timeout for network-intensive operations
+
+### Combining with Other Tools
+
+#### With xargs
+
+```bash
+# Get list of online hosts
+sshsync ls --with-status | grep online | awk '{print $1}' | xargs -I {} echo "Host {} is online"
+```
+
+#### With jq (if using JSON output)
+
+```bash
+# Parse structured output (if sshsync supports --json flag)
+sshsync ls --json | jq '.hosts[] | select(.status=="online") | .name'
+```
+
+#### In Shell Scripts
+
+```bash
+#!/bin/bash
+
+# Deploy script using sshsync
+echo "Deploying to staging..."
+sshsync push --group staging --recurse ./dist /var/www/app
+
+if [ $? -eq 0 ]; then
+    echo "Staging deployment successful"
+
+    echo "Running tests..."
+    sshsync group staging "cd /var/www/app && npm test"
+
+    if [ $? -eq 0 ]; then
+        echo "Tests passed, deploying to production..."
+        sshsync push --group production --recurse ./dist /var/www/app
+    fi
+fi
+```
+
+## Troubleshooting
+
+### Common Issues
+
+#### 1. "Permission denied (publickey)"
+
+**Cause**: SSH key not configured or not added to ssh-agent
+
+**Solution**:
+```bash
+# Add SSH key to agent
+ssh-add ~/.ssh/id_ed25519
+
+# Verify it's added
+ssh-add -l
+
+# Copy public key to remote
+ssh-copy-id user@host
+```
+
+#### 2. "Connection timed out"
+
+**Cause**: Host is offline or network issue
+
+**Solution**:
+```bash
+# Test connectivity
+ping hostname
+
+# Test Tailscale specifically
+tailscale ping hostname
+
+# Check Tailscale status
+tailscale status
+```
+
+#### 3. "Host not found in SSH config"
+
+**Cause**: Host alias not in `~/.ssh/config`
+
+**Solution**:
+```bash
+# Add host to SSH config
+sshsync hadd
+
+# Or manually edit ~/.ssh/config
+vim ~/.ssh/config
+```
+
+#### 4. "Group not found"
+
+**Cause**: Group doesn't exist in sshsync config
+
+**Solution**:
+```bash
+# Add hosts to new group
+sshsync gadd mygroup
+
+# Or manually edit config
+vim ~/.config/sshsync/config.yaml
+```
+
+#### 5. File Transfer Fails
+
+**Cause**: Insufficient permissions, disk space, or path doesn't exist
+
+**Solution**:
+```bash
+# Check remote disk space
+sshsync group production "df -h"
+
+# Check remote path exists
+sshsync group production "ls -ld /target/path"
+
+# Check permissions
+sshsync group production "ls -la /target/path"
+```
+
+### Debug Mode
+
+sshsync doesn't have a built-in verbose mode, but you can debug the underlying SSH connection for a single host directly:
+
+```bash
+# Increase SSH verbosity against one host from your SSH config
+ssh -v host-alias "uptime"
+
+# Or use dry-run to see what sshsync would execute
+sshsync all --dry-run "command"
+```
+
+### Performance Issues
+
+If operations are slow:
+
+1. **Reduce parallelism** (run on fewer hosts at once)
+2. **Increase timeout** for network-bound operations
+3. **Check network latency**:
+   ```bash
+   sshsync all --timeout 5 "hostname"
+   ```
+
+### Configuration Validation
+
+```bash
+# Verify SSH config is readable
+cat ~/.ssh/config
+
+# Verify sshsync config
+cat ~/.config/sshsync/config.yaml
+
+# Test hosts individually
+for host in $(sshsync ls | awk '{print $1}'); do
+    echo "Testing $host..."
+    ssh $host "echo OK" || echo "FAILED: $host"
+done
+```
+
+## Best Practices
+
+1. **Use meaningful host aliases** in SSH config
+2. **Organize groups logically** (by function, environment, location)
+3. **Always dry-run first** for destructive operations
+4. 
**Set appropriate timeouts** based on operation type +5. **Test SSH keys** before using sshsync +6. **Keep groups updated** as infrastructure changes +7. **Use --with-status** to check availability before operations + +## Integration with Tailscale + +sshsync works seamlessly with Tailscale SSH: + +```bash +# SSH config using Tailscale hostname +Host homelab-1 + HostName homelab-1.tailnet.ts.net + User admin + +# Or using Tailscale IP directly +Host homelab-1 + HostName 100.64.1.10 + User admin +``` + +**Tailscale Advantages**: +- No need for port forwarding +- Encrypted connections +- MagicDNS for easy hostnames +- Works across NATs + +**Verify Tailscale**: +```bash +# Check Tailscale network +tailscale status + +# Ping host via Tailscale +tailscale ping homelab-1 +``` + +## Summary + +sshsync simplifies multi-host SSH operations: +- ✅ Execute commands across host groups +- ✅ Transfer files to/from multiple hosts +- ✅ Organize hosts into logical groups +- ✅ Parallel execution for speed +- ✅ Dry-run mode for safety +- ✅ Works great with Tailscale + +For more help: `sshsync --help` diff --git a/references/tailscale-integration.md b/references/tailscale-integration.md new file mode 100644 index 0000000..b4301e5 --- /dev/null +++ b/references/tailscale-integration.md @@ -0,0 +1,468 @@ +# Tailscale Integration Guide + +How to use Tailscale SSH with sshsync for secure, zero-config remote access. + +## What is Tailscale? + +Tailscale is a zero-config VPN that creates a secure network between your devices using WireGuard. It provides: + +- **Peer-to-peer encrypted connections** +- **No port forwarding required** +- **Works across NATs and firewalls** +- **MagicDNS for easy device addressing** +- **Built-in SSH functionality** +- **Access control lists (ACLs)** + +## Why Tailscale + sshsync? + +Combining Tailscale with sshsync gives you: + +1. **Secure connections** everywhere (Tailscale encryption) +2. **Simple addressing** (MagicDNS hostnames) +3. **Multi-host operations** (sshsync groups and execution) +4. **No firewall configuration** needed +5. **Works from anywhere** (coffee shop, home, office) + +## Setup + +### 1. Install Tailscale + +**macOS**: +```bash +brew install tailscale +``` + +**Linux**: +```bash +curl -fsSL https://tailscale.com/install.sh | sh +``` + +**Verify Installation**: +```bash +tailscale version +``` + +### 2. Connect to Tailscale + +```bash +# Start Tailscale +sudo tailscale up + +# Follow the authentication link +# This opens browser to authenticate + +# Verify connection +tailscale status +``` + +### 3. 
Configure SSH via Tailscale + +Tailscale provides two SSH options: + +#### Option A: Tailscale SSH (Built-in) + +**Enable on each machine**: +```bash +sudo tailscale up --ssh +``` + +**Use**: +```bash +tailscale ssh user@machine-name +``` + +**Advantages**: +- No SSH server configuration needed +- Uses Tailscale authentication +- Automatic key management + +#### Option B: Standard SSH over Tailscale (Recommended for sshsync) + +**Configure SSH config** to use Tailscale hostnames: + +```bash +# ~/.ssh/config + +Host homelab-1 + HostName homelab-1.tailnet-name.ts.net + User admin + IdentityFile ~/.ssh/id_ed25519 + +# Or use Tailscale IP directly +Host homelab-2 + HostName 100.64.1.10 + User admin + IdentityFile ~/.ssh/id_ed25519 +``` + +**Advantages**: +- Works with all SSH tools (including sshsync) +- Standard SSH key authentication +- More flexibility + +## Getting Tailscale Hostnames and IPs + +### View All Machines + +```bash +tailscale status +``` + +**Output**: +``` +100.64.1.10 homelab-1 user@ linux - +100.64.1.11 homelab-2 user@ linux - +100.64.1.20 laptop user@ macOS - +100.64.1.30 phone user@ iOS offline +``` + +### Get MagicDNS Hostname + +**Format**: `machine-name.tailnet-name.ts.net` + +**Find your tailnet name**: +```bash +tailscale status --json | grep -i tailnet +``` + +Or check in Tailscale admin console: https://login.tailscale.com/admin/machines + +### Get Tailscale IP + +```bash +# Your own IP +tailscale ip -4 + +# Another machine's IP (from status output) +tailscale status | grep machine-name +``` + +## Testing Connectivity + +### Ping via Tailscale + +```bash +# Ping by hostname +tailscale ping homelab-1 + +# Ping by IP +tailscale ping 100.64.1.10 +``` + +**Successful output**: +``` +pong from homelab-1 (100.64.1.10) via DERP(nyc) in 45ms +pong from homelab-1 (100.64.1.10) via DERP(nyc) in 43ms +``` + +**Failed output**: +``` +timeout waiting for pong +``` + +### SSH Test + +```bash +# Test SSH connection +ssh user@homelab-1.tailnet.ts.net + +# Or with IP +ssh user@100.64.1.10 +``` + +## Configuring sshsync with Tailscale + +### Step 1: Add Tailscale Hosts to SSH Config + +```bash +vim ~/.ssh/config +``` + +**Example configuration**: +``` +# Production servers +Host prod-web-01 + HostName prod-web-01.tailnet.ts.net + User deploy + IdentityFile ~/.ssh/id_ed25519 + +Host prod-web-02 + HostName prod-web-02.tailnet.ts.net + User deploy + IdentityFile ~/.ssh/id_ed25519 + +Host prod-db-01 + HostName prod-db-01.tailnet.ts.net + User deploy + IdentityFile ~/.ssh/id_ed25519 + +# Homelab +Host homelab-1 + HostName 100.64.1.10 + User admin + IdentityFile ~/.ssh/id_ed25519 + +Host homelab-2 + HostName 100.64.1.11 + User admin + IdentityFile ~/.ssh/id_ed25519 + +# Development +Host dev-laptop + HostName dev-laptop.tailnet.ts.net + User developer + IdentityFile ~/.ssh/id_ed25519 +``` + +### Step 2: Test Each Host + +```bash +# Test connectivity to each host +ssh prod-web-01 "hostname" +ssh homelab-1 "hostname" +ssh dev-laptop "hostname" +``` + +### Step 3: Initialize sshsync + +```bash +# Sync hosts and create groups +sshsync sync + +# Add hosts to groups +sshsync gadd production +# Select: prod-web-01, prod-web-02, prod-db-01 + +sshsync gadd homelab +# Select: homelab-1, homelab-2 + +sshsync gadd development +# Select: dev-laptop +``` + +### Step 4: Verify Configuration + +```bash +# List all hosts with status +sshsync ls --with-status + +# Test command execution +sshsync all "uptime" + +# Test group execution +sshsync group production "df -h" +``` + +## Advanced Tailscale Features 
+ +### Tailnet Lock + +Prevents unauthorized device additions: + +```bash +tailscale lock status +``` + +### Exit Nodes + +Route all traffic through a specific machine: + +```bash +# Enable exit node on a machine +sudo tailscale up --advertise-exit-node + +# Use exit node from another machine +sudo tailscale set --exit-node=exit-node-name +``` + +### Subnet Routing + +Access networks behind Tailscale machines: + +```bash +# Advertise subnet routes +sudo tailscale up --advertise-routes=192.168.1.0/24 +``` + +### ACLs (Access Control Lists) + +Control who can access what: https://login.tailscale.com/admin/acls + +**Example ACL**: +```json +{ + "acls": [ + { + "action": "accept", + "src": ["group:admins"], + "dst": ["*:22", "*:80", "*:443"] + }, + { + "action": "accept", + "src": ["group:developers"], + "dst": ["tag:development:*"] + } + ] +} +``` + +## Troubleshooting + +### Machine Shows Offline + +**Check Tailscale status**: +```bash +tailscale status +``` + +**Restart Tailscale**: +```bash +# macOS +brew services restart tailscale + +# Linux +sudo systemctl restart tailscaled +``` + +**Re-authenticate**: +```bash +sudo tailscale up +``` + +### Cannot Connect via SSH + +1. **Verify Tailscale connectivity**: + ```bash + tailscale ping machine-name + ``` + +2. **Check SSH is running** on remote: + ```bash + tailscale ssh machine-name "systemctl status sshd" + ``` + +3. **Verify SSH keys**: + ```bash + ssh-add -l + ``` + +4. **Test SSH directly**: + ```bash + ssh -v user@machine-name.tailnet.ts.net + ``` + +### High Latency + +**Check connection method**: +```bash +tailscale status +``` + +Look for "direct" vs "DERP relay": +- **Direct**: Low latency (< 50ms) +- **DERP relay**: Higher latency (100-200ms) + +**Force direct connection**: +```bash +# Ensure both machines can establish P2P +# May require NAT traversal +``` + +### MagicDNS Not Working + +**Enable MagicDNS**: +1. Go to https://login.tailscale.com/admin/dns +2. Enable MagicDNS + +**Verify**: +```bash +nslookup machine-name.tailnet.ts.net +``` + +## Security Best Practices + +1. **Use SSH keys**, not passwords +2. **Enable Tailnet Lock** to prevent unauthorized devices +3. **Use ACLs** to restrict access +4. **Regularly review** connected devices +5. **Set up key expiry** for team members who leave +6. **Use tags** for machine roles +7. **Enable two-factor auth** for Tailscale account + +## Monitoring + +### Check Network Status + +```bash +# All machines +tailscale status + +# Self status +tailscale status --self + +# JSON format for parsing +tailscale status --json +``` + +### View Logs + +```bash +# macOS +tail -f /var/log/tailscaled.log + +# Linux +journalctl -u tailscaled -f +``` + +## Use Cases with sshsync + +### 1. Deploy to All Production Servers + +```bash +sshsync push --group production --recurse ./dist /var/www/app +sshsync group production "cd /var/www/app && pm2 restart all" +``` + +### 2. Collect Logs from All Servers + +```bash +sshsync pull --group production /var/log/app/error.log ./logs/ +``` + +### 3. Update All Homelab Machines + +```bash +sshsync group homelab "sudo apt update && sudo apt upgrade -y" +``` + +### 4. Check Disk Space Everywhere + +```bash +sshsync all "df -h /" +``` + +### 5. 
Sync Configuration Across Machines + +```bash +sshsync push --all ~/dotfiles/.bashrc ~/.bashrc +sshsync push --all ~/dotfiles/.vimrc ~/.vimrc +``` + +## Summary + +Tailscale + sshsync = **Powerful Remote Management** + +- ✅ Secure connections everywhere (WireGuard encryption) +- ✅ No firewall configuration needed +- ✅ Easy addressing (MagicDNS) +- ✅ Multi-host operations (sshsync groups) +- ✅ Works from anywhere + +**Quick Start**: +1. Install Tailscale: `brew install tailscale` +2. Connect: `sudo tailscale up` +3. Configure SSH config with Tailscale hostnames +4. Initialize sshsync: `sshsync sync` +5. Start managing: `sshsync all "uptime"` + +For more: https://tailscale.com/kb/ diff --git a/scripts/load_balancer.py b/scripts/load_balancer.py new file mode 100644 index 0000000..9b162a1 --- /dev/null +++ b/scripts/load_balancer.py @@ -0,0 +1,378 @@ +#!/usr/bin/env python3 +""" +Load balancer for Tailscale SSH Sync Agent. +Intelligent task distribution based on machine resources. +""" + +import sys +from pathlib import Path +from typing import Dict, List, Optional, Tuple +from dataclasses import dataclass +import logging + +# Add utils to path +sys.path.insert(0, str(Path(__file__).parent)) + +from utils.helpers import parse_cpu_load, parse_memory_usage, parse_disk_usage, calculate_load_score, classify_load_status +from sshsync_wrapper import execute_on_host + +logger = logging.getLogger(__name__) + + +@dataclass +class MachineMetrics: + """Resource metrics for a machine.""" + host: str + cpu_pct: float + mem_pct: float + disk_pct: float + load_score: float + status: str + + +def get_machine_load(host: str, timeout: int = 10) -> Optional[MachineMetrics]: + """ + Get CPU, memory, disk metrics for a machine. + + Args: + host: Host to check + timeout: Command timeout + + Returns: + MachineMetrics object or None on failure + + Example: + >>> metrics = get_machine_load("web-01") + >>> metrics.cpu_pct + 45.2 + >>> metrics.load_score + 0.49 + """ + try: + # Get CPU load + cpu_result = execute_on_host(host, "uptime", timeout=timeout) + cpu_data = {} + if cpu_result.get('success'): + cpu_data = parse_cpu_load(cpu_result['stdout']) + + # Get memory usage + mem_result = execute_on_host(host, "free -m 2>/dev/null || vm_stat", timeout=timeout) + mem_data = {} + if mem_result.get('success'): + mem_data = parse_memory_usage(mem_result['stdout']) + + # Get disk usage + disk_result = execute_on_host(host, "df -h / | tail -1", timeout=timeout) + disk_data = {} + if disk_result.get('success'): + disk_data = parse_disk_usage(disk_result['stdout']) + + # Calculate metrics + # CPU: Use 1-min load average, normalize by assuming 4 cores (adjust as needed) + cpu_pct = (cpu_data.get('load_1min', 0) / 4.0) * 100 if cpu_data else 50.0 + + # Memory: Direct percentage + mem_pct = mem_data.get('use_pct', 50.0) + + # Disk: Direct percentage + disk_pct = disk_data.get('use_pct', 50.0) + + # Calculate load score + score = calculate_load_score(cpu_pct, mem_pct, disk_pct) + status = classify_load_status(score) + + return MachineMetrics( + host=host, + cpu_pct=cpu_pct, + mem_pct=mem_pct, + disk_pct=disk_pct, + load_score=score, + status=status + ) + + except Exception as e: + logger.error(f"Error getting load for {host}: {e}") + return None + + +def select_optimal_host(candidates: List[str], + prefer_group: Optional[str] = None, + timeout: int = 10) -> Tuple[Optional[str], Optional[MachineMetrics]]: + """ + Pick best host from candidates based on load. 
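+
+    Selection policy: candidates are ranked by load score (lower is
+    better); when prefer_group is set, a host from that group is chosen
+    whenever its score is within 20% of the overall best.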
+ + Args: + candidates: List of candidate hosts + prefer_group: Prefer hosts from this group if available + timeout: Timeout for metric gathering + + Returns: + Tuple of (selected_host, metrics) + + Example: + >>> host, metrics = select_optimal_host(["web-01", "web-02", "web-03"]) + >>> host + "web-03" + >>> metrics.load_score + 0.28 + """ + if not candidates: + return None, None + + # Get metrics for all candidates + metrics_list: List[MachineMetrics] = [] + + for host in candidates: + metrics = get_machine_load(host, timeout=timeout) + if metrics: + metrics_list.append(metrics) + + if not metrics_list: + logger.warning("No valid metrics collected from candidates") + return None, None + + # Sort by load score (lower is better) + metrics_list.sort(key=lambda m: m.load_score) + + # If prefer_group specified, prioritize those hosts if load is similar + if prefer_group: + from utils.helpers import parse_sshsync_config, get_groups_for_host + groups_config = parse_sshsync_config() + + # Find hosts in preferred group + preferred_metrics = [ + m for m in metrics_list + if prefer_group in get_groups_for_host(m.host, groups_config) + ] + + # Use preferred if load score within 20% of absolute best + if preferred_metrics: + best_score = metrics_list[0].load_score + for m in preferred_metrics: + if m.load_score <= best_score * 1.2: + return m.host, m + + # Return absolute best + best = metrics_list[0] + return best.host, best + + +def get_group_capacity(group: str, timeout: int = 10) -> Dict: + """ + Get aggregate capacity of a group. + + Args: + group: Group name + timeout: Timeout for metric gathering + + Returns: + Dict with aggregate metrics: + { + 'hosts': List[MachineMetrics], + 'total_hosts': int, + 'avg_cpu': float, + 'avg_mem': float, + 'avg_disk': float, + 'avg_load_score': float, + 'total_capacity': str # descriptive + } + + Example: + >>> capacity = get_group_capacity("production") + >>> capacity['avg_load_score'] + 0.45 + """ + from utils.helpers import parse_sshsync_config + + groups_config = parse_sshsync_config() + group_hosts = groups_config.get(group, []) + + if not group_hosts: + return { + 'error': f'Group {group} not found or has no members', + 'hosts': [] + } + + # Get metrics for all hosts in group + metrics_list: List[MachineMetrics] = [] + + for host in group_hosts: + metrics = get_machine_load(host, timeout=timeout) + if metrics: + metrics_list.append(metrics) + + if not metrics_list: + return { + 'error': f'Could not get metrics for any hosts in {group}', + 'hosts': [] + } + + # Calculate aggregates + avg_cpu = sum(m.cpu_pct for m in metrics_list) / len(metrics_list) + avg_mem = sum(m.mem_pct for m in metrics_list) / len(metrics_list) + avg_disk = sum(m.disk_pct for m in metrics_list) / len(metrics_list) + avg_score = sum(m.load_score for m in metrics_list) / len(metrics_list) + + # Determine overall capacity description + if avg_score < 0.4: + capacity_desc = "High capacity available" + elif avg_score < 0.7: + capacity_desc = "Moderate capacity" + else: + capacity_desc = "Limited capacity" + + return { + 'group': group, + 'hosts': metrics_list, + 'total_hosts': len(metrics_list), + 'available_hosts': len(group_hosts), + 'avg_cpu': avg_cpu, + 'avg_mem': avg_mem, + 'avg_disk': avg_disk, + 'avg_load_score': avg_score, + 'total_capacity': capacity_desc + } + + +def distribute_tasks(tasks: List[Dict], hosts: List[str], + timeout: int = 10) -> Dict[str, List[Dict]]: + """ + Distribute multiple tasks optimally across hosts. 
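+
+    Greedy policy: heavier tasks are assigned first, each to the host
+    with the lowest simulated load, which is then bumped by 0.1 x the
+    task's weight so later tasks spread out.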
+ + Args: + tasks: List of task dicts (each with 'command', 'priority', etc) + hosts: Available hosts + timeout: Timeout for metric gathering + + Returns: + Dict mapping hosts to assigned tasks + + Algorithm: + - Get current load for all hosts + - Assign tasks to least loaded hosts + - Balance by estimated task weight + + Example: + >>> tasks = [ + ... {'command': 'npm run build', 'weight': 3}, + ... {'command': 'npm test', 'weight': 2} + ... ] + >>> distribution = distribute_tasks(tasks, ["web-01", "web-02"]) + >>> distribution["web-01"] + [{'command': 'npm run build', 'weight': 3}] + """ + if not tasks or not hosts: + return {} + + # Get current load for all hosts + host_metrics = {} + for host in hosts: + metrics = get_machine_load(host, timeout=timeout) + if metrics: + host_metrics[host] = metrics + + if not host_metrics: + logger.error("No valid host metrics available") + return {} + + # Initialize assignment + assignment: Dict[str, List[Dict]] = {host: [] for host in host_metrics.keys()} + host_loads = {host: m.load_score for host, m in host_metrics.items()} + + # Sort tasks by weight (descending) to assign heavy tasks first + sorted_tasks = sorted( + tasks, + key=lambda t: t.get('weight', 1), + reverse=True + ) + + # Assign each task to least loaded host + for task in sorted_tasks: + # Find host with minimum current load + min_host = min(host_loads.keys(), key=lambda h: host_loads[h]) + + # Assign task + assignment[min_host].append(task) + + # Update simulated load (add task weight normalized) + task_weight = task.get('weight', 1) + host_loads[min_host] += (task_weight * 0.1) # 0.1 = scaling factor + + return assignment + + +def format_load_report(metrics: MachineMetrics, compare_to_avg: Optional[Dict] = None) -> str: + """ + Format load metrics as human-readable report. + + Args: + metrics: Machine metrics + compare_to_avg: Optional dict with avg_cpu, avg_mem, avg_disk for comparison + + Returns: + Formatted report string + + Example: + >>> metrics = MachineMetrics('web-01', 45, 60, 40, 0.49, 'moderate') + >>> print(format_load_report(metrics)) + web-01: Load Score: 0.49 (moderate) + CPU: 45.0% | Memory: 60.0% | Disk: 40.0% + """ + lines = [ + f"{metrics.host}: Load Score: {metrics.load_score:.2f} ({metrics.status})", + f" CPU: {metrics.cpu_pct:.1f}% | Memory: {metrics.mem_pct:.1f}% | Disk: {metrics.disk_pct:.1f}%" + ] + + if compare_to_avg: + cpu_vs = metrics.cpu_pct - compare_to_avg.get('avg_cpu', 0) + mem_vs = metrics.mem_pct - compare_to_avg.get('avg_mem', 0) + disk_vs = metrics.disk_pct - compare_to_avg.get('avg_disk', 0) + + comparisons = [] + if abs(cpu_vs) > 10: + comparisons.append(f"CPU {'+' if cpu_vs > 0 else ''}{cpu_vs:.0f}% vs avg") + if abs(mem_vs) > 10: + comparisons.append(f"Mem {'+' if mem_vs > 0 else ''}{mem_vs:.0f}% vs avg") + if abs(disk_vs) > 10: + comparisons.append(f"Disk {'+' if disk_vs > 0 else ''}{disk_vs:.0f}% vs avg") + + if comparisons: + lines.append(f" vs Average: {' | '.join(comparisons)}") + + return "\n".join(lines) + + +def main(): + """Test load balancer functions.""" + print("Testing load balancer...\n") + + print("1. 
Testing select_optimal_host:") + print(" (Requires configured hosts - using dry-run simulation)") + + # Simulate metrics + test_metrics = [ + MachineMetrics('web-01', 45, 60, 40, 0.49, 'moderate'), + MachineMetrics('web-02', 85, 70, 65, 0.75, 'high'), + MachineMetrics('web-03', 20, 35, 30, 0.28, 'low'), + ] + + # Sort by score + test_metrics.sort(key=lambda m: m.load_score) + best = test_metrics[0] + + print(f" ✓ Best host: {best.host} (score: {best.load_score:.2f})") + print(f" Reason: {best.status} load") + + print("\n2. Format load report:") + report = format_load_report(test_metrics[0], { + 'avg_cpu': 50, + 'avg_mem': 55, + 'avg_disk': 45 + }) + print(report) + + print("\n✅ Load balancer tested") + + +if __name__ == "__main__": + main() diff --git a/scripts/sshsync_wrapper.py b/scripts/sshsync_wrapper.py new file mode 100644 index 0000000..c240a90 --- /dev/null +++ b/scripts/sshsync_wrapper.py @@ -0,0 +1,409 @@ +#!/usr/bin/env python3 +""" +SSH Sync wrapper for Tailscale SSH Sync Agent. +Python interface to sshsync CLI operations. +""" + +import subprocess +import sys +from pathlib import Path +from typing import Dict, List, Optional, Tuple +import json +import logging + +# Add utils to path +sys.path.insert(0, str(Path(__file__).parent)) + +from utils.helpers import parse_ssh_config, parse_sshsync_config, format_bytes, format_duration +from utils.validators import validate_host, validate_group, validate_path_exists, validate_timeout, validate_command + +logger = logging.getLogger(__name__) + + +def get_host_status(group: Optional[str] = None) -> Dict: + """ + Get online/offline status of hosts. + + Args: + group: Optional group to filter (None = all hosts) + + Returns: + Dict with status info + + Example: + >>> status = get_host_status() + >>> status['online_count'] + 8 + """ + try: + # Run sshsync ls --with-status + cmd = ["sshsync", "ls", "--with-status"] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30) + + if result.returncode != 0: + return {'error': result.stderr, 'hosts': []} + + # Parse output + hosts = [] + for line in result.stdout.strip().split('\n'): + if not line or line.startswith('Host') or line.startswith('---'): + continue + + parts = line.split() + if len(parts) >= 2: + host_name = parts[0] + status = parts[1] if len(parts) > 1 else 'unknown' + + hosts.append({ + 'host': host_name, + 'online': status.lower() in ['online', 'reachable', '✓'], + 'status': status + }) + + # Filter by group if specified + if group: + groups_config = parse_sshsync_config() + group_hosts = groups_config.get(group, []) + hosts = [h for h in hosts if h['host'] in group_hosts] + + online_count = sum(1 for h in hosts if h['online']) + + return { + 'hosts': hosts, + 'total_count': len(hosts), + 'online_count': online_count, + 'offline_count': len(hosts) - online_count, + 'availability_pct': (online_count / len(hosts) * 100) if hosts else 0 + } + + except Exception as e: + logger.error(f"Error getting host status: {e}") + return {'error': str(e), 'hosts': []} + + +def execute_on_all(command: str, timeout: int = 10, dry_run: bool = False) -> Dict: + """ + Execute command on all hosts. 
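+
+    Runs `sshsync all --timeout=<timeout> <command>` under the hood; the
+    combined sshsync stdout is returned raw rather than parsed per host.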
+ + Args: + command: Command to execute + timeout: Timeout in seconds + dry_run: If True, don't actually execute + + Returns: + Dict with results per host + + Example: + >>> result = execute_on_all("uptime", timeout=15) + >>> len(result['results']) + 10 + """ + validate_command(command) + validate_timeout(timeout) + + if dry_run: + return { + 'dry_run': True, + 'command': command, + 'message': 'Would execute on all hosts' + } + + try: + cmd = ["sshsync", "all", f"--timeout={timeout}", command] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout + 30) + + # Parse results (format varies, simplified here) + return { + 'success': result.returncode == 0, + 'stdout': result.stdout, + 'stderr': result.stderr, + 'command': command + } + + except subprocess.TimeoutExpired: + return {'error': f'Command timed out after {timeout}s'} + except Exception as e: + return {'error': str(e)} + + +def execute_on_group(group: str, command: str, timeout: int = 10, dry_run: bool = False) -> Dict: + """ + Execute command on specific group. + + Args: + group: Group name + command: Command to execute + timeout: Timeout in seconds + dry_run: Preview without executing + + Returns: + Dict with execution results + + Example: + >>> result = execute_on_group("web-servers", "df -h /var/www") + >>> result['success'] + True + """ + groups_config = parse_sshsync_config() + validate_group(group, list(groups_config.keys())) + validate_command(command) + validate_timeout(timeout) + + if dry_run: + group_hosts = groups_config.get(group, []) + return { + 'dry_run': True, + 'group': group, + 'hosts': group_hosts, + 'command': command, + 'message': f'Would execute on {len(group_hosts)} hosts in group {group}' + } + + try: + cmd = ["sshsync", "group", f"--timeout={timeout}", group, command] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout + 30) + + return { + 'success': result.returncode == 0, + 'group': group, + 'stdout': result.stdout, + 'stderr': result.stderr, + 'command': command + } + + except subprocess.TimeoutExpired: + return {'error': f'Command timed out after {timeout}s'} + except Exception as e: + return {'error': str(e)} + + +def execute_on_host(host: str, command: str, timeout: int = 10) -> Dict: + """ + Execute command on single host. + + Args: + host: Host name + command: Command to execute + timeout: Timeout in seconds + + Returns: + Dict with result + + Example: + >>> result = execute_on_host("web-01", "hostname") + >>> result['stdout'] + "web-01" + """ + ssh_hosts = parse_ssh_config() + validate_host(host, list(ssh_hosts.keys())) + validate_command(command) + validate_timeout(timeout) + + try: + cmd = ["ssh", "-o", f"ConnectTimeout={timeout}", host, command] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout + 5) + + return { + 'success': result.returncode == 0, + 'host': host, + 'stdout': result.stdout, + 'stderr': result.stderr, + 'command': command + } + + except subprocess.TimeoutExpired: + return {'error': f'Command timed out after {timeout}s'} + except Exception as e: + return {'error': str(e)} + + +def push_to_hosts(local_path: str, remote_path: str, + hosts: Optional[List[str]] = None, + group: Optional[str] = None, + recurse: bool = False, + dry_run: bool = False) -> Dict: + """ + Push files to hosts. 
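+
+    Target precedence: explicit `hosts` entries win over `group`; when
+    both are None, the push targets every configured host (`--all`).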
+ + Args: + local_path: Local file/directory path + remote_path: Remote destination path + hosts: Specific hosts (None = all if group also None) + group: Group name + recurse: Recursive copy + dry_run: Preview without executing + + Returns: + Dict with push results + + Example: + >>> result = push_to_hosts("./dist", "/var/www/app", group="production", recurse=True) + >>> result['success'] + True + """ + validate_path_exists(local_path) + + if dry_run: + return { + 'dry_run': True, + 'local_path': local_path, + 'remote_path': remote_path, + 'hosts': hosts, + 'group': group, + 'recurse': recurse, + 'message': 'Would push files' + } + + try: + cmd = ["sshsync", "push"] + + if hosts: + for host in hosts: + cmd.extend(["--host", host]) + elif group: + cmd.extend(["--group", group]) + else: + cmd.append("--all") + + if recurse: + cmd.append("--recurse") + + cmd.extend([local_path, remote_path]) + + result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) + + return { + 'success': result.returncode == 0, + 'stdout': result.stdout, + 'stderr': result.stderr, + 'local_path': local_path, + 'remote_path': remote_path + } + + except subprocess.TimeoutExpired: + return {'error': 'Push operation timed out'} + except Exception as e: + return {'error': str(e)} + + +def pull_from_host(host: str, remote_path: str, local_path: str, + recurse: bool = False, dry_run: bool = False) -> Dict: + """ + Pull files from host. + + Args: + host: Host to pull from + remote_path: Remote file/directory path + local_path: Local destination path + recurse: Recursive copy + dry_run: Preview without executing + + Returns: + Dict with pull results + + Example: + >>> result = pull_from_host("web-01", "/var/log/nginx", "./logs", recurse=True) + >>> result['success'] + True + """ + ssh_hosts = parse_ssh_config() + validate_host(host, list(ssh_hosts.keys())) + + if dry_run: + return { + 'dry_run': True, + 'host': host, + 'remote_path': remote_path, + 'local_path': local_path, + 'recurse': recurse, + 'message': f'Would pull from {host}' + } + + try: + cmd = ["sshsync", "pull", "--host", host] + + if recurse: + cmd.append("--recurse") + + cmd.extend([remote_path, local_path]) + + result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) + + return { + 'success': result.returncode == 0, + 'host': host, + 'stdout': result.stdout, + 'stderr': result.stderr, + 'remote_path': remote_path, + 'local_path': local_path + } + + except subprocess.TimeoutExpired: + return {'error': 'Pull operation timed out'} + except Exception as e: + return {'error': str(e)} + + +def list_hosts(with_status: bool = True) -> Dict: + """ + List all configured hosts. + + Args: + with_status: Include online/offline status + + Returns: + Dict with hosts info + + Example: + >>> result = list_hosts(with_status=True) + >>> len(result['hosts']) + 10 + """ + if with_status: + return get_host_status() + else: + ssh_hosts = parse_ssh_config() + return { + 'hosts': [{'host': name} for name in ssh_hosts.keys()], + 'count': len(ssh_hosts) + } + + +def get_groups() -> Dict[str, List[str]]: + """ + Get all defined groups and their members. + + Returns: + Dict mapping group names to host lists + + Example: + >>> groups = get_groups() + >>> groups['production'] + ['prod-web-01', 'prod-db-01'] + """ + return parse_sshsync_config() + + +def main(): + """Test sshsync wrapper functions.""" + print("Testing sshsync wrapper...\n") + + print("1. 
List hosts:") + result = list_hosts(with_status=False) + print(f" Found {result.get('count', 0)} hosts") + + print("\n2. Get groups:") + groups = get_groups() + print(f" Found {len(groups)} groups") + for group, hosts in groups.items(): + print(f" - {group}: {len(hosts)} hosts") + + print("\n3. Test dry-run:") + result = execute_on_all("uptime", dry_run=True) + print(f" Dry-run: {result.get('message', 'OK')}") + + print("\n✅ sshsync wrapper tested") + + +if __name__ == "__main__": + main() diff --git a/scripts/tailscale_manager.py b/scripts/tailscale_manager.py new file mode 100644 index 0000000..2867638 --- /dev/null +++ b/scripts/tailscale_manager.py @@ -0,0 +1,426 @@ +#!/usr/bin/env python3 +""" +Tailscale manager for Tailscale SSH Sync Agent. +Tailscale-specific operations and status management. +""" + +import subprocess +import re +import json +from typing import Dict, List, Optional +from dataclasses import dataclass +import logging + +logger = logging.getLogger(__name__) + + +@dataclass +class TailscalePeer: + """Represents a Tailscale peer.""" + hostname: str + ip: str + online: bool + last_seen: Optional[str] = None + os: Optional[str] = None + relay: Optional[str] = None + + +def get_tailscale_status() -> Dict: + """ + Get Tailscale network status (all peers). + + Returns: + Dict with network status: + { + 'connected': bool, + 'peers': List[TailscalePeer], + 'online_count': int, + 'total_count': int, + 'self_ip': str + } + + Example: + >>> status = get_tailscale_status() + >>> status['online_count'] + 8 + >>> status['peers'][0].hostname + 'homelab-1' + """ + try: + # Get status in JSON format + result = subprocess.run( + ["tailscale", "status", "--json"], + capture_output=True, + text=True, + timeout=10 + ) + + if result.returncode != 0: + # Try text format if JSON fails + result = subprocess.run( + ["tailscale", "status"], + capture_output=True, + text=True, + timeout=10 + ) + + if result.returncode != 0: + return { + 'connected': False, + 'error': 'Tailscale not running or accessible', + 'peers': [] + } + + # Parse text format + return _parse_text_status(result.stdout) + + # Parse JSON format + data = json.loads(result.stdout) + return _parse_json_status(data) + + except FileNotFoundError: + return { + 'connected': False, + 'error': 'Tailscale not installed', + 'peers': [] + } + except subprocess.TimeoutExpired: + return { + 'connected': False, + 'error': 'Timeout getting Tailscale status', + 'peers': [] + } + except Exception as e: + logger.error(f"Error getting Tailscale status: {e}") + return { + 'connected': False, + 'error': str(e), + 'peers': [] + } + + +def _parse_json_status(data: Dict) -> Dict: + """Parse Tailscale JSON status.""" + peers = [] + + self_data = data.get('Self', {}) + self_ip = self_data.get('TailscaleIPs', [''])[0] + + for peer_id, peer_data in data.get('Peer', {}).items(): + hostname = peer_data.get('HostName', 'unknown') + ips = peer_data.get('TailscaleIPs', []) + ip = ips[0] if ips else 'unknown' + online = peer_data.get('Online', False) + os = peer_data.get('OS', 'unknown') + + peers.append(TailscalePeer( + hostname=hostname, + ip=ip, + online=online, + os=os + )) + + online_count = sum(1 for p in peers if p.online) + + return { + 'connected': True, + 'peers': peers, + 'online_count': online_count, + 'total_count': len(peers), + 'self_ip': self_ip + } + + +def _parse_text_status(output: str) -> Dict: + """Parse Tailscale text status output.""" + peers = [] + self_ip = None + + for line in output.strip().split('\n'): + line = line.strip() + if not 
line:
+            continue
+
+        # Parse format: ip hostname user os status
+        parts = line.split()
+        if len(parts) >= 2:
+            ip = parts[0]
+            hostname = parts[1]
+
+            # The text format has no reliable marker for the local
+            # machine, so self_ip is only filled in by the JSON parser.
+
+            # Determine online status from additional fields
+            online = 'offline' not in line.lower()
+
+            peers.append(TailscalePeer(
+                hostname=hostname,
+                ip=ip,
+                online=online
+            ))
+
+    online_count = sum(1 for p in peers if p.online)
+
+    return {
+        'connected': True,
+        'peers': peers,
+        'online_count': online_count,
+        'total_count': len(peers),
+        'self_ip': self_ip or 'unknown'
+    }
+
+
+def check_connectivity(host: str, timeout: int = 5) -> bool:
+    """
+    Ping host via Tailscale.
+
+    Args:
+        host: Hostname to ping
+        timeout: Timeout in seconds
+
+    Returns:
+        True if host responds to ping
+
+    Example:
+        >>> check_connectivity("homelab-1")
+        True
+    """
+    try:
+        result = subprocess.run(
+            ["tailscale", "ping", "--timeout", f"{timeout}s", "--c", "1", host],
+            capture_output=True,
+            text=True,
+            timeout=timeout + 2
+        )
+
+        # Check if ping succeeded
+        return result.returncode == 0 or 'pong' in result.stdout.lower()
+
+    except (FileNotFoundError, subprocess.TimeoutExpired):
+        return False
+    except Exception as e:
+        logger.error(f"Error pinging {host}: {e}")
+        return False
+
+
+def get_peer_info(hostname: str) -> Optional[TailscalePeer]:
+    """
+    Get detailed info about a specific peer.
+
+    Args:
+        hostname: Peer hostname
+
+    Returns:
+        TailscalePeer object or None if not found
+
+    Example:
+        >>> peer = get_peer_info("homelab-1")
+        >>> peer.ip
+        '100.64.1.10'
+    """
+    status = get_tailscale_status()
+
+    if not status.get('connected'):
+        return None
+
+    for peer in status.get('peers', []):
+        if peer.hostname == hostname or hostname in peer.hostname:
+            return peer
+
+    return None
+
+
+def list_online_machines() -> List[str]:
+    """
+    List all online Tailscale machines.
+
+    Returns:
+        List of online machine hostnames
+
+    Example:
+        >>> machines = list_online_machines()
+        >>> len(machines)
+        8
+    """
+    status = get_tailscale_status()
+
+    if not status.get('connected'):
+        return []
+
+    return [
+        peer.hostname
+        for peer in status.get('peers', [])
+        if peer.online
+    ]
+
+
+def get_machine_ip(hostname: str) -> Optional[str]:
+    """
+    Get Tailscale IP for a machine.
+
+    Args:
+        hostname: Machine hostname
+
+    Returns:
+        IP address or None if not found
+
+    Example:
+        >>> ip = get_machine_ip("homelab-1")
+        >>> ip
+        '100.64.1.10'
+    """
+    peer = get_peer_info(hostname)
+    return peer.ip if peer else None
+
+
+def validate_tailscale_ssh(host: str, timeout: int = 10) -> Dict:
+    """
+    Check if Tailscale SSH is working for a host.
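+
+    Checks run in order: peer lookup, online flag, Tailscale ping, then a
+    test command over `tailscale ssh`; the first failing step returns a
+    result whose `details` show how far the checks got.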
+ + Args: + host: Host to check + timeout: Connection timeout + + Returns: + Dict with validation results: + { + 'working': bool, + 'message': str, + 'details': Dict + } + + Example: + >>> result = validate_tailscale_ssh("homelab-1") + >>> result['working'] + True + """ + # First check if host is in Tailscale network + peer = get_peer_info(host) + + if not peer: + return { + 'working': False, + 'message': f'Host {host} not found in Tailscale network', + 'details': {'peer_found': False} + } + + if not peer.online: + return { + 'working': False, + 'message': f'Host {host} is offline in Tailscale', + 'details': {'peer_found': True, 'online': False} + } + + # Check connectivity + if not check_connectivity(host, timeout=timeout): + return { + 'working': False, + 'message': f'Cannot ping {host} via Tailscale', + 'details': {'peer_found': True, 'online': True, 'ping': False} + } + + # Try SSH connection + try: + result = subprocess.run( + ["tailscale", "ssh", host, "echo", "test"], + capture_output=True, + text=True, + timeout=timeout + ) + + if result.returncode == 0: + return { + 'working': True, + 'message': f'Tailscale SSH to {host} is working', + 'details': { + 'peer_found': True, + 'online': True, + 'ping': True, + 'ssh': True, + 'ip': peer.ip + } + } + else: + return { + 'working': False, + 'message': f'Tailscale SSH failed: {result.stderr}', + 'details': { + 'peer_found': True, + 'online': True, + 'ping': True, + 'ssh': False, + 'error': result.stderr + } + } + + except subprocess.TimeoutExpired: + return { + 'working': False, + 'message': f'Tailscale SSH timed out after {timeout}s', + 'details': {'timeout': True} + } + except Exception as e: + return { + 'working': False, + 'message': f'Error testing Tailscale SSH: {e}', + 'details': {'error': str(e)} + } + + +def get_network_summary() -> str: + """ + Get human-readable network summary. + + Returns: + Formatted summary string + + Example: + >>> print(get_network_summary()) + Tailscale Network: Connected + Online: 8/10 machines (80%) + Self IP: 100.64.1.5 + """ + status = get_tailscale_status() + + if not status.get('connected'): + return "Tailscale Network: Not connected\nError: {}".format( + status.get('error', 'Unknown error') + ) + + lines = [ + "Tailscale Network: Connected", + f"Online: {status['online_count']}/{status['total_count']} machines ({status['online_count']/status['total_count']*100:.0f}%)", + f"Self IP: {status.get('self_ip', 'unknown')}" + ] + + return "\n".join(lines) + + +def main(): + """Test Tailscale manager functions.""" + print("Testing Tailscale manager...\n") + + print("1. Get Tailscale status:") + status = get_tailscale_status() + if status.get('connected'): + print(f" ✓ Connected") + print(f" Peers: {status['total_count']} total, {status['online_count']} online") + else: + print(f" ✗ Not connected: {status.get('error', 'Unknown error')}") + + print("\n2. List online machines:") + machines = list_online_machines() + print(f" Found {len(machines)} online machines") + for machine in machines[:5]: # Show first 5 + print(f" - {machine}") + + print("\n3. Network summary:") + print(get_network_summary()) + + print("\n✅ Tailscale manager tested") + + +if __name__ == "__main__": + main() diff --git a/scripts/utils/helpers.py b/scripts/utils/helpers.py new file mode 100644 index 0000000..5b0921b --- /dev/null +++ b/scripts/utils/helpers.py @@ -0,0 +1,628 @@ +#!/usr/bin/env python3 +""" +Helper utilities for Tailscale SSH Sync Agent. +Provides common formatting, parsing, and utility functions. 
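+
+Typical usage (illustrative; host and group names are placeholders):
+
+    from utils.helpers import format_bytes, parse_sshsync_config
+
+    groups = parse_sshsync_config()   # e.g. {'production': ['web-01']}
+    print(format_bytes(1536 * 1024))  # "1.5 MB"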
+""" + +import os +import re +import subprocess +from datetime import datetime, timedelta +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Any +import yaml +import logging + +logger = logging.getLogger(__name__) + + +def format_bytes(bytes_value: int) -> str: + """ + Format bytes as human-readable string. + + Args: + bytes_value: Number of bytes + + Returns: + Formatted string (e.g., "12.3 MB", "1.5 GB") + + Example: + >>> format_bytes(12582912) + "12.0 MB" + >>> format_bytes(1610612736) + "1.5 GB" + """ + for unit in ['B', 'KB', 'MB', 'GB', 'TB']: + if bytes_value < 1024.0: + return f"{bytes_value:.1f} {unit}" + bytes_value /= 1024.0 + return f"{bytes_value:.1f} PB" + + +def format_duration(seconds: float) -> str: + """ + Format duration as human-readable string. + + Args: + seconds: Duration in seconds + + Returns: + Formatted string (e.g., "2m 15s", "1h 30m") + + Example: + >>> format_duration(135) + "2m 15s" + >>> format_duration(5430) + "1h 30m 30s" + """ + if seconds < 60: + return f"{int(seconds)}s" + + minutes = int(seconds // 60) + secs = int(seconds % 60) + + if minutes < 60: + return f"{minutes}m {secs}s" if secs > 0 else f"{minutes}m" + + hours = minutes // 60 + minutes = minutes % 60 + + parts = [f"{hours}h"] + if minutes > 0: + parts.append(f"{minutes}m") + if secs > 0 and hours == 0: # Only show seconds if < 1 hour + parts.append(f"{secs}s") + + return " ".join(parts) + + +def format_percentage(value: float, decimals: int = 1) -> str: + """ + Format percentage with specified decimals. + + Args: + value: Percentage value (0-100) + decimals: Number of decimal places + + Returns: + Formatted string (e.g., "45.5%") + + Example: + >>> format_percentage(45.567) + "45.6%" + """ + return f"{value:.{decimals}f}%" + + +def parse_ssh_config(config_path: Optional[Path] = None) -> Dict[str, Dict[str, str]]: + """ + Parse SSH config file for host definitions. + + Args: + config_path: Path to SSH config (default: ~/.ssh/config) + + Returns: + Dict mapping host aliases to their configuration: + { + 'host-alias': { + 'hostname': '100.64.1.10', + 'user': 'admin', + 'port': '22', + 'identityfile': '~/.ssh/id_ed25519' + } + } + + Example: + >>> hosts = parse_ssh_config() + >>> hosts['homelab-1']['hostname'] + '100.64.1.10' + """ + if config_path is None: + config_path = Path.home() / '.ssh' / 'config' + + if not config_path.exists(): + logger.warning(f"SSH config not found: {config_path}") + return {} + + hosts = {} + current_host = None + + try: + with open(config_path, 'r') as f: + for line in f: + line = line.strip() + + # Skip comments and empty lines + if not line or line.startswith('#'): + continue + + # Host directive + if line.lower().startswith('host '): + host_alias = line.split(maxsplit=1)[1] + # Skip wildcards + if '*' not in host_alias and '?' not in host_alias: + current_host = host_alias + hosts[current_host] = {} + + # Configuration directives + elif current_host: + parts = line.split(maxsplit=1) + if len(parts) == 2: + key, value = parts + hosts[current_host][key.lower()] = value + + return hosts + + except Exception as e: + logger.error(f"Error parsing SSH config: {e}") + return {} + + +def parse_sshsync_config(config_path: Optional[Path] = None) -> Dict[str, List[str]]: + """ + Parse sshsync config file for group definitions. 
+ + Args: + config_path: Path to sshsync config (default: ~/.config/sshsync/config.yaml) + + Returns: + Dict mapping group names to list of hosts: + { + 'production': ['prod-web-01', 'prod-db-01'], + 'development': ['dev-laptop', 'dev-desktop'] + } + + Example: + >>> groups = parse_sshsync_config() + >>> groups['production'] + ['prod-web-01', 'prod-db-01'] + """ + if config_path is None: + config_path = Path.home() / '.config' / 'sshsync' / 'config.yaml' + + if not config_path.exists(): + logger.warning(f"sshsync config not found: {config_path}") + return {} + + try: + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + + return config.get('groups', {}) + + except Exception as e: + logger.error(f"Error parsing sshsync config: {e}") + return {} + + +def get_timestamp(iso: bool = True) -> str: + """ + Get current timestamp. + + Args: + iso: If True, return ISO format; otherwise human-readable + + Returns: + Timestamp string + + Example: + >>> get_timestamp(iso=True) + "2025-10-19T19:43:41Z" + >>> get_timestamp(iso=False) + "2025-10-19 19:43:41" + """ + now = datetime.now() + if iso: + return now.strftime("%Y-%m-%dT%H:%M:%SZ") + else: + return now.strftime("%Y-%m-%d %H:%M:%S") + + +def safe_execute(func, *args, default=None, **kwargs) -> Any: + """ + Execute function with error handling. + + Args: + func: Function to execute + *args: Positional arguments + default: Value to return on error + **kwargs: Keyword arguments + + Returns: + Function result or default on error + + Example: + >>> safe_execute(int, "not_a_number", default=0) + 0 + >>> safe_execute(int, "42") + 42 + """ + try: + return func(*args, **kwargs) + except Exception as e: + logger.error(f"Error executing {func.__name__}: {e}") + return default + + +def validate_path(path: str, must_exist: bool = True) -> bool: + """ + Check if path is valid and accessible. + + Args: + path: Path to validate + must_exist: If True, path must exist + + Returns: + True if valid, False otherwise + + Example: + >>> validate_path("/tmp") + True + >>> validate_path("/nonexistent", must_exist=True) + False + """ + p = Path(path).expanduser() + + if must_exist: + return p.exists() + else: + # Check if parent directory exists (for paths that will be created) + return p.parent.exists() + + +def parse_disk_usage(df_output: str) -> Dict[str, Any]: + """ + Parse 'df' command output. + + Args: + df_output: Output from 'df -h' command + + Returns: + Dict with disk usage info: + { + 'filesystem': '/dev/sda1', + 'size': '100G', + 'used': '45G', + 'available': '50G', + 'use_pct': 45, + 'mount': '/' + } + + Example: + >>> output = "Filesystem Size Used Avail Use% Mounted on\\n/dev/sda1 100G 45G 50G 45% /" + >>> parse_disk_usage(output) + {'filesystem': '/dev/sda1', 'size': '100G', ...} + """ + lines = df_output.strip().split('\n') + if len(lines) < 2: + return {} + + # Parse last line (actual data, not header) + data_line = lines[-1] + parts = data_line.split() + + if len(parts) < 6: + return {} + + try: + return { + 'filesystem': parts[0], + 'size': parts[1], + 'used': parts[2], + 'available': parts[3], + 'use_pct': int(parts[4].rstrip('%')), + 'mount': parts[5] + } + except (ValueError, IndexError) as e: + logger.error(f"Error parsing disk usage: {e}") + return {} + + +def parse_memory_usage(free_output: str) -> Dict[str, Any]: + """ + Parse 'free' command output (Linux). 
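+
+    Assumes the classic `free -m` layout where the `Mem:` row starts
+    with the total, used, and free columns; only those first three
+    numeric fields are read, so `buff/cache` and `available` are ignored.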
+ + Args: + free_output: Output from 'free -m' command + + Returns: + Dict with memory info: + { + 'total': 16384, # MB + 'used': 8192, + 'free': 8192, + 'use_pct': 50.0 + } + + Example: + >>> output = "Mem: 16384 8192 8192 0 0 0" + >>> parse_memory_usage(output) + {'total': 16384, 'used': 8192, ...} + """ + lines = free_output.strip().split('\n') + + for line in lines: + if line.startswith('Mem:'): + parts = line.split() + if len(parts) >= 3: + try: + total = int(parts[1]) + used = int(parts[2]) + free = int(parts[3]) if len(parts) > 3 else (total - used) + + return { + 'total': total, + 'used': used, + 'free': free, + 'use_pct': (used / total * 100) if total > 0 else 0 + } + except (ValueError, IndexError) as e: + logger.error(f"Error parsing memory usage: {e}") + + return {} + + +def parse_cpu_load(uptime_output: str) -> Dict[str, float]: + """ + Parse 'uptime' command output for load averages. + + Args: + uptime_output: Output from 'uptime' command + + Returns: + Dict with load averages: + { + 'load_1min': 0.45, + 'load_5min': 0.38, + 'load_15min': 0.32 + } + + Example: + >>> output = "19:43:41 up 5 days, 2:15, 3 users, load average: 0.45, 0.38, 0.32" + >>> parse_cpu_load(output) + {'load_1min': 0.45, 'load_5min': 0.38, 'load_15min': 0.32} + """ + # Find "load average:" part + match = re.search(r'load average:\s+([\d.]+),\s+([\d.]+),\s+([\d.]+)', uptime_output) + + if match: + try: + return { + 'load_1min': float(match.group(1)), + 'load_5min': float(match.group(2)), + 'load_15min': float(match.group(3)) + } + except ValueError as e: + logger.error(f"Error parsing CPU load: {e}") + + return {} + + +def format_host_status(host: str, online: bool, groups: List[str], + latency: Optional[int] = None, + tailscale_connected: bool = False) -> str: + """ + Format host status as display string. + + Args: + host: Host name + online: Whether host is online + groups: List of groups host belongs to + latency: Latency in ms (optional) + tailscale_connected: Tailscale connection status + + Returns: + Formatted status string + + Example: + >>> format_host_status("web-01", True, ["production", "web"], 25, True) + "🟢 web-01 (production, web) - Online - Tailscale: Connected | Latency: 25ms" + """ + icon = "🟢" if online else "🔴" + status = "Online" if online else "Offline" + group_str = ", ".join(groups) if groups else "no group" + + parts = [f"{icon} {host} ({group_str}) - {status}"] + + if tailscale_connected: + parts.append("Tailscale: Connected") + + if latency is not None and online: + parts.append(f"Latency: {latency}ms") + + return " - ".join(parts) + + +def calculate_load_score(cpu_pct: float, mem_pct: float, disk_pct: float) -> float: + """ + Calculate composite load score for a machine. + + Args: + cpu_pct: CPU usage percentage (0-100) + mem_pct: Memory usage percentage (0-100) + disk_pct: Disk usage percentage (0-100) + + Returns: + Load score (0-1, lower is better) + + Formula: + score = (cpu * 0.4) + (mem * 0.3) + (disk * 0.3) + + Example: + >>> calculate_load_score(45, 60, 40) + 0.48 # (0.45*0.4 + 0.60*0.3 + 0.40*0.3) + """ + return (cpu_pct * 0.4 + mem_pct * 0.3 + disk_pct * 0.3) / 100 + + +def classify_load_status(score: float) -> str: + """ + Classify load score into status category. 
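+
+    Thresholds (matching the implementation):
+
+        >>> classify_load_status(0.39)
+        "low"
+        >>> classify_load_status(0.40)
+        "moderate"
+        >>> classify_load_status(0.70)
+        "high"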
+ + Args: + score: Load score (0-1) + + Returns: + Status string: "low", "moderate", or "high" + + Example: + >>> classify_load_status(0.28) + "low" + >>> classify_load_status(0.55) + "moderate" + >>> classify_load_status(0.82) + "high" + """ + if score < 0.4: + return "low" + elif score < 0.7: + return "moderate" + else: + return "high" + + +def classify_latency(latency_ms: int) -> Tuple[str, str]: + """ + Classify network latency. + + Args: + latency_ms: Latency in milliseconds + + Returns: + Tuple of (status, description) + + Example: + >>> classify_latency(25) + ("excellent", "Ideal for interactive tasks") + >>> classify_latency(150) + ("fair", "May impact interactive workflows") + """ + if latency_ms < 50: + return ("excellent", "Ideal for interactive tasks") + elif latency_ms < 100: + return ("good", "Suitable for most operations") + elif latency_ms < 200: + return ("fair", "May impact interactive workflows") + else: + return ("poor", "Investigate network issues") + + +def get_hosts_from_groups(group: str, groups_config: Dict[str, List[str]]) -> List[str]: + """ + Get list of hosts in a group. + + Args: + group: Group name + groups_config: Groups configuration dict + + Returns: + List of host names in group + + Example: + >>> groups = {'production': ['web-01', 'db-01']} + >>> get_hosts_from_groups('production', groups) + ['web-01', 'db-01'] + """ + return groups_config.get(group, []) + + +def get_groups_for_host(host: str, groups_config: Dict[str, List[str]]) -> List[str]: + """ + Get list of groups a host belongs to. + + Args: + host: Host name + groups_config: Groups configuration dict + + Returns: + List of group names + + Example: + >>> groups = {'production': ['web-01'], 'web': ['web-01', 'web-02']} + >>> get_groups_for_host('web-01', groups) + ['production', 'web'] + """ + return [group for group, hosts in groups_config.items() if host in hosts] + + +def run_command(command: str, timeout: int = 10) -> Tuple[bool, str, str]: + """ + Run shell command with timeout. + + Args: + command: Command to execute + timeout: Timeout in seconds + + Returns: + Tuple of (success, stdout, stderr) + + Example: + >>> success, stdout, stderr = run_command("echo hello") + >>> success + True + >>> stdout.strip() + "hello" + """ + try: + result = subprocess.run( + command, + shell=True, + capture_output=True, + text=True, + timeout=timeout + ) + + return ( + result.returncode == 0, + result.stdout, + result.stderr + ) + + except subprocess.TimeoutExpired: + return (False, "", f"Command timed out after {timeout}s") + except Exception as e: + return (False, "", str(e)) + + +def main(): + """Test helper functions.""" + print("Testing helper functions...\n") + + # Test formatting + print("1. Format bytes:") + print(f" 12582912 bytes = {format_bytes(12582912)}") + print(f" 1610612736 bytes = {format_bytes(1610612736)}") + + print("\n2. Format duration:") + print(f" 135 seconds = {format_duration(135)}") + print(f" 5430 seconds = {format_duration(5430)}") + + print("\n3. Format percentage:") + print(f" 45.567 = {format_percentage(45.567)}") + + print("\n4. Calculate load score:") + score = calculate_load_score(45, 60, 40) + print(f" CPU 45%, Mem 60%, Disk 40% = {score:.2f}") + print(f" Status: {classify_load_status(score)}") + + print("\n5. Classify latency:") + latencies = [25, 75, 150, 250] + for lat in latencies: + status, desc = classify_latency(lat) + print(f" {lat}ms: {status} - {desc}") + + print("\n6. 
Parse SSH config:") + ssh_hosts = parse_ssh_config() + print(f" Found {len(ssh_hosts)} hosts") + + print("\n7. Parse sshsync config:") + groups = parse_sshsync_config() + print(f" Found {len(groups)} groups") + for group, hosts in groups.items(): + print(f" - {group}: {len(hosts)} hosts") + + print("\n✅ All helpers tested successfully") + + +if __name__ == "__main__": + main() diff --git a/scripts/utils/validators/__init__.py b/scripts/utils/validators/__init__.py new file mode 100644 index 0000000..e9cad86 --- /dev/null +++ b/scripts/utils/validators/__init__.py @@ -0,0 +1,43 @@ +""" +Validators package for Tailscale SSH Sync Agent. +""" + +from .parameter_validator import ( + ValidationError, + validate_host, + validate_group, + validate_path_exists, + validate_timeout, + validate_command +) + +from .host_validator import ( + validate_ssh_config, + validate_host_reachable, + validate_group_members, + get_invalid_hosts +) + +from .connection_validator import ( + validate_ssh_connection, + validate_tailscale_connection, + validate_ssh_key, + get_connection_diagnostics +) + +__all__ = [ + 'ValidationError', + 'validate_host', + 'validate_group', + 'validate_path_exists', + 'validate_timeout', + 'validate_command', + 'validate_ssh_config', + 'validate_host_reachable', + 'validate_group_members', + 'get_invalid_hosts', + 'validate_ssh_connection', + 'validate_tailscale_connection', + 'validate_ssh_key', + 'get_connection_diagnostics', +] diff --git a/scripts/utils/validators/connection_validator.py b/scripts/utils/validators/connection_validator.py new file mode 100644 index 0000000..7aa89d4 --- /dev/null +++ b/scripts/utils/validators/connection_validator.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python3 +""" +Connection validators for Tailscale SSH Sync Agent. +Validates SSH and Tailscale connections. +""" + +import subprocess +from typing import Dict, Optional +import logging + +from .parameter_validator import ValidationError + +logger = logging.getLogger(__name__) + + +def validate_ssh_connection(host: str, timeout: int = 10) -> bool: + """ + Test SSH connection works. + + Args: + host: Host to connect to + timeout: Connection timeout in seconds + + Returns: + True if SSH connection successful + + Raises: + ValidationError: If connection fails + + Example: + >>> validate_ssh_connection("web-01") + True + """ + try: + # Try to execute a simple command via SSH + result = subprocess.run( + ["ssh", "-o", "ConnectTimeout={}".format(timeout), + "-o", "BatchMode=yes", + "-o", "StrictHostKeyChecking=no", + host, "echo", "test"], + capture_output=True, + text=True, + timeout=timeout + 5 + ) + + if result.returncode == 0: + return True + else: + # Parse error message + error_msg = result.stderr.strip() + + if "Permission denied" in error_msg: + raise ValidationError( + f"SSH authentication failed for '{host}'\n" + "Check:\n" + "1. SSH key is added: ssh-add -l\n" + "2. Public key is on remote: cat ~/.ssh/authorized_keys\n" + "3. User/key in SSH config is correct" + ) + elif "Connection refused" in error_msg: + raise ValidationError( + f"SSH connection refused for '{host}'\n" + "Check:\n" + "1. SSH server is running on remote\n" + "2. Port 22 is not blocked by firewall" + ) + elif "Connection timed out" in error_msg or "timeout" in error_msg.lower(): + raise ValidationError( + f"SSH connection timed out for '{host}'\n" + "Check:\n" + "1. Host is reachable (ping test)\n" + "2. Tailscale is connected\n" + "3. 
Network connectivity" + ) + else: + raise ValidationError( + f"SSH connection failed for '{host}': {error_msg}" + ) + + except subprocess.TimeoutExpired: + raise ValidationError( + f"SSH connection timed out for '{host}' (>{timeout}s)" + ) + except Exception as e: + raise ValidationError(f"Error testing SSH connection to '{host}': {e}") + + +def validate_tailscale_connection(host: str) -> bool: + """ + Test Tailscale connectivity to host. + + Args: + host: Host to check + + Returns: + True if Tailscale connection active + + Raises: + ValidationError: If Tailscale not connected + + Example: + >>> validate_tailscale_connection("web-01") + True + """ + try: + # Check if tailscale is running + result = subprocess.run( + ["tailscale", "status"], + capture_output=True, + text=True, + timeout=5 + ) + + if result.returncode != 0: + raise ValidationError( + "Tailscale is not running\n" + "Start Tailscale: sudo tailscale up" + ) + + # Check if specific host is in the network + if host in result.stdout or host.replace('-', '.') in result.stdout: + return True + else: + raise ValidationError( + f"Host '{host}' not found in Tailscale network\n" + "Ensure host is:\n" + "1. Connected to Tailscale\n" + "2. In the same tailnet\n" + "3. Not expired/offline" + ) + + except FileNotFoundError: + raise ValidationError( + "Tailscale not installed\n" + "Install: https://tailscale.com/download" + ) + except subprocess.TimeoutExpired: + raise ValidationError("Timeout checking Tailscale status") + except Exception as e: + raise ValidationError(f"Error checking Tailscale connection: {e}") + + +def validate_ssh_key(host: str) -> bool: + """ + Check SSH key authentication is working. + + Args: + host: Host to check + + Returns: + True if SSH key auth works + + Raises: + ValidationError: If key auth fails + + Example: + >>> validate_ssh_key("web-01") + True + """ + try: + # Test connection with explicit key-only auth + result = subprocess.run( + ["ssh", "-o", "BatchMode=yes", + "-o", "PasswordAuthentication=no", + "-o", "ConnectTimeout=5", + host, "echo", "test"], + capture_output=True, + text=True, + timeout=10 + ) + + if result.returncode == 0: + return True + else: + error_msg = result.stderr.strip() + + if "Permission denied" in error_msg: + raise ValidationError( + f"SSH key authentication failed for '{host}'\n" + "Fix:\n" + "1. Add your SSH key: ssh-add ~/.ssh/id_ed25519\n" + "2. Copy public key to remote: ssh-copy-id {}\n" + "3. Verify: ssh -v {} 2>&1 | grep -i 'offering public key'".format(host, host) + ) + else: + raise ValidationError( + f"SSH key validation failed for '{host}': {error_msg}" + ) + + except subprocess.TimeoutExpired: + raise ValidationError(f"Timeout validating SSH key for '{host}'") + except Exception as e: + raise ValidationError(f"Error validating SSH key for '{host}': {e}") + + +def get_connection_diagnostics(host: str) -> Dict[str, any]: + """ + Comprehensive connection testing. 
+ + Args: + host: Host to diagnose + + Returns: + Dict with diagnostic results: + { + 'ping': {'success': bool, 'message': str}, + 'ssh': {'success': bool, 'message': str}, + 'tailscale': {'success': bool, 'message': str}, + 'ssh_key': {'success': bool, 'message': str} + } + + Example: + >>> diag = get_connection_diagnostics("web-01") + >>> diag['ssh']['success'] + True + """ + diagnostics = {} + + # Test 1: Ping + try: + result = subprocess.run( + ["ping", "-c", "1", "-W", "2", host], + capture_output=True, + timeout=3 + ) + diagnostics['ping'] = { + 'success': result.returncode == 0, + 'message': 'Host is reachable' if result.returncode == 0 else 'Host not reachable' + } + except Exception as e: + diagnostics['ping'] = {'success': False, 'message': str(e)} + + # Test 2: SSH connection + try: + validate_ssh_connection(host, timeout=5) + diagnostics['ssh'] = {'success': True, 'message': 'SSH connection works'} + except ValidationError as e: + diagnostics['ssh'] = {'success': False, 'message': str(e).split('\n')[0]} + + # Test 3: Tailscale + try: + validate_tailscale_connection(host) + diagnostics['tailscale'] = {'success': True, 'message': 'Tailscale connected'} + except ValidationError as e: + diagnostics['tailscale'] = {'success': False, 'message': str(e).split('\n')[0]} + + # Test 4: SSH key + try: + validate_ssh_key(host) + diagnostics['ssh_key'] = {'success': True, 'message': 'SSH key authentication works'} + except ValidationError as e: + diagnostics['ssh_key'] = {'success': False, 'message': str(e).split('\n')[0]} + + return diagnostics + + +def main(): + """Test connection validators.""" + print("Testing connection validators...\n") + + print("1. Testing connection diagnostics:") + try: + diag = get_connection_diagnostics("localhost") + print(" Results:") + for test, result in diag.items(): + status = "✓" if result['success'] else "✗" + print(f" {status} {test}: {result['message']}") + except Exception as e: + print(f" Error: {e}") + + print("\n✅ Connection validators tested") + + +if __name__ == "__main__": + main() diff --git a/scripts/utils/validators/host_validator.py b/scripts/utils/validators/host_validator.py new file mode 100644 index 0000000..15bf12b --- /dev/null +++ b/scripts/utils/validators/host_validator.py @@ -0,0 +1,232 @@ +#!/usr/bin/env python3 +""" +Host validators for Tailscale SSH Sync Agent. +Validates host configuration and availability. +""" + +import subprocess +from typing import List, Dict, Optional +from pathlib import Path +import logging + +from .parameter_validator import ValidationError + +logger = logging.getLogger(__name__) + + +def validate_ssh_config(host: str, config_path: Optional[Path] = None) -> bool: + """ + Check if host has SSH config entry. 
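+
+    Note: matching is a substring test against each `Host` line, so an
+    alias that is contained in another (e.g. `web` inside `web-01`)
+    will also validate.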
+ + Args: + host: Host name to check + config_path: Path to SSH config (default: ~/.ssh/config) + + Returns: + True if host is in SSH config + + Raises: + ValidationError: If host not found in config + + Example: + >>> validate_ssh_config("web-01") + True + """ + if config_path is None: + config_path = Path.home() / '.ssh' / 'config' + + if not config_path.exists(): + raise ValidationError( + f"SSH config file not found: {config_path}\n" + "Create ~/.ssh/config with your host definitions" + ) + + # Parse SSH config for this host + host_found = False + + try: + with open(config_path, 'r') as f: + for line in f: + line = line.strip() + if line.lower().startswith('host ') and host in line: + host_found = True + break + + if not host_found: + raise ValidationError( + f"Host '{host}' not found in SSH config: {config_path}\n" + "Add host to SSH config:\n" + f"Host {host}\n" + f" HostName \n" + f" User " + ) + + return True + + except IOError as e: + raise ValidationError(f"Error reading SSH config: {e}") + + +def validate_host_reachable(host: str, timeout: int = 5) -> bool: + """ + Check if host is reachable via ping. + + Args: + host: Host name to check + timeout: Timeout in seconds + + Returns: + True if host is reachable + + Raises: + ValidationError: If host is not reachable + + Example: + >>> validate_host_reachable("web-01", timeout=5) + True + """ + try: + # Try to resolve via SSH config first + result = subprocess.run( + ["ssh", "-G", host], + capture_output=True, + text=True, + timeout=2 + ) + + if result.returncode == 0: + # Extract hostname from SSH config + for line in result.stdout.split('\n'): + if line.startswith('hostname '): + actual_host = line.split()[1] + break + else: + actual_host = host + else: + actual_host = host + + # Ping the host + ping_result = subprocess.run( + ["ping", "-c", "1", "-W", str(timeout), actual_host], + capture_output=True, + text=True, + timeout=timeout + 1 + ) + + if ping_result.returncode == 0: + return True + else: + raise ValidationError( + f"Host '{host}' ({actual_host}) is not reachable\n" + "Check:\n" + "1. Host is powered on\n" + "2. Tailscale is connected\n" + "3. Network connectivity" + ) + + except subprocess.TimeoutExpired: + raise ValidationError(f"Timeout checking host '{host}' (>{timeout}s)") + except Exception as e: + raise ValidationError(f"Error checking host '{host}': {e}") + + +def validate_group_members(group: str, groups_config: Dict[str, List[str]]) -> List[str]: + """ + Ensure group has valid members. + + Args: + group: Group name + groups_config: Groups configuration dict + + Returns: + List of valid hosts in group + + Raises: + ValidationError: If group is empty or has no valid members + + Example: + >>> groups = {'production': ['web-01', 'db-01']} + >>> validate_group_members('production', groups) + ['web-01', 'db-01'] + """ + if group not in groups_config: + raise ValidationError( + f"Group '{group}' not found in configuration\n" + f"Available groups: {', '.join(groups_config.keys())}" + ) + + members = groups_config[group] + + if not members: + raise ValidationError( + f"Group '{group}' has no members\n" + f"Add hosts to group with: sshsync gadd {group}" + ) + + if not isinstance(members, list): + raise ValidationError( + f"Invalid group configuration for '{group}': members must be a list" + ) + + return members + + +def get_invalid_hosts(hosts: List[str], config_path: Optional[Path] = None) -> List[str]: + """ + Find hosts without valid SSH config. 
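+
+    Wildcard entries (`*`, `?`) in the SSH config are skipped when
+    building the valid-host set, and every input host counts as invalid
+    when no config file exists at all.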
+ + Args: + hosts: List of host names + config_path: Path to SSH config + + Returns: + List of hosts without valid config + + Example: + >>> get_invalid_hosts(["web-01", "nonexistent"]) + ["nonexistent"] + """ + if config_path is None: + config_path = Path.home() / '.ssh' / 'config' + + if not config_path.exists(): + return hosts # All invalid if no config + + # Parse SSH config + valid_hosts = set() + try: + with open(config_path, 'r') as f: + for line in f: + line = line.strip() + if line.lower().startswith('host '): + host_alias = line.split(maxsplit=1)[1] + if '*' not in host_alias and '?' not in host_alias: + valid_hosts.add(host_alias) + except IOError: + return hosts + + # Find invalid hosts + return [h for h in hosts if h not in valid_hosts] + + +def main(): + """Test host validators.""" + print("Testing host validators...\n") + + print("1. Testing validate_ssh_config():") + try: + validate_ssh_config("localhost") + print(" ✓ localhost has SSH config") + except ValidationError as e: + print(f" Note: {e.args[0].split(chr(10))[0]}") + + print("\n2. Testing get_invalid_hosts():") + test_hosts = ["localhost", "nonexistent-host-12345"] + invalid = get_invalid_hosts(test_hosts) + print(f" Invalid hosts: {invalid}") + + print("\n✅ Host validators tested") + + +if __name__ == "__main__": + main() diff --git a/scripts/utils/validators/parameter_validator.py b/scripts/utils/validators/parameter_validator.py new file mode 100644 index 0000000..b9cfd9f --- /dev/null +++ b/scripts/utils/validators/parameter_validator.py @@ -0,0 +1,363 @@ +#!/usr/bin/env python3 +""" +Parameter validators for Tailscale SSH Sync Agent. +Validates user inputs before making operations. +""" + +from typing import List, Optional +from pathlib import Path +import re +import logging + +logger = logging.getLogger(__name__) + + +class ValidationError(Exception): + """Raised when validation fails.""" + pass + + +def validate_host(host: str, valid_hosts: Optional[List[str]] = None) -> str: + """ + Validate host parameter. + + Args: + host: Host name or alias + valid_hosts: List of valid hosts (None to skip check) + + Returns: + str: Validated and normalized host name + + Raises: + ValidationError: If host is invalid + + Example: + >>> validate_host("web-01") + "web-01" + >>> validate_host("web-01", ["web-01", "web-02"]) + "web-01" + """ + if not host: + raise ValidationError("Host cannot be empty") + + if not isinstance(host, str): + raise ValidationError(f"Host must be string, got {type(host)}") + + # Normalize (strip whitespace, lowercase for comparison) + host = host.strip() + + # Basic validation: alphanumeric, dash, underscore, dot + if not re.match(r'^[a-zA-Z0-9._-]+$', host): + raise ValidationError( + f"Invalid host name format: {host}\n" + "Host names must contain only letters, numbers, dots, dashes, and underscores" + ) + + # Check if valid (if list provided) + if valid_hosts: + # Try exact match first + if host in valid_hosts: + return host + + # Try case-insensitive match + for valid_host in valid_hosts: + if host.lower() == valid_host.lower(): + return valid_host + + # Not found - provide suggestions + suggestions = [h for h in valid_hosts if host[:3].lower() in h.lower()] + raise ValidationError( + f"Invalid host: {host}\n" + f"Valid options: {', '.join(valid_hosts[:10])}\n" + + (f"Did you mean: {', '.join(suggestions[:3])}?" if suggestions else "") + ) + + return host + + +def validate_group(group: str, valid_groups: Optional[List[str]] = None) -> str: + """ + Validate group parameter. 
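+
+    The name is lowercased before matching, so configured group names
+    are expected to be lowercase (the format check below enforces this).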
+ + Args: + group: Group name + valid_groups: List of valid groups (None to skip check) + + Returns: + str: Validated group name + + Raises: + ValidationError: If group is invalid + + Example: + >>> validate_group("production") + "production" + >>> validate_group("prod", ["production", "development"]) + ValidationError: Invalid group: prod + """ + if not group: + raise ValidationError("Group cannot be empty") + + if not isinstance(group, str): + raise ValidationError(f"Group must be string, got {type(group)}") + + # Normalize + group = group.strip().lower() + + # Basic validation + if not re.match(r'^[a-z0-9_-]+$', group): + raise ValidationError( + f"Invalid group name format: {group}\n" + "Group names must contain only lowercase letters, numbers, dashes, and underscores" + ) + + # Check if valid (if list provided) + if valid_groups: + if group not in valid_groups: + suggestions = [g for g in valid_groups if group[:3] in g] + raise ValidationError( + f"Invalid group: {group}\n" + f"Valid groups: {', '.join(valid_groups)}\n" + + (f"Did you mean: {', '.join(suggestions[:3])}?" if suggestions else "") + ) + + return group + + +def validate_path_exists(path: str, must_be_file: bool = False, + must_be_dir: bool = False) -> Path: + """ + Validate path exists and is accessible. + + Args: + path: Path to validate + must_be_file: If True, path must be a file + must_be_dir: If True, path must be a directory + + Returns: + Path: Validated Path object + + Raises: + ValidationError: If path is invalid + + Example: + >>> validate_path_exists("/tmp", must_be_dir=True) + Path('/tmp') + >>> validate_path_exists("/nonexistent") + ValidationError: Path does not exist: /nonexistent + """ + if not path: + raise ValidationError("Path cannot be empty") + + p = Path(path).expanduser().resolve() + + if not p.exists(): + raise ValidationError( + f"Path does not exist: {path}\n" + f"Resolved to: {p}" + ) + + if must_be_file and not p.is_file(): + raise ValidationError(f"Path must be a file: {path}") + + if must_be_dir and not p.is_dir(): + raise ValidationError(f"Path must be a directory: {path}") + + return p + + +def validate_timeout(timeout: int, min_timeout: int = 1, + max_timeout: int = 600) -> int: + """ + Validate timeout parameter. + + Args: + timeout: Timeout in seconds + min_timeout: Minimum allowed timeout + max_timeout: Maximum allowed timeout + + Returns: + int: Validated timeout + + Raises: + ValidationError: If timeout is invalid + + Example: + >>> validate_timeout(10) + 10 + >>> validate_timeout(0) + ValidationError: Timeout must be between 1 and 600 seconds + """ + if not isinstance(timeout, int): + raise ValidationError(f"Timeout must be integer, got {type(timeout)}") + + if timeout < min_timeout: + raise ValidationError( + f"Timeout too low: {timeout}s (minimum: {min_timeout}s)" + ) + + if timeout > max_timeout: + raise ValidationError( + f"Timeout too high: {timeout}s (maximum: {max_timeout}s)" + ) + + return timeout + + +def validate_command(command: str, allow_dangerous: bool = False) -> str: + """ + Basic command safety validation. 
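+
+    Patterns blocked by default: `rm -rf` on the root directory,
+    `mkfs.*`, `dd ... of=/dev/...`, the classic fork bomb, and
+    redirection straight onto /dev/sd* devices; pass
+    allow_dangerous=True to bypass the check.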
+ + Args: + command: Command to validate + allow_dangerous: If False, block potentially dangerous commands + + Returns: + str: Validated command + + Raises: + ValidationError: If command is invalid or dangerous + + Example: + >>> validate_command("ls -la") + "ls -la" + >>> validate_command("rm -rf /", allow_dangerous=False) + ValidationError: Potentially dangerous command blocked: rm -rf + """ + if not command: + raise ValidationError("Command cannot be empty") + + if not isinstance(command, str): + raise ValidationError(f"Command must be string, got {type(command)}") + + command = command.strip() + + if not allow_dangerous: + # Check for dangerous patterns + dangerous_patterns = [ + (r'\brm\s+-rf\s+/', "rm -rf on root directory"), + (r'\bmkfs\.', "filesystem formatting"), + (r'\bdd\s+.*of=/dev/', "disk writing with dd"), + (r':(){:|:&};:', "fork bomb"), + (r'>\s*/dev/sd[a-z]', "direct disk writing"), + ] + + for pattern, description in dangerous_patterns: + if re.search(pattern, command, re.IGNORECASE): + raise ValidationError( + f"Potentially dangerous command blocked: {description}\n" + f"Command: {command}\n" + "Use allow_dangerous=True if you really want to execute this" + ) + + return command + + +def validate_hosts_list(hosts: List[str], valid_hosts: Optional[List[str]] = None) -> List[str]: + """ + Validate a list of hosts. + + Args: + hosts: List of host names + valid_hosts: List of valid hosts (None to skip check) + + Returns: + List[str]: Validated host names + + Raises: + ValidationError: If any host is invalid + + Example: + >>> validate_hosts_list(["web-01", "web-02"]) + ["web-01", "web-02"] + """ + if not hosts: + raise ValidationError("Hosts list cannot be empty") + + if not isinstance(hosts, list): + raise ValidationError(f"Hosts must be list, got {type(hosts)}") + + validated = [] + errors = [] + + for host in hosts: + try: + validated.append(validate_host(host, valid_hosts)) + except ValidationError as e: + errors.append(str(e)) + + if errors: + raise ValidationError( + f"Invalid hosts in list:\n" + "\n".join(errors) + ) + + return validated + + +def main(): + """Test validators.""" + print("Testing parameter validators...\n") + + # Test host validation + print("1. Testing validate_host():") + try: + host = validate_host("web-01", ["web-01", "web-02", "db-01"]) + print(f" ✓ Valid host: {host}") + except ValidationError as e: + print(f" ✗ Error: {e}") + + try: + host = validate_host("invalid-host", ["web-01", "web-02"]) + print(f" ✗ Should have failed!") + except ValidationError as e: + print(f" ✓ Correctly rejected: {e.args[0].split(chr(10))[0]}") + + # Test group validation + print("\n2. Testing validate_group():") + try: + group = validate_group("production", ["production", "development"]) + print(f" ✓ Valid group: {group}") + except ValidationError as e: + print(f" ✗ Error: {e}") + + # Test path validation + print("\n3. Testing validate_path_exists():") + try: + path = validate_path_exists("/tmp", must_be_dir=True) + print(f" ✓ Valid path: {path}") + except ValidationError as e: + print(f" ✗ Error: {e}") + + # Test timeout validation + print("\n4. Testing validate_timeout():") + try: + timeout = validate_timeout(10) + print(f" ✓ Valid timeout: {timeout}s") + except ValidationError as e: + print(f" ✗ Error: {e}") + + try: + timeout = validate_timeout(0) + print(f" ✗ Should have failed!") + except ValidationError as e: + print(f" ✓ Correctly rejected: {e.args[0].split(chr(10))[0]}") + + # Test command validation + print("\n5. 
Testing validate_command():") + try: + cmd = validate_command("ls -la") + print(f" ✓ Safe command: {cmd}") + except ValidationError as e: + print(f" ✗ Error: {e}") + + try: + cmd = validate_command("rm -rf /", allow_dangerous=False) + print(f" ✗ Should have failed!") + except ValidationError as e: + print(f" ✓ Correctly blocked: {e.args[0].split(chr(10))[0]}") + + print("\n✅ All parameter validators tested") + + +if __name__ == "__main__": + main() diff --git a/scripts/workflow_executor.py b/scripts/workflow_executor.py new file mode 100644 index 0000000..1416acb --- /dev/null +++ b/scripts/workflow_executor.py @@ -0,0 +1,445 @@ +#!/usr/bin/env python3 +""" +Workflow executor for Tailscale SSH Sync Agent. +Common multi-machine workflow automation. +""" + +import sys +from pathlib import Path +from typing import Dict, List, Optional +import time +import logging + +# Add utils to path +sys.path.insert(0, str(Path(__file__).parent)) + +from utils.helpers import format_duration, get_timestamp +from sshsync_wrapper import execute_on_group, execute_on_host, push_to_hosts +from load_balancer import get_group_capacity + +logger = logging.getLogger(__name__) + + +def deploy_workflow(code_path: str, + staging_group: str, + prod_group: str, + run_tests: bool = True) -> Dict: + """ + Full deployment pipeline: staging → test → production. + + Args: + code_path: Path to code to deploy + staging_group: Staging server group + prod_group: Production server group + run_tests: Whether to run tests on staging + + Returns: + Dict with deployment results + + Example: + >>> result = deploy_workflow("./dist", "staging", "production") + >>> result['success'] + True + >>> result['duration'] + "12m 45s" + """ + start_time = time.time() + results = { + 'stages': {}, + 'success': False, + 'start_time': get_timestamp() + } + + try: + # Stage 1: Deploy to staging + logger.info("Stage 1: Deploying to staging...") + stage1 = push_to_hosts( + local_path=code_path, + remote_path="/var/www/app", + group=staging_group, + recurse=True + ) + + results['stages']['staging_deploy'] = stage1 + + if not stage1.get('success'): + results['error'] = 'Staging deployment failed' + return results + + # Build on staging + logger.info("Building on staging...") + build_result = execute_on_group( + staging_group, + "cd /var/www/app && npm run build", + timeout=300 + ) + + results['stages']['staging_build'] = build_result + + if not build_result.get('success'): + results['error'] = 'Staging build failed' + return results + + # Stage 2: Run tests (if enabled) + if run_tests: + logger.info("Stage 2: Running tests...") + test_result = execute_on_group( + staging_group, + "cd /var/www/app && npm test", + timeout=600 + ) + + results['stages']['tests'] = test_result + + if not test_result.get('success'): + results['error'] = 'Tests failed on staging' + return results + + # Stage 3: Validation + logger.info("Stage 3: Validating staging...") + health_result = execute_on_group( + staging_group, + "curl -f http://localhost:3000/health || echo 'Health check failed'", + timeout=10 + ) + + results['stages']['staging_validation'] = health_result + + # Stage 4: Deploy to production + logger.info("Stage 4: Deploying to production...") + prod_deploy = push_to_hosts( + local_path=code_path, + remote_path="/var/www/app", + group=prod_group, + recurse=True + ) + + results['stages']['production_deploy'] = prod_deploy + + if not prod_deploy.get('success'): + results['error'] = 'Production deployment failed' + return results + + # Build and restart on production + 
logger.info("Building and restarting production...") + prod_build = execute_on_group( + prod_group, + "cd /var/www/app && npm run build && pm2 restart app", + timeout=300 + ) + + results['stages']['production_build'] = prod_build + + # Stage 5: Production verification + logger.info("Stage 5: Verifying production...") + prod_health = execute_on_group( + prod_group, + "curl -f http://localhost:3000/health", + timeout=15 + ) + + results['stages']['production_verification'] = prod_health + + # Success! + results['success'] = True + results['duration'] = format_duration(time.time() - start_time) + + return results + + except Exception as e: + logger.error(f"Deployment workflow error: {e}") + results['error'] = str(e) + results['duration'] = format_duration(time.time() - start_time) + return results + + +def backup_workflow(hosts: List[str], + backup_paths: List[str], + destination: str) -> Dict: + """ + Backup files from multiple hosts. + + Args: + hosts: List of hosts to backup from + backup_paths: Paths to backup on each host + destination: Local destination directory + + Returns: + Dict with backup results + + Example: + >>> result = backup_workflow( + ... ["db-01", "db-02"], + ... ["/var/lib/mysql"], + ... "./backups" + ... ) + >>> result['backed_up_hosts'] + 2 + """ + from sshsync_wrapper import pull_from_host + + start_time = time.time() + results = { + 'hosts': {}, + 'success': True, + 'backed_up_hosts': 0 + } + + for host in hosts: + host_results = [] + + for backup_path in backup_paths: + # Create timestamped backup directory + timestamp = time.strftime("%Y%m%d_%H%M%S") + host_dest = f"{destination}/{host}_{timestamp}" + + result = pull_from_host( + host=host, + remote_path=backup_path, + local_path=host_dest, + recurse=True + ) + + host_results.append(result) + + if not result.get('success'): + results['success'] = False + + results['hosts'][host] = host_results + + if all(r.get('success') for r in host_results): + results['backed_up_hosts'] += 1 + + results['duration'] = format_duration(time.time() - start_time) + + return results + + +def sync_workflow(source_host: str, + target_group: str, + paths: List[str]) -> Dict: + """ + Sync files from one host to many. + + Args: + source_host: Host to pull from + target_group: Group to push to + paths: Paths to sync + + Returns: + Dict with sync results + + Example: + >>> result = sync_workflow( + ... "master-db", + ... "replica-dbs", + ... ["/var/lib/mysql/config"] + ... 
) + >>> result['success'] + True + """ + from sshsync_wrapper import pull_from_host, push_to_hosts + import tempfile + import shutil + + start_time = time.time() + results = {'paths': {}, 'success': True} + + # Create temp directory + with tempfile.TemporaryDirectory() as temp_dir: + for path in paths: + # Pull from source + pull_result = pull_from_host( + host=source_host, + remote_path=path, + local_path=f"{temp_dir}/{Path(path).name}", + recurse=True + ) + + if not pull_result.get('success'): + results['paths'][path] = { + 'success': False, + 'error': 'Pull from source failed' + } + results['success'] = False + continue + + # Push to targets + push_result = push_to_hosts( + local_path=f"{temp_dir}/{Path(path).name}", + remote_path=path, + group=target_group, + recurse=True + ) + + results['paths'][path] = { + 'pull': pull_result, + 'push': push_result, + 'success': push_result.get('success', False) + } + + if not push_result.get('success'): + results['success'] = False + + results['duration'] = format_duration(time.time() - start_time) + + return results + + +def rolling_restart(group: str, + service_name: str, + wait_between: int = 30) -> Dict: + """ + Zero-downtime rolling restart of a service across group. + + Args: + group: Group to restart + service_name: Service name (e.g., "nginx", "app") + wait_between: Seconds to wait between restarts + + Returns: + Dict with restart results + + Example: + >>> result = rolling_restart("web-servers", "nginx") + >>> result['restarted_count'] + 3 + """ + from utils.helpers import parse_sshsync_config + + start_time = time.time() + groups_config = parse_sshsync_config() + hosts = groups_config.get(group, []) + + if not hosts: + return { + 'success': False, + 'error': f'Group {group} not found or empty' + } + + results = { + 'hosts': {}, + 'restarted_count': 0, + 'failed_count': 0, + 'success': True + } + + for host in hosts: + logger.info(f"Restarting {service_name} on {host}...") + + # Restart service + restart_result = execute_on_host( + host, + f"sudo systemctl restart {service_name} || sudo service {service_name} restart", + timeout=30 + ) + + # Health check + time.sleep(5) # Wait for service to start + + health_result = execute_on_host( + host, + f"sudo systemctl is-active {service_name} || sudo service {service_name} status", + timeout=10 + ) + + success = restart_result.get('success') and health_result.get('success') + + results['hosts'][host] = { + 'restart': restart_result, + 'health': health_result, + 'success': success + } + + if success: + results['restarted_count'] += 1 + logger.info(f"✓ {host} restarted successfully") + else: + results['failed_count'] += 1 + results['success'] = False + logger.error(f"✗ {host} restart failed") + + # Wait before next restart (except last) + if host != hosts[-1]: + time.sleep(wait_between) + + results['duration'] = format_duration(time.time() - start_time) + + return results + + +def health_check_workflow(group: str, + endpoint: str = "/health", + timeout: int = 10) -> Dict: + """ + Check health endpoint across group. 
+ + Args: + group: Group to check + endpoint: Health endpoint path + timeout: Request timeout + + Returns: + Dict with health check results + + Example: + >>> result = health_check_workflow("production", "/health") + >>> result['healthy_count'] + 3 + """ + from utils.helpers import parse_sshsync_config + + groups_config = parse_sshsync_config() + hosts = groups_config.get(group, []) + + if not hosts: + return { + 'success': False, + 'error': f'Group {group} not found or empty' + } + + results = { + 'hosts': {}, + 'healthy_count': 0, + 'unhealthy_count': 0 + } + + for host in hosts: + health_result = execute_on_host( + host, + f"curl -f -s -o /dev/null -w '%{{http_code}}' http://localhost:3000{endpoint}", + timeout=timeout + ) + + is_healthy = ( + health_result.get('success') and + '200' in health_result.get('stdout', '') + ) + + results['hosts'][host] = { + 'healthy': is_healthy, + 'response': health_result.get('stdout', '').strip() + } + + if is_healthy: + results['healthy_count'] += 1 + else: + results['unhealthy_count'] += 1 + + results['success'] = results['unhealthy_count'] == 0 + + return results + + +def main(): + """Test workflow executor functions.""" + print("Testing workflow executor...\n") + + print("Note: Workflow executor requires configured hosts and groups.") + print("Tests would execute real operations, so showing dry-run simulations.\n") + + print("✅ Workflow executor ready") + + +if __name__ == "__main__": + main() diff --git a/tests/test_helpers.py b/tests/test_helpers.py new file mode 100644 index 0000000..d50226e --- /dev/null +++ b/tests/test_helpers.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 +""" +Tests for helper utilities. +""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / 'scripts')) + +from utils.helpers import * + + +def test_format_bytes(): + """Test byte formatting.""" + assert format_bytes(0) == "0.0 B" + assert format_bytes(512) == "512.0 B" + assert format_bytes(1024) == "1.0 KB" + assert format_bytes(1048576) == "1.0 MB" + assert format_bytes(1073741824) == "1.0 GB" + print("✓ format_bytes() passed") + return True + + +def test_format_duration(): + """Test duration formatting.""" + assert format_duration(30) == "30s" + assert format_duration(65) == "1m 5s" + assert format_duration(3600) == "1h" + assert format_duration(3665) == "1h 1m" + assert format_duration(7265) == "2h 1m" + print("✓ format_duration() passed") + return True + + +def test_format_percentage(): + """Test percentage formatting.""" + assert format_percentage(45.567) == "45.6%" + assert format_percentage(100) == "100.0%" + assert format_percentage(0.123, decimals=2) == "0.12%" + print("✓ format_percentage() passed") + return True + + +def test_calculate_load_score(): + """Test load score calculation.""" + score = calculate_load_score(50, 50, 50) + assert 0 <= score <= 1 + assert abs(score - 0.5) < 0.01 + + score_low = calculate_load_score(20, 30, 25) + score_high = calculate_load_score(80, 85, 90) + assert score_low < score_high + + print("✓ calculate_load_score() passed") + return True + + +def test_classify_load_status(): + """Test load status classification.""" + assert classify_load_status(0.2) == "low" + assert classify_load_status(0.5) == "moderate" + assert classify_load_status(0.8) == "high" + print("✓ classify_load_status() passed") + return True + + +def test_classify_latency(): + """Test latency classification.""" + status, desc = classify_latency(25) + assert status == "excellent" + assert "interactive" in desc.lower() + + 
status, desc = classify_latency(150)
+    assert status == "fair"
+
+    print("✓ classify_latency() passed")
+    return True
+
+
+def test_parse_disk_usage():
+    """Test disk usage parsing."""
+    sample_output = """Filesystem      Size  Used Avail Use% Mounted on
+/dev/sda1       100G   45G   50G  45% /"""
+
+    result = parse_disk_usage(sample_output)
+    assert result['filesystem'] == '/dev/sda1'
+    assert result['size'] == '100G'
+    assert result['used'] == '45G'
+    assert result['use_pct'] == 45
+
+    print("✓ parse_disk_usage() passed")
+    return True
+
+
+def test_parse_cpu_load():
+    """Test CPU load parsing."""
+    sample_output = "19:43:41 up 5 days, 2:15, 3 users, load average: 0.45, 0.38, 0.32"
+
+    result = parse_cpu_load(sample_output)
+    assert result['load_1min'] == 0.45
+    assert result['load_5min'] == 0.38
+    assert result['load_15min'] == 0.32
+
+    print("✓ parse_cpu_load() passed")
+    return True
+
+
+def test_get_timestamp():
+    """Test timestamp generation."""
+    ts_iso = get_timestamp(iso=True)
+    assert 'T' in ts_iso
+    assert 'Z' in ts_iso
+
+    ts_human = get_timestamp(iso=False)
+    assert ' ' in ts_human
+    assert len(ts_human) == 19  # YYYY-MM-DD HH:MM:SS
+
+    print("✓ get_timestamp() passed")
+    return True
+
+
+def test_validate_path():
+    """Test path validation."""
+    assert validate_path("/tmp", must_exist=True) == True
+    assert validate_path("/nonexistent_path_12345", must_exist=True) == False
+
+    print("✓ validate_path() passed")
+    return True
+
+
+def test_safe_execute():
+    """Test safe execution wrapper."""
+    # Should return result on success
+    result = safe_execute(int, "42")
+    assert result == 42
+
+    # Should return default on failure
+    result = safe_execute(int, "not_a_number", default=0)
+    assert result == 0
+
+    print("✓ safe_execute() passed")
+    return True
+
+
+def main():
+    """Run all helper tests."""
+    print("=" * 70)
+    print("HELPER TESTS")
+    print("=" * 70)
+
+    tests = [
+        test_format_bytes,
+        test_format_duration,
+        test_format_percentage,
+        test_calculate_load_score,
+        test_classify_load_status,
+        test_classify_latency,
+        test_parse_disk_usage,
+        test_parse_cpu_load,
+        test_get_timestamp,
+        test_validate_path,
+        test_safe_execute,
+    ]
+
+    passed = 0
+    for test in tests:
+        try:
+            if test():
+                passed += 1
+        except Exception as e:
+            print(f"✗ {test.__name__} failed: {e}")
+
+    print(f"\nResults: {passed}/{len(tests)} passed")
+    return passed == len(tests)
+
+
+if __name__ == "__main__":
+    success = main()
+    sys.exit(0 if success else 1)
diff --git a/tests/test_integration.py b/tests/test_integration.py
new file mode 100644
index 0000000..20b1a9a
--- /dev/null
+++ b/tests/test_integration.py
@@ -0,0 +1,346 @@
+#!/usr/bin/env python3
+"""
+Integration tests for Tailscale SSH Sync Agent.
+Tests complete workflows from query to result.
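+
+Designed to run on a developer machine: the structural assertions still
+pass when sshsync or Tailscale is not configured (empty host lists and
+"not connected" states are treated as valid results).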
+""" + +import sys +from pathlib import Path + +# Add scripts to path +sys.path.insert(0, str(Path(__file__).parent.parent / 'scripts')) + +from sshsync_wrapper import get_host_status, list_hosts, get_groups +from tailscale_manager import get_tailscale_status, get_network_summary +from load_balancer import format_load_report, MachineMetrics +from utils.helpers import ( + format_bytes, format_duration, format_percentage, + calculate_load_score, classify_load_status, classify_latency +) + + +def test_host_status_basic(): + """Test get_host_status() without errors.""" + print("\n✓ Testing get_host_status()...") + + try: + result = get_host_status() + + # Validations + assert 'hosts' in result, "Missing 'hosts' in result" + assert isinstance(result.get('hosts', []), list), "'hosts' must be list" + + # Should have basic counts even if no hosts configured + assert 'total_count' in result, "Missing 'total_count'" + assert 'online_count' in result, "Missing 'online_count'" + assert 'offline_count' in result, "Missing 'offline_count'" + + print(f" ✓ Found {result.get('total_count', 0)} hosts") + print(f" ✓ Online: {result.get('online_count', 0)}") + print(f" ✓ Offline: {result.get('offline_count', 0)}") + + return True + + except Exception as e: + print(f" ✗ FAILED: {e}") + import traceback + traceback.print_exc() + return False + + +def test_list_hosts(): + """Test list_hosts() function.""" + print("\n✓ Testing list_hosts()...") + + try: + result = list_hosts(with_status=False) + + assert 'hosts' in result, "Missing 'hosts' in result" + assert 'count' in result, "Missing 'count' in result" + assert isinstance(result['hosts'], list), "'hosts' must be list" + + print(f" ✓ List hosts working") + print(f" ✓ Found {result['count']} configured hosts") + + return True + + except Exception as e: + print(f" ✗ FAILED: {e}") + return False + + +def test_get_groups(): + """Test get_groups() function.""" + print("\n✓ Testing get_groups()...") + + try: + groups = get_groups() + + assert isinstance(groups, dict), "Groups must be dict" + + print(f" ✓ Groups config loaded") + print(f" ✓ Found {len(groups)} groups") + + for group, hosts in list(groups.items())[:3]: # Show first 3 + print(f" - {group}: {len(hosts)} hosts") + + return True + + except Exception as e: + print(f" ✗ FAILED: {e}") + return False + + +def test_tailscale_status(): + """Test Tailscale status check.""" + print("\n✓ Testing get_tailscale_status()...") + + try: + status = get_tailscale_status() + + assert isinstance(status, dict), "Status must be dict" + assert 'connected' in status, "Missing 'connected' field" + + if status.get('connected'): + print(f" ✓ Tailscale connected") + print(f" ✓ Peers: {status.get('total_count', 0)} total, {status.get('online_count', 0)} online") + else: + print(f" ℹ Tailscale not connected: {status.get('error', 'Unknown')}") + print(f" (This is OK if Tailscale is not installed/configured)") + + return True + + except Exception as e: + print(f" ✗ FAILED: {e}") + return False + + +def test_network_summary(): + """Test network summary generation.""" + print("\n✓ Testing get_network_summary()...") + + try: + summary = get_network_summary() + + assert isinstance(summary, str), "Summary must be string" + assert len(summary) > 0, "Summary cannot be empty" + + print(f" ✓ Network summary generated:") + for line in summary.split('\n'): + print(f" {line}") + + return True + + except Exception as e: + print(f" ✗ FAILED: {e}") + return False + + +def test_format_helpers(): + """Test formatting helper functions.""" + print("\n✓ 
+
+    try:
+        # Test format_bytes
+        assert format_bytes(1024) == "1.0 KB", "format_bytes failed for 1024"
+        assert format_bytes(12582912) == "12.0 MB", "format_bytes failed for 12MB"
+
+        # Test format_duration
+        assert format_duration(65) == "1m 5s", "format_duration failed for 65s"
+        assert format_duration(3665) == "1h 1m", "format_duration failed for 1h+"
+
+        # Test format_percentage
+        assert format_percentage(45.567) == "45.6%", "format_percentage failed"
+
+        print(f" ✓ format_bytes(12582912) = {format_bytes(12582912)}")
+        print(f" ✓ format_duration(3665) = {format_duration(3665)}")
+        print(f" ✓ format_percentage(45.567) = {format_percentage(45.567)}")
+
+        return True
+
+    except Exception as e:
+        print(f" ✗ FAILED: {e}")
+        return False
+
+
+def test_load_score_calculation():
+    """Test load score calculation."""
+    print("\n✓ Testing calculate_load_score()...")
+
+    try:
+        # Test various scenarios
+        score1 = calculate_load_score(45, 60, 40)
+        assert 0 <= score1 <= 1, "Score must be 0-1"
+        assert abs(score1 - 0.49) < 0.01, f"Expected ~0.49, got {score1}"
+
+        score2 = calculate_load_score(20, 35, 30)
+        assert score2 < score1, "Lower usage should have lower score"
+
+        score3 = calculate_load_score(85, 70, 65)
+        assert score3 > score1, "Higher usage should have higher score"
+
+        print(f" ✓ Low load (20%, 35%, 30%): {score2:.2f}")
+        print(f" ✓ Med load (45%, 60%, 40%): {score1:.2f}")
+        print(f" ✓ High load (85%, 70%, 65%): {score3:.2f}")
+
+        return True
+
+    except Exception as e:
+        print(f" ✗ FAILED: {e}")
+        return False
+
+
+def test_load_classification():
+    """Test load status classification."""
+    print("\n✓ Testing classify_load_status()...")
+
+    try:
+        assert classify_load_status(0.28) == "low", "0.28 should be 'low'"
+        assert classify_load_status(0.55) == "moderate", "0.55 should be 'moderate'"
+        assert classify_load_status(0.82) == "high", "0.82 should be 'high'"
+
+        print(f" ✓ Score 0.28 = {classify_load_status(0.28)}")
+        print(f" ✓ Score 0.55 = {classify_load_status(0.55)}")
+        print(f" ✓ Score 0.82 = {classify_load_status(0.82)}")
+
+        return True
+
+    except Exception as e:
+        print(f" ✗ FAILED: {e}")
+        return False
+
+
+def test_latency_classification():
+    """Test network latency classification."""
+    print("\n✓ Testing classify_latency()...")
+
+    try:
+        status1, desc1 = classify_latency(25)
+        assert status1 == "excellent", "25ms should be 'excellent'"
+
+        status2, desc2 = classify_latency(75)
+        assert status2 == "good", "75ms should be 'good'"
+
+        status3, desc3 = classify_latency(150)
+        assert status3 == "fair", "150ms should be 'fair'"
+
+        status4, desc4 = classify_latency(250)
+        assert status4 == "poor", "250ms should be 'poor'"
+
+        print(f" ✓ 25ms: {status1} - {desc1}")
+        print(f" ✓ 75ms: {status2} - {desc2}")
+        print(f" ✓ 150ms: {status3} - {desc3}")
+        print(f" ✓ 250ms: {status4} - {desc4}")
+
+        return True
+
+    except Exception as e:
+        print(f" ✗ FAILED: {e}")
+        return False
+
+
+def test_load_report_formatting():
+    """Test load report formatting."""
+    print("\n✓ Testing format_load_report()...")
+
+    try:
+        metrics = MachineMetrics(
+            host='web-01',
+            cpu_pct=45.0,
+            mem_pct=60.0,
+            disk_pct=40.0,
+            load_score=0.49,
+            status='moderate'
+        )
+
+        report = format_load_report(metrics)
+
+        assert 'web-01' in report, "Report must include hostname"
+        assert '0.49' in report, "Report must include load score"
+        assert 'moderate' in report, "Report must include status"
+
+        print(" ✓ Report generated:")
+        for line in report.split('\n'):
+            print(f" {line}")
+
+        return True
+
+    except Exception as e:
+        print(f" ✗ FAILED: {e}")
+        return False
+
+
+def test_dry_run_execution():
+    """Test dry-run mode for operations."""
+    print("\n✓ Testing dry-run execution...")
+
+    try:
+        from sshsync_wrapper import execute_on_all
+
+        result = execute_on_all("uptime", dry_run=True)
+
+        assert result.get('dry_run') is True, "Must indicate dry-run mode"
+        assert 'command' in result, "Must include command"
+        assert 'message' in result, "Must include message"
+
+        print(" ✓ Dry-run mode working")
+        print(f" ✓ Command: {result.get('command')}")
+        print(f" ✓ Message: {result.get('message')}")
+
+        return True
+
+    except Exception as e:
+        print(f" ✗ FAILED: {e}")
+        return False
+
+
+def main():
+    """Run all integration tests."""
+    print("=" * 70)
+    print("INTEGRATION TESTS - Tailscale SSH Sync Agent")
+    print("=" * 70)
+
+    tests = [
+        ("Host status check", test_host_status_basic),
+        ("List hosts", test_list_hosts),
+        ("Get groups", test_get_groups),
+        ("Tailscale status", test_tailscale_status),
+        ("Network summary", test_network_summary),
+        ("Format helpers", test_format_helpers),
+        ("Load score calculation", test_load_score_calculation),
+        ("Load classification", test_load_classification),
+        ("Latency classification", test_latency_classification),
+        ("Load report formatting", test_load_report_formatting),
+        ("Dry-run execution", test_dry_run_execution),
+    ]
+
+    results = []
+    for test_name, test_func in tests:
+        passed = test_func()
+        results.append((test_name, passed))
+
+    # Summary
+    print("\n" + "=" * 70)
+    print("SUMMARY")
+    print("=" * 70)
+
+    for test_name, passed in results:
+        status = "✅ PASS" if passed else "❌ FAIL"
+        print(f"{status}: {test_name}")
+
+    passed_count = sum(1 for _, p in results if p)
+    total_count = len(results)
+
+    print(f"\nResults: {passed_count}/{total_count} passed")
+
+    if passed_count == total_count:
+        print("\n🎉 All tests passed!")
+    else:
+        print(f"\n⚠️ {total_count - passed_count} test(s) failed")
+
+    return passed_count == total_count
+
+
+if __name__ == "__main__":
+    success = main()
+    sys.exit(0 if success else 1)
diff --git a/tests/test_validation.py b/tests/test_validation.py
new file mode 100644
index 0000000..77c744f
--- /dev/null
+++ b/tests/test_validation.py
@@ -0,0 +1,180 @@
+#!/usr/bin/env python3
+"""
+Tests for validators.
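+
+Each validator is exercised on both accepted and rejected input; rejected
+input is expected to raise ValidationError rather than return a falsy value.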
+"""
+
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent / 'scripts'))
+
+from utils.validators import *
+
+
+def test_validate_host():
+    """Test host validation."""
+    # Valid host
+    assert validate_host("web-01") == "web-01"
+    assert validate_host(" web-01 ") == "web-01"  # Strips whitespace
+
+    # With valid list
+    assert validate_host("web-01", ["web-01", "web-02"]) == "web-01"
+
+    # Invalid format
+    try:
+        validate_host("web@01")  # Invalid character
+        assert False, "Should have raised ValidationError"
+    except ValidationError:
+        pass
+
+    print("✓ validate_host() passed")
+    return True
+
+
+def test_validate_group():
+    """Test group validation."""
+    # Valid group
+    assert validate_group("production") == "production"
+    assert validate_group("PRODUCTION") == "production"  # Lowercase normalization
+
+    # With valid list
+    assert validate_group("production", ["production", "staging"]) == "production"
+
+    # Invalid
+    try:
+        validate_group("invalid!", ["production"])
+        assert False, "Should have raised ValidationError"
+    except ValidationError:
+        pass
+
+    print("✓ validate_group() passed")
+    return True
+
+
+def test_validate_path_exists():
+    """Test path existence validation."""
+    # Valid path
+    path = validate_path_exists("/tmp", must_be_dir=True)
+    assert isinstance(path, Path)
+
+    # Invalid path
+    try:
+        validate_path_exists("/nonexistent_12345")
+        assert False, "Should have raised ValidationError"
+    except ValidationError:
+        pass
+
+    print("✓ validate_path_exists() passed")
+    return True
+
+
+def test_validate_timeout():
+    """Test timeout validation."""
+    # Valid timeouts
+    assert validate_timeout(10) == 10
+    assert validate_timeout(1) == 1
+    assert validate_timeout(600) == 600
+
+    # Too low
+    try:
+        validate_timeout(0)
+        assert False, "Should have raised ValidationError"
+    except ValidationError:
+        pass
+
+    # Too high
+    try:
+        validate_timeout(1000)
+        assert False, "Should have raised ValidationError"
+    except ValidationError:
+        pass
+
+    print("✓ validate_timeout() passed")
+    return True
+
+
+def test_validate_command():
+    """Test command validation."""
+    # Safe commands
+    assert validate_command("ls -la") == "ls -la"
+    assert validate_command("uptime") == "uptime"
+
+    # Dangerous commands (should fail without allow_dangerous)
+    try:
+        validate_command("rm -rf /")
+        assert False, "Should have blocked dangerous command"
+    except ValidationError:
+        pass
+
+    # But should work with allow_dangerous
+    assert validate_command("rm -rf /tmp/test", allow_dangerous=True)
+
+    print("✓ validate_command() passed")
+    return True
+
+
+def test_validate_hosts_list():
+    """Test list validation."""
+    # Valid list
+    hosts = validate_hosts_list(["web-01", "web-02"])
+    assert len(hosts) == 2
+    assert "web-01" in hosts
+
+    # Empty list
+    try:
+        validate_hosts_list([])
+        assert False, "Should have raised ValidationError for empty list"
+    except ValidationError:
+        pass
+
+    print("✓ validate_hosts_list() passed")
+    return True
+
+
+def test_get_invalid_hosts():
+    """Test finding invalid hosts."""
+    # Mix of valid and invalid names; checking real hosts would require an
+    # SSH config, so we only verify the call runs and returns a list.
+    result = get_invalid_hosts(["web-01", "nonexistent-host-12345"])
+    assert isinstance(result, list)
+
+    print("✓ get_invalid_hosts() passed")
+    return True
+
+
+def main():
+    """Run all validation tests."""
+    print("=" * 70)
+    print("VALIDATION TESTS")
+    print("=" * 70)
+
+    tests = [
+        test_validate_host,
+        test_validate_group,
+        test_validate_path_exists,
+        test_validate_timeout,
+        test_validate_command,
+        test_validate_hosts_list,
+        test_get_invalid_hosts,
+    ]
+
+    passed = 0
+    for test in tests:
+        try:
+            if test():
+                passed += 1
+        except Exception as e:
+            print(f"✗ {test.__name__} failed: {e}")
+            import traceback
+            traceback.print_exc()
+
+    print(f"\nResults: {passed}/{len(tests)} passed")
+    return passed == len(tests)
+
+
+if __name__ == "__main__":
+    success = main()
+    sys.exit(0 if success else 1)